Package advene :: Package model :: Package parsers :: Module advene_xml
[hide private]
[frames] | no frames]

Source Code for Module advene.model.parsers.advene_xml

  1  """ 
  2  Unstable and experimental parser implementation. 
  3  """ 
  4   
  5  import base64 
  6  from functools import partial 
  7  from os import path 
  8  from os.path import exists 
  9  from xml.etree.ElementTree import iterparse 
 10  from xml.parsers.expat import ExpatError 
 11   
 12  from advene.model.consts import ADVENE_XML, PARSER_META_PREFIX, PACKAGED_ROOT 
 13  from advene.model.parsers.base_xml import XmlParserBase 
 14  from advene.model.parsers.exceptions import ParserError 
 15  import advene.model.serializers.advene_xml as serializer 
 16  from advene.util.files import get_path, is_local 
class Parser(XmlParserBase):
    """Parser for packages serialized in the generic Advene XML format.

    The identification attributes (`NAME`, `EXTENSION`, `MIMETYPE`) are
    taken from the matching serializer module.
    """

    NAME = serializer.NAME
    EXTENSION = serializer.EXTENSION
    MIMETYPE = serializer.MIMETYPE
    SERIALIZER = serializer  # may be None for some parsers

    @classmethod
    def claims_for_parse(cls, file_):
        """Is this parser likely to parse that file-like object?

        `file_` is a readable file-like object. It is the responsibility of
        the caller to close it.

        Return an int between 00 and 99, indicating the likelihood of this
        parser to handle correctly the given URL. 70 is used as a standard
        value when the parser is pretty sure it can handle the URL.

        When `file_` is seekable, the root element is inspected: the result
        is 80 if it is a ``package`` element in this parser's namespace and 0
        otherwise (including when the content is not well-formed XML). The
        position `file_` had on entry is restored before returning.

        When `file_` is not seekable, the score is guessed from the
        content-type reported by ``file_.info()`` (if any) and from the
        extension of the file's path.
        """
        if hasattr(file_, "seek"):
            # try to open it as xml file and get the root element
            start = file_.tell()
            file_.seek(0)
            try:
                events = iterparse(file_, events=("start",))
                try:
                    _, root = next(events)
                except (ExpatError, SyntaxError):
                    # ElementTree >= 1.3 reports malformed XML with its own
                    # ParseError, which derives from SyntaxError rather than
                    # from ExpatError.
                    return 0
                if root.tag == "{%s}package" % cls._NAMESPACE_URI:
                    return 80
                return 0
            finally:
                # probing must not consume the caller's stream
                file_.seek(start)

        info = getattr(file_, "info", lambda: {})()
        mimetype = info.get("content-type", "")
        if mimetype.startswith(cls.MIMETYPE):
            return 80

        score = 0
        if mimetype.startswith(("application/xml", "text/xml")):
            score += 20
        fpath = get_path(file_)
        if fpath.endswith(cls.EXTENSION):
            score += 50
        elif fpath.endswith(".xml"):
            score += 20
        return score

    @classmethod
    def make_parser(cls, file_, package):
        """Return a parser that will parse `file_` into `package`.

        `file_` is a readable file-like object. It is the responsibility of
        the caller to close it.

        The returned object must implement the interface for which
        :class:`_Parser` is the reference implementation.
        """
        return cls(file_, package)

    @classmethod
    def parse_into(cls, file_, package):
        """A shortcut for ``make_parser(file_, package).parse()``.

        See also `make_parser`.
        """
        cls(file_, package).parse()

    def parse(self):
        """Do the actual parsing.

        If the file is a local ``content.xml`` sitting next to a ``mimetype``
        file whose content equals `MIMETYPE`, the containing directory is
        recorded in the package metadata under `PACKAGED_ROOT` before the
        generic XML parsing starts.
        """
        file_ = self.file
        fpath = get_path(file_)
        if is_local(file_) and fpath.endswith("content.xml"):
            # looks like this is a manually-unzipped package,
            dirname = path.split(fpath)[0]
            mfn = path.join(dirname, "mimetype")
            if exists(mfn):
                # the context manager closes the file even if read() fails
                with open(mfn) as f:
                    mimetype = f.read()
                if mimetype == self.MIMETYPE:
                    self.package.set_meta(PACKAGED_ROOT, dirname)
        XmlParserBase.parse(self)

    # end of public interface

    _NAMESPACE_URI = ADVENE_XML

    def __init__(self, file_, package):
        """Prepare the parsing of `file_` into `package`.

        `file_` must be claimed by `claims_for_parse` (score > 0).
        """
        assert self.__class__.claims_for_parse(file_) > 0
        XmlParserBase.__init__(self, file_, package, self._NAMESPACE_URI,
                               "package")
        # (function, id) pairs whose execution is delayed until the whole
        # package has been read; see do_or_postpone and handle_package
        self._postponed = []

    def do_or_postpone(self, id, function, function2=None):
        """Invoke `function` on the element identified by `id`, now or later.

        If `id` identifies an imported element (it contains a colon after
        its first character), function is invoked with `id` itself as its
        argument, provided that `self.package` already knows the import
        named by the part before the colon.

        If `id` is a plain identifier, it is checked whether `self.package`
        has such an element. If so, function is invoked with that element as
        its argument.

        In both cases, when the lookup fails, the execution is postponed.

        This is useful because some elements in the serialization may refer
        to other elements that are defined further.

        If function2 is provided and the invocation is postponed, then it
        will be function2 rather than function that will be invoked.
        """
        colon = id.find(":")
        if colon > 0:
            target = id
            ready = self.package.get(id[:colon]) is not None
        else:
            target = self.package.get(id)
            ready = target is not None

        if not ready:
            self._postponed.append((function2 or function, id))
            return

        try_enter_no_event_section(target, function)
        try:
            function(target)
        finally:
            try_exit_no_event_section(target, function)

    def optional_sequence(self, tag, *args, **kw):
        """Parse the sequence element named `tag` if it comes next.

        The items of the sequence are named after the ``items_name`` keyword
        argument if given, else after `tag` without its last character.
        Remaining arguments are passed on to ``self.sequence``.

        If the next stream event is not the start of a `tag` element, the
        event is pushed back and nothing else happens.
        """
        items_name = kw.pop("items_name", None)
        if items_name is None:
            items_name = tag[:-1]  # remove terminal 's'
        stream = self.stream

        stream.forward()
        elem = stream.elem
        if (stream.event == "start"
                and elem.tag == self.tag_template % tag):
            self.sequence(items_name, *args, **kw)
            self._check_end(elem)
        else:
            stream.pushback()

    def manage_package_subelements(self):
        """
        This method may be overridden by application model parsers having a
        syntax similar to the generic advene format - like the cinelab
        parser.
        """
        self.optional_sequence("imports")
        self.optional_sequence("tags")
        self.optional_sequence("medias")
        self.optional_sequence("resources")
        self.optional_sequence("annotations")
        self.optional_sequence("relations")
        self.optional_sequence("views")
        self.optional_sequence("queries", items_name="query")
        self.optional_sequence("lists")
        self.optional_sequence("external-tag-associations",
                               items_name="association")

    def handle_package(self):
        """
        Subclasses should normally not override this method, but rather
        `manage_package_subelements`.
        """
        pa = self.package
        # one "item item ..." line per ns_stack entry with a non-empty first
        # item
        namespaces = "\n".join(
            [" ".join(el) for el in self.ns_stack if el[0]])
        if namespaces:
            pa.set_meta(PARSER_META_PREFIX + "namespaces", namespaces)
        uri = self.current.get("uri")
        if uri is not None:
            pa.uri = uri
        self.optional("meta", pa)
        self.manage_package_subelements()

        # everything has been read: run what do_or_postpone had to delay
        for function, ref in self._postponed:
            if ref.find(":") > 0:  # imported
                function(ref)
            else:
                # NOTE: the lookup may yield None (e.g. the empty content
                # model postponed by handle_content); the function is called
                # with None in that case.
                elt = self.package.get(ref)
                try_enter_no_event_section(elt, function)
                try:
                    function(elt)
                finally:
                    try_exit_no_event_section(elt, function)

    def _parse_tags_and_meta(self, elt):
        """Parse the optional tags and meta of `elt` in a no-event section."""
        elt.enter_no_event_section()
        try:
            self.optional_sequence("tags", element=elt)
            self.optional("meta", elt)
        finally:
            elt.exit_no_event_section()

    def _get_int_attribute(self, name, elt_id):
        """Return attribute `name` as an int.

        Raise `ParserError` mentioning `elt_id` if the value is not an
        integer.
        """
        value = self.get_attribute(name)
        try:
            return int(value)
        except ValueError:
            raise ParserError("wrong %s value for %s" % (name, elt_id))

    def handle_import(self):
        """Create the import described by the current element."""
        id = self.get_attribute("id")
        url = self.get_attribute("url")
        uri = self.get_attribute("uri", "")
        elt = self.package._create_import_in_parser(id, url, uri)
        self._parse_tags_and_meta(elt)

    def handle_tag(self, element=None):
        """Handle a tag element.

        Without `element`, the current XML element defines a tag of the
        package. With `element`, it associates an existing tag (referred to
        by ``id-ref``) with `element`, possibly later if the tag is not
        known yet.
        """
        if element is None:
            # tag definition in package
            id = self.get_attribute("id")
            elt = self.package.create_tag(id)
            elt.enter_no_event_section()
            try:
                self.optional_sequence(
                    "imported-elements", items_name="element",
                    advene_tag=elt)
                self.optional_sequence("tags", element=elt)
                self.optional("meta", elt)
            finally:
                elt.exit_no_event_section()
        else:
            # tag association in element
            id = self.get_attribute("id-ref")
            self.do_or_postpone(
                id, partial(self.package.associate_tag, element))

    def handle_media(self):
        """Create the media described by the current element."""
        id = self.get_attribute("id")
        url = self.get_attribute("url")
        foref = self.get_attribute("frame-of-reference")
        elt = self.package.create_media(id, url, foref)
        self._parse_tags_and_meta(elt)

    def handle_resource(self):
        """Create the resource described by the current element."""
        id = self.get_attribute("id")
        elt = self.required("content", self.package.create_resource, id)
        self._parse_tags_and_meta(elt)

    def handle_annotation(self):
        """Create the annotation described by the current element.

        Raise `ParserError` if the media is a plain identifier unknown to
        the package, if ``begin`` or ``end`` is not an integer, or if
        ``end`` is lower than ``begin``.
        """
        id = self.get_attribute("id")
        media = self.get_attribute("media")
        if media.find(":") <= 0:  # same package
            media = self.package.get(media)
            if media is None:
                raise ParserError(
                    "unknown media %s" % self.get_attribute("media"))
        begin = self._get_int_attribute("begin", id)
        end = self._get_int_attribute("end", id)
        if end < begin:
            raise ParserError("end is before begin in %s" % id)
        elt = self.required("content", self.package.create_annotation,
                            id, media, begin, end)
        self._parse_tags_and_meta(elt)

    def handle_relation(self):
        """Create the relation described by the current element.

        The relation is created first with the placeholder mimetype
        ``x-advene/none``; its content information is overwritten if a
        ``content`` sub-element is present.
        """
        id = self.get_attribute("id")
        elt = self.package.create_relation(id, "x-advene/none")

        def update_content_info(mimetype, model, url):
            # used in place of a creation method by handle_content
            elt.content_mimetype = mimetype
            elt.content_model = model
            elt.content_url = url
            return elt

        elt.enter_no_event_section()
        try:
            self.optional_sequence("members", elt)
            self.optional("content", update_content_info)
            self.optional_sequence("tags", element=elt)
            self.optional("meta", elt)
        finally:
            elt.exit_no_event_section()

    def handle_view(self):
        """Create the view described by the current element."""
        id = self.get_attribute("id")
        elt = self.required("content", self.package.create_view, id)
        self._parse_tags_and_meta(elt)

    def handle_query(self):
        """Create the query described by the current element."""
        id = self.get_attribute("id")
        elt = self.required("content", self.package.create_query, id)
        self._parse_tags_and_meta(elt)

    def handle_list(self):
        """Create the list described by the current element."""
        id = self.get_attribute("id")
        elt = self.package.create_list(id)
        elt.enter_no_event_section()
        try:
            # [0] is the mutable item counter shared with handle_item
            self.optional_sequence("items", elt, [0])
            self.optional_sequence("tags", element=elt)
            self.optional("meta", elt)
        finally:
            elt.exit_no_event_section()

    # utility methods

    def handle_meta(self, obj):
        """Copy the children of the current meta element into `obj`.

        The key of each metadata is the namespace URI of the child
        immediately followed by its local name. A child with an ``id-ref``
        attribute refers to an element, otherwise its text is the value.

        Raise `ParserError` if a child has sub-elements.
        """
        elem = self.complete_current()
        for child in elem:
            key = child.tag
            if key.startswith("{"):
                # "{uri}name" -> "uriname"
                cut = key.find("}")
                key = key[1:cut] + key[cut+1:]
            if len(child):
                raise ParserError("Unexpected sub-element in metadata %s" %
                                  key)
            val = child.get("id-ref")
            if val is None:
                text = child.text or ""  # because child.text could be None
                obj.enter_no_event_section()
                try:
                    obj.set_meta(key, text, False)
                finally:
                    obj.exit_no_event_section()
            elif val.find(":") > 0:  # imported
                obj.enter_no_event_section()
                try:
                    obj.set_meta(key, val, True)
                finally:
                    obj.exit_no_event_section()
            else:
                self.do_or_postpone(val, partial(obj.set_meta, key))

    def handle_content(self, creation_method, *args):
        """Create an element from the current content element.

        `creation_method` is called with `args` followed by the mimetype, an
        empty model and the url; the element it returns is then given its
        content model (possibly later) and its data, and is returned.

        Raise `ParserError` if the content has XML children, has both a url
        and non-blank data, or uses an encoding other than ``base64``.
        """
        mimetype = self.get_attribute("mimetype")
        url = self.get_attribute("url", "")
        model = self.get_attribute("model", "")
        encoding = self.get_attribute("encoding", "")
        elt = creation_method(*args + (mimetype, "", url))
        self.do_or_postpone(model, elt._set_content_model)
        elem = self.complete_current()
        if len(elem):
            raise ParserError("no XML tag allowed in content; use &lt;tag>")
        data = elem.text
        if url and data and data.strip():
            raise ParserError("content can not have both url (%s) and data" %
                              url)
        elif data:
            if encoding:
                if encoding != "base64":
                    raise ParserError(
                        "encoding %s is not supported" % encoding)
                # b64decode replaces decodestring, which no longer exists in
                # recent Python versions
                data = base64.b64decode(data)
            elt.enter_no_event_section()
            try:
                elt.content_data = data
            finally:
                elt.exit_no_event_section()
        return elt

    def handle_member(self, relation):
        """Append the member referred to by ``id-ref`` to `relation`.

        A reference containing a colon is appended as is; any other one is
        first looked up in the package.
        """
        member = self.get_attribute("id-ref")
        if ":" not in member:
            member = self.package.get(member)
        relation.append(member)

    def handle_item(self, lst, c):
        """Add the item referred to by ``id-ref`` to `lst`."""
        # c is a 1-item list containing the virtual length of the list,
        # i.e. the length taking into account the postponed elements
        # it is used to insert postponed elements at the right index
        id = self.get_attribute("id-ref")
        self.do_or_postpone(id, lst.append, partial(lst.insert, c[0]))
        c[0] += 1

    def handle_element(self, advene_tag):
        """Associate the element referred to by ``id-ref`` with a tag."""
        id = self.get_attribute("id-ref")
        # should only be imported, so no check
        self.package.associate_tag(id, advene_tag)

    def handle_association(self):
        """Associate the ``element`` attribute with the ``tag`` attribute."""
        elt_id = self.get_attribute("element")
        tag_id = self.get_attribute("tag")
        # both tag and element should be imported, so no check
        self.package.associate_tag(elt_id, tag_id)
412
def try_enter_no_event_section(elt, function):
    """Enter a no-event section on `elt` and on the owner of `function`.

    `elt` and the object `function` is bound to (if any) are each asked to
    ``enter_no_event_section()``; objects lacking that method are silently
    skipped. `function` may be wrapped in a ``functools.partial``.
    """
    getattr(elt, "enter_no_event_section", lambda: None)()
    # try to also find an element in 'function'
    function = getattr(function, "func", function)  # unwrap partial function
    owner = getattr(function, "im_self", None)
    if owner is None:
        # bound methods without 'im_self' expose their instance as
        # '__self__'
        owner = getattr(function, "__self__", None)
    getattr(owner, "enter_no_event_section", lambda: None)()
419
def try_exit_no_event_section(elt, function):
    """Exit the no-event section on the owner of `function` and on `elt`.

    Counterpart of `try_enter_no_event_section`: the object `function` is
    bound to (if any) is asked first to ``exit_no_event_section()``, then
    `elt`; objects lacking that method are silently skipped. `function` may
    be wrapped in a ``functools.partial``.
    """
    # try to find an element in 'function'
    function = getattr(function, "func", function)  # unwrap partial function
    owner = getattr(function, "im_self", None)
    if owner is None:
        # bound methods without 'im_self' expose their instance as
        # '__self__'
        owner = getattr(function, "__self__", None)
    getattr(owner, "exit_no_event_section", lambda: None)()
    getattr(elt, "exit_no_event_section", lambda: None)()
426 427 # 428