Package advene :: Package model :: Package parsers :: Module base_xml
[hide private]
[frames] | no frames]

Source Code for Module advene.model.parsers.base_xml

  1  """I provide base classes for XML parsers. 
  2   
  3  TODO: document the provided classes. 
  4  """ 
  5  from xml.etree.ElementTree import iterparse 
  6   
  7  from advene.model.consts import _RAISE 
  8  from advene.model.parsers.exceptions import ParserError 
class XmlParserBase(object):
    """Base class for event-driven XML parsers built on top of `Stream`.

    The idea is that subclasses define ``handle_X`` methods where X is the
    unqualified tag (hyphens in the tag are replaced by underscores to form
    a valid method name).

    Property `package` holds the package to parse into.

    DEPRECATED: Property `backend` and `package_id` are useful to feed the
    package's backend.

    Property `current` always points to the current element (with the
    ElementTree API). Note that the element will have its attribute, but not
    its text nor its sub-elements. To wait for an element to be completely
    constructed, invoke method `complete_current`. However, to parse
    subelements, you may prefer to use methods `required`, `optional` and
    `sequence`, that will check the structure of subelements, then invoke the
    corresponding `handle_X` methods. Note that `required` and `optional` will
    return the value returned by `handle_X` (`optional` returns None if the
    element is not found).

    Method `get_attribute` is a shortcut for ``current.get(k[, d])`` but will
    raise a `ParserError` with the appropriate message if the attribute is
    missing and no default value is provided.

    Property `ns_stack` is a list of (prefix, uri) pairs used as a stack for
    namespaces.

    For advanced use, property `stream` holds the underlying `Stream` instance.

    See `advene.model.parsers.advene_xml` for an example.
    """

    def __init__(self, file_, package, namespace_uri, root):
        """Store the parsing context; no parsing happens until `parse`.

        :param file_: the file-like object handed to `Stream` by `parse`
        :param package: the package to parse into
        :param namespace_uri: namespace of the tags handled by this parser
        :param root: unqualified tag expected for the root element
        """
        self.file = file_
        self.package = package
        self.namespace_uri = namespace_uri
        # template producing a qualified "{uri}tag" from an unqualified tag
        self.tag_template = "{%s}%%s" % namespace_uri
        self.root = root
        # when true, `_check_end` clears each element once it is handled
        self.clear_after_handle = True
        # number of elements whose "end" event was already consumed by
        # `complete_current` and must not be consumed again by `_check_end`
        self._completed = 0
        # length of the "{uri}" prefix, used to strip it from qualified tags
        self.cut = len(namespace_uri)+2

    @property
    def current(self):
        """The element the underlying stream is currently positioned on."""
        return self.stream.elem

    @property
    def ns_stack(self):
        """The list of (prefix, uri) pairs maintained by the stream."""
        return self.stream.namespaces

    def get_attribute(self, key, default=_RAISE):
        """Return attribute `key` of the current element.

        If the attribute is missing, `default` is returned when provided;
        otherwise a `ParserError` is raised.
        """
        e = self.stream.elem
        r = e.get(key, default)
        if r is _RAISE:
            raise ParserError("missing attribute %s in %s" %
                              (key, e.tag[self.cut:]))
        else:
            return r

    def parse(self):
        """Parse `self.file`, dispatching the root element to its handler.

        Raises `ParserError` if the root element is not the expected one.
        The root handler runs between the package's
        ``enter_no_event_section`` and ``exit_no_event_section`` calls, the
        latter being invoked even if the handler raises.
        """
        f = self.file
        self.backend = self.package._backend # TODO: remove (deprecated)
        self.package_id = self.package._id # TODO: remove (deprecated)
        self.stream = st = Stream(f)
        expected = self.tag_template % self.root
        if st.elem.tag != expected:
            raise ParserError("expecting %s, found %s" %
                              (expected, self.stream.elem.tag))
        self.package.enter_no_event_section()
        try:
            self._handle([], {})
        finally:
            self.package.exit_no_event_section()

    def required(self, tag, *args, **kw):
        """Handle the next element, which must be a `tag` element.

        Extra arguments are passed to the ``handle_X`` method, whose return
        value is returned. Raises `ParserError` if the next event is not the
        start of a `tag` element.
        """
        stream = self.stream
        stream.forward()
        elem = stream.elem
        if stream.event != "start" or elem.tag != self.tag_template % tag:
            raise ParserError("expecting %s, found %s" %
                              (tag, self.stream.elem.tag))
        r = self._handle(args, kw)
        self._check_end(elem)
        return r

    def optional(self, tag, *args, **kw):
        """Handle the next element if it is a `tag` element.

        Returns the value of the ``handle_X`` method, or None (after pushing
        the inspected item back in the stream) if the next event is not the
        start of a `tag` element.
        """
        stream = self.stream
        stream.forward()
        elem = stream.elem
        if stream.event == "start" and elem.tag == self.tag_template % tag:
            r = self._handle(args, kw)
            self._check_end(elem)
            return r
        else:
            self.stream.pushback()
            return None

    def sequence(self, tag, *args, **kw):
        """Handle every consecutive `tag` element found from here.

        NB: this method allows an *empty* sequence.

        If you want a sequence with at least 1 element, use the following
        pattern::

            required(mytag)
            sequence(mytag)
        """
        stream = self.stream
        tag = self.tag_template % tag
        stream.forward()
        while stream.event == "start" and stream.elem.tag == tag:
            elem = stream.elem
            self._handle(args, kw)
            self._check_end(elem)
            stream.forward()
        # the item that ended the sequence belongs to the caller
        stream.pushback()

    def complete_current(self):
        """Consume the stream up to the end of the current element.

        Returns the current element, now completely built. The element is
        matched by identity rather than by tag, so that a nested element
        bearing the same tag does not end the completion prematurely.

        Raises `ParserError` if the stream has no current element.
        """
        target = self.stream.elem
        if target is None:
            raise ParserError("no current element to complete")
        # tell `_check_end` that the "end" event is already consumed
        self._completed += 1
        for ev, el in self.stream:
            if ev == "end" and el is target:
                break
        return target

    def _handle(self, args, kw, ):
        """Dispatch the current element to the matching ``handle_X`` method.

        X is the unqualified tag with every hyphen replaced by an
        underscore. `args` and `kw` are passed to the handler, whose return
        value is returned. Raises `NotImplementedError` if no handler exists.
        """
        elem = self.stream.elem
        assert elem.tag.startswith("{%s}" % self.namespace_uri)
        t = elem.tag[self.cut:]
        # str.replace handles any number of hyphens in a single pass
        n = t.replace("-", "_")
        h = getattr(self, "handle_%s" % n, None)
        if h is None:
            raise NotImplementedError("don't know what to do with tag %s" % t)
        return h(*args, **kw)

    def _check_end(self, elem):
        """Check that the stream is at the end of `elem` once it is handled.

        Unless the handler already consumed the end event through
        `complete_current`, the stream is moved one step forward first.
        Raises `ParserError` if the stream is not on the end of `elem`.
        """
        stream = self.stream
        if self._completed:
            self._completed -= 1
        else:
            stream.forward()
        event = stream.event
        if event == "end" and stream.elem == elem:
            if self.clear_after_handle:
                # makes parsing less memory-consuming
                elem.clear()
        else:
            if event == "end":
                # meaning that stream.elem != elem, should not happen
                raise ParserError("unbalanced parsing of %s" %
                                  (elem.tag[self.cut:],))
            else:
                raise ParserError("unexpected child %s in %s" %
                                  (stream.elem.tag, elem.tag[self.cut:]))
169
class Stream(object):
    """
    Wrap the result of iterparse:
     * start-ns and end-ns are interpreted, (prefix, uri) pairs being pushed
       and popped accordingly in attribute `namespaces`
     * start and end events are accessible through the `event` and `elem`
       properties

    Unlike iterators, a `Stream` has a notion of "current" item (accessible
    through `event` and `elem`). To access the next element, the `forward`
    method must be explicitly invoked. If it reaches the end, `event` and
    `elem` will be None.

    Note that a `Stream` is also iterable. The first yielded item will be the
    current item. If the iteration is interrupted, the current item will be
    the last yielded item.
    """

    def __init__(self, filelike):
        """Start parsing `filelike` and position the stream on its first item.

        If `filelike` has a ``seek`` method, it is rewound first.
        """
        if hasattr(filelike, "seek"):
            # may be required, because claims_for_url messes with seek
            filelike.seek(0)
        self._it = iterparse(filelike,
                             events=("start", "end", "start-ns", "end-ns",))
        self.namespaces = []
        self._event = None
        self._elem = None
        self._prev = None   # item preceding the current one, for pushback
        self._next = None   # pushed-back item, to be restored by forward
        self.forward()

    @property
    def event(self):
        """The current event ("start" or "end"), or None at the end."""
        return self._event

    @property
    def elem(self):
        """The element of the current event, or None at the end."""
        return self._elem

    def forward(self):
        """Move to the next start/end item.

        Namespace events met on the way are applied to `namespaces` rather
        than exposed as items.

        Returns True if the stream is now on an actual item, False if it has
        reached (or was already at) the end.
        """
        # A pushed-back item takes precedence over the iterator state, so
        # that it is restored even when the iterator is already exhausted.
        if self._next is not None:
            self._prev = self._event, self._elem
            self._event, self._elem = self._next
            self._next = None
            return self._event is not None
        if self._it is None:
            return False
        namespaces = self.namespaces
        try:
            while True:
                # builtin next() works with both Python 2.6+ and Python 3
                ev, el = next(self._it)
                if ev == "start-ns":
                    namespaces.append(el)
                elif ev == "end-ns":
                    namespaces.pop(-1)
                else:
                    break
        except StopIteration:
            ev, el = None, None
            self._it = None
        self._prev = self._event, self._elem
        self._event = ev
        self._elem = el
        return ev is not None

    def pushback(self):
        """Push the last item back in the stream.

        Note that no more than one item can be pushed back.

        Limitation: this does not change `namespaces` accordingly!

        Raises `ValueError` if there is no previous item to go back to, and
        `Exception` if an item is already pushed back.
        """
        if self._prev is None or self._prev[0] is None:
            raise ValueError("nothing to pushback")
        elif self._next is not None:
            raise Exception("can only pushback one step")
        else:
            self._next = self._event, self._elem
            self._event, self._elem = self._prev

    def __iter__(self):
        """Yield (event, elem) pairs, starting with the current item."""
        while self._event is not None:
            yield self._event, self._elem
            self.forward()
253