Package xmlschema_acue :: Module etree

Source Code for Module xmlschema_acue.etree

  1  # -*- coding: utf-8 -*- 
  2  # 
  3  # Copyright (c), 2016-2019, SISSA (International School for Advanced Studies). 
  4  # All rights reserved. 
  5  # This file is distributed under the terms of the MIT License. 
  6  # See the file 'LICENSE' in the root directory of the present 
  7  # distribution, or http://opensource.org/licenses/MIT. 
  8  # 
  9  # @author Davide Brunato <brunato@sissa.it> 
 10  # 
 11  """ 
 12  This module contains ElementTree setup and helpers for xmlschema package. 
 13  """ 
 14  from __future__ import unicode_literals 
 15  import sys 
 16  import re 
 17  import importlib 
 18  from collections import Counter 
 19   
 20  try: 
 21      import lxml.etree as lxml_etree 
 22  except ImportError: 
 23      lxml_etree = None 
 24   
 25  from .compat import PY3 
 26  from .exceptions import XMLSchemaValueError, XMLSchemaTypeError 
 27  from .namespaces import XSLT_NAMESPACE, HFP_NAMESPACE, VC_NAMESPACE 
 28  from .helpers import get_namespace, get_qname, qname_to_prefixed 
 29  from .xpath import ElementPathMixin 
 30   
 31  ### 
 32  # Programmatic import of xml.etree.ElementTree 
 33  # 
 34  # In Python 3 the pure python implementation is overwritten by the C module API, 
 35  # so use a programmatic re-import to obtain the pure Python module, necessary for 
 36  # defining a safer XMLParser. 
 37  # 
 38  if not PY3: 
 39      # Python 2.7: nothing have to be done because it's not overridden by C implementation 
 40      ElementTree = PyElementTree = importlib.import_module('xml.etree.ElementTree') 
 41   
 42  elif '_elementtree' in sys.modules: 
 43      # Temporary remove the loaded modules 
 44      ElementTree = sys.modules.pop('xml.etree.ElementTree', None) 
 45      _cmod = sys.modules.pop('_elementtree') 
 46   
 47      # Load the pure Python module 
 48      sys.modules['_elementtree'] = None 
 49      PyElementTree = importlib.import_module('xml.etree.ElementTree') 
 50   
 51      # Restore original modules 
 52      sys.modules['_elementtree'] = _cmod 
 53      if ElementTree is not None: 
 54          sys.modules['xml.etree.ElementTree'] = ElementTree 
 55      else: 
 56          ElementTree = PyElementTree 
 57   
 58  else: 
 59      # Load the pure Python module 
 60      sys.modules['_elementtree'] = None 
 61      PyElementTree = importlib.import_module('xml.etree.ElementTree') 
 62   
 63      # Remove the pure Python module from imported modules 
 64      del sys.modules['xml.etree.ElementTree'] 
 65      del sys.modules['_elementtree'] 
 66   
 67      # Load the C optimized ElementTree module 
 68      ElementTree = importlib.import_module('xml.etree.ElementTree') 
 69   
 70   
 71  # ElementTree APIs 
 72  etree_element = ElementTree.Element 
 73  etree_register_namespace = ElementTree.register_namespace 
 74  ParseError = ElementTree.ParseError 
 75   
 76  etree_register_namespace('xslt', XSLT_NAMESPACE) 
 77  etree_register_namespace('hfp', HFP_NAMESPACE) 
 78  etree_register_namespace('vc', VC_NAMESPACE) 
 79   
 80   
 81  # Pure Python ElementTree APIs 
 82  py_etree_element = PyElementTree.Element 
 83  py_etree_register_namespace = ElementTree.register_namespace 
 84   
 85  py_etree_register_namespace('xslt', XSLT_NAMESPACE) 
 86  py_etree_register_namespace('hfp', HFP_NAMESPACE) 
 87  py_etree_register_namespace('vc', VC_NAMESPACE) 
 88   
 89   
 90  # Lxml APIs 
 91  if lxml_etree is not None: 
 92      lxml_etree_element = lxml_etree.Element 
 93      lxml_etree_comment = lxml_etree.Comment 
 94      lxml_etree_register_namespace = lxml_etree.register_namespace 
 95   
 96      lxml_etree_register_namespace('xslt', XSLT_NAMESPACE) 
 97      lxml_etree_register_namespace('hfp', HFP_NAMESPACE) 
 98      lxml_etree_register_namespace('vc', VC_NAMESPACE) 
 99  else: 
100      lxml_etree_element = None 
101      lxml_etree_comment = None 
102      lxml_etree_register_namespace = None 
103   
104   
class SafeXMLParser(PyElementTree.XMLParser):
    """
    A pure Python XMLParser that refuses to process entities: every entity
    declaration or external entity reference found in the document raises
    a ParseError. The *html* argument, deprecated since version 3.4, is
    not accepted.

    :param target: the target object called by the `feed()` method of the parser, \
    that defaults to `TreeBuilder`.
    :param encoding: if provided, its value overrides the encoding specified in the XML file.
    """
    def __init__(self, target=None, encoding=None):
        super(SafeXMLParser, self).__init__(target=target, encoding=encoding)

        # The underlying parser object is exposed under a different attribute
        # name depending on the Python major version.
        if PY3:
            low_level_parser = self.parser
        else:
            low_level_parser = self._parser

        # Install the handlers that reject entities
        low_level_parser.EntityDeclHandler = self.entity_declaration
        low_level_parser.UnparsedEntityDeclHandler = self.unparsed_entity_declaration
        low_level_parser.ExternalEntityRefHandler = self.external_entity_reference

    def entity_declaration(self, entity_name, is_parameter_entity, value, base, system_id, public_id, notation_name):
        """Handler for entity declarations: always raises a ParseError."""
        message = "Entities are forbidden (entity_name={!r})".format(entity_name)
        raise PyElementTree.ParseError(message)

    def unparsed_entity_declaration(self, entity_name, base, system_id, public_id, notation_name):
        """Handler for unparsed entity declarations: always raises a ParseError."""
        message = "Entities are forbidden (entity_name={!r})".format(entity_name)
        raise PyElementTree.ParseError(message)

    def external_entity_reference(self, context, base, system_id, public_id):
        """Handler for external entity references: always raises a ParseError."""
        message = "External references are forbidden (system_id={!r}, public_id={!r})".format(
            system_id, public_id
        )
        raise PyElementTree.ParseError(message)
def is_etree_element(elem):
    """
    Duck-typing test for ElementTree elements: the object must have both the
    `tag` and the `attrib` attributes and must not be an ElementPathMixin instance.
    """
    if not hasattr(elem, 'tag'):
        return False
    if not hasattr(elem, 'attrib'):
        return False
    # Objects deriving from ElementPathMixin expose the same attributes
    # but are not considered ElementTree elements.
    return not isinstance(elem, ElementPathMixin)
def etree_tostring(elem, namespaces=None, indent='', max_lines=None, spaces_for_tab=4, xml_declaration=False):
    """
    Serialize an Element tree to a string. Tab characters are replaced by whitespaces.

    :param elem: the Element instance.
    :param namespaces: is an optional mapping from namespace prefix to URI. Provided namespaces are \
    registered before serialization.
    :param indent: the base line indentation.
    :param max_lines: if provided, truncates the serialization after this number of lines \
    (default: do not truncate).
    :param spaces_for_tab: number of spaces for replacing tab characters (default is 4).
    :param xml_declaration: if set to `True` inserts the XML declaration at the head.
    :return: a Unicode string.
    :raise: XMLSchemaTypeError if *elem* is not an ElementTree element and the lxml \
    library is not available.
    """
    def leading_spaces(line):
        # Number of space characters at the start of the line
        return len(line) - len(line.lstrip(' '))

    def reindent(line):
        # Replaces the common indentation of the serialized text with *indent*
        if not line:
            return line
        elif line.startswith(min_indent):
            return line[start:] if start >= 0 else indent[start:] + line
        else:
            return indent + line

    # Select the serializer matching the element implementation and register
    # the provided namespaces on the same library.
    if isinstance(elem, etree_element):
        if namespaces:
            for prefix, uri in namespaces.items():
                etree_register_namespace(prefix, uri)
        tostring = ElementTree.tostring

    elif isinstance(elem, py_etree_element):
        if namespaces:
            for prefix, uri in namespaces.items():
                PyElementTree.register_namespace(prefix, uri)
        tostring = PyElementTree.tostring

    elif lxml_etree is not None:
        # Any other object is assumed to be an lxml element
        if namespaces:
            for prefix, uri in namespaces.items():
                if prefix:  # the empty prefix is not registered on lxml
                    lxml_etree_register_namespace(prefix, uri)
        tostring = lxml_etree.tostring
    else:
        raise XMLSchemaTypeError("cannot serialize %r: lxml library not available." % type(elem))

    if PY3:
        xml_text = tostring(elem, encoding="unicode").replace('\t', ' ' * spaces_for_tab)
    else:
        xml_text = unicode(tostring(elem)).replace('\t', ' ' * spaces_for_tab)  # noqa: F821

    lines = ['<?xml version="1.0" encoding="UTF-8"?>'] if xml_declaration else []
    lines.extend(xml_text.splitlines())
    while lines and not lines[-1].strip():
        lines.pop(-1)
    if not lines:
        return ''  # nothing left to indent

    last_indent = ' ' * leading_spaces(lines[-1])

    # Indentation of the lines between the first and the last one. Blank lines
    # are ignored: when there is no other middle line (short serialization or
    # only blank lines in the middle) fall back to the last line indentation
    # instead of calling min() on an empty sequence.
    inner_indents = [leading_spaces(line) for line in lines[1:-1] if line.strip(' ')]
    if inner_indents:
        child_indent = ' ' * min(inner_indents)
        min_indent = min(child_indent, last_indent)
    else:
        min_indent = child_indent = last_indent

    # Characters to cut from (or, if negative, to add to) each indented line
    start = len(min_indent) - len(indent)

    if max_lines is not None and len(lines) > max_lines + 2:
        # Keep the head and the last line, marking the cut with two '...' lines
        lines = lines[:max_lines] + [child_indent + '...'] * 2 + lines[-1:]

    return '\n'.join(reindent(line) for line in lines)
def etree_iterpath(elem, tag=None, path='.', namespaces=None, add_position=False):
    """
    Creates an iterator for the element and its subelements that yields elements and paths.
    If tag is not `None` or '*', only elements whose tag matches are returned from the iterator.

    :param elem: the element to iterate.
    :param tag: tag filtering.
    :param path: the current path, '.' for default.
    :param namespaces: is an optional mapping from namespace prefix to URI. If provided \
    the tags of the children are converted to prefixed names in the paths.
    :param add_position: add context position to child elements that appear multiple times. \
    The option is applied at every depth of the tree.
    :return: a generator of (element, path) couples.
    """
    if tag == "*":
        tag = None
    if tag is None or elem.tag == tag:
        yield elem, path

    if add_position:
        # Map each tag shared by two or more children to the next position
        # to assign, starting from 1.
        children_tags = Counter(e.tag for e in elem)
        positions = Counter(t for t in children_tags if children_tags[t] > 1)
    else:
        positions = ()

    for child in elem:
        if callable(child.tag):
            continue  # Skip nodes whose tag is a callable (e.g. lxml comments)

        child_name = child.tag if namespaces is None else qname_to_prefixed(child.tag, namespaces)
        if path == '/':
            child_path = '/%s' % child_name
        elif path:
            child_path = '/'.join((path, child_name))
        else:
            child_path = child_name

        if child.tag in positions:
            child_path += '[%d]' % positions[child.tag]
            positions[child.tag] += 1

        # Propagate add_position, otherwise repeated descendants below the
        # first level would get ambiguous paths without a position.
        for _child, _child_path in etree_iterpath(child, tag, child_path, namespaces, add_position):
            yield _child, _child_path
def etree_getpath(elem, root, namespaces=None, relative=True, add_position=False):
    """
    Builds the XPath path that leads from *root* to its descendant *elem*.

    :param elem: the descendant element.
    :param root: the root element.
    :param namespaces: is an optional mapping from namespace prefix to URI.
    :param relative: returns a relative path.
    :param add_position: add context position to child elements that appear multiple times.
    :return: An XPath expression or `None` if *elem* is not a descendant of *root*.
    """
    # Path of the root element: '.' for relative paths, otherwise '/' followed
    # by the root tag (in prefixed form when a namespace map is provided).
    if relative:
        start_path = '.'
    elif namespaces:
        start_path = '/%s' % qname_to_prefixed(root.tag, namespaces)
    else:
        start_path = '/%s' % root.tag

    # Only the elements with the same tag are visited, the match is by identity
    candidates = etree_iterpath(root, elem.tag, start_path, namespaces, add_position)
    for node, node_path in candidates:
        if node is elem:
            return node_path
    return None
def etree_last_child(elem):
    """
    Returns the last child of the element whose tag is not a callable (as happens
    for lxml comments), or `None` if the element has no such child.
    """
    regular_children = (node for node in reversed(elem) if not callable(node.tag))
    return next(regular_children, None)
def etree_child_index(elem, child):
    """
    Returns the position of *child* among the children of *elem*. The match is
    done by identity. Raises XMLSchemaValueError if *child* is not a child of *elem*.
    """
    for position, node in enumerate(elem):
        if node is child:
            return position
    raise XMLSchemaValueError("%r is not a child of %r" % (child, elem))
def etree_elements_assert_equal(elem, other, strict=True, skip_comments=True):
    """
    Tests the equality of two XML Element trees.

    In non-strict mode the tags of the descendants are compared after mapping them
    to the last namespace found in the first tree, leading/trailing whitespace
    of attribute values and tails is ignored, texts are compared ignoring all
    whitespace, and differing texts or attribute values are still accepted
    when they represent the same float.

    :param elem: the master Element tree, reference for namespace mapping.
    :param other: the other Element tree that has to be compared.
    :param strict: asserts strictly equality. `True` for default.
    :param skip_comments: skip lxml comments of both trees during the comparison.
    :raise: an AssertionError containing information about first difference encountered.
    """
    _REGEX_SPACES = re.compile(r'\s+')

    def is_skipped(e):
        # Short-circuits on skip_comments, so the comment tag is read only when needed
        return skip_comments and e.tag is lxml_etree_comment

    def equal_as_floats(value, other_value):
        # True only if both the strings are float literals with the same value
        try:
            return float(value) == float(other_value)
        except (ValueError, TypeError):
            return False

    # Explicit raises are used in place of assert statements because the
    # latter are removed when Python runs with the -O option.
    other_elements = (e for e in other.iter() if not is_skipped(e))
    namespace = ''
    for e1 in elem.iter():
        if is_skipped(e1):
            continue

        try:
            e2 = next(other_elements)
        except StopIteration:
            raise AssertionError("Second tree ends before the first: %r." % e1)

        # Tag
        if strict or e1 is elem:
            if e1.tag != e2.tag:
                raise AssertionError("%r != %r: tags differ." % (e1, e2))
        else:
            namespace = get_namespace(e1.tag) or namespace
            if get_qname(namespace, e1.tag) != get_qname(namespace, e2.tag):
                raise AssertionError("%r != %r: tags differ." % (e1, e2))

        # Attributes
        if e1.attrib != e2.attrib:
            if strict:
                raise AssertionError("%r != %r: attribute differ: %r != %r." % (e1, e2, e1.attrib, e2.attrib))

            # Compare the keys as sets: keys() may return an ordered list
            if set(e1.attrib.keys()) != set(e2.attrib.keys()):
                raise AssertionError(
                    "%r != %r: attribute keys differ: %r != %r." % (e1, e2, e1.attrib.keys(), e2.attrib.keys())
                )
            for k in e1.attrib:
                a1, a2 = e1.attrib[k].strip(), e2.attrib[k].strip()
                if a1 != a2 and not equal_as_floats(a1, a2):
                    raise AssertionError(
                        "%r != %r: attribute %r differ: %r != %r." % (e1, e2, k, a1, a2)
                    )

        # Number of children
        if skip_comments:
            nc1 = len([c for c in e1 if c.tag is not lxml_etree_comment])
            nc2 = len([c for c in e2 if c.tag is not lxml_etree_comment])
        else:
            nc1 = len(e1)
            nc2 = len(e2)
        if nc1 != nc2:
            raise AssertionError("%r != %r: children number differ: %r != %r." % (e1, e2, nc1, nc2))

        # Text
        if e1.text != e2.text:
            message = "%r != %r: texts differ: %r != %r." % (e1, e2, e1.text, e2.text)
            if strict:
                raise AssertionError(message)
            elif e1.text is None:
                if e2.text.strip():
                    raise AssertionError(message)
            elif e2.text is None:
                if e1.text.strip():
                    raise AssertionError(message)
            else:
                t1, t2 = e1.text.strip(), e2.text.strip()
                # Pattern.sub() takes the replacement first, then the string
                if _REGEX_SPACES.sub('', t1) != _REGEX_SPACES.sub('', t2) and not equal_as_floats(t1, t2):
                    raise AssertionError(message)

        # Tail
        if e1.tail != e2.tail:
            message = "%r != %r: tails differ: %r != %r." % (e1, e2, e1.tail, e2.tail)
            if strict:
                raise AssertionError(message)
            elif e1.tail is None:
                if e2.tail.strip():
                    raise AssertionError(message)
            elif e2.tail is None:
                if e1.tail.strip():
                    raise AssertionError(message)
            elif e1.tail.strip() != e2.tail.strip():
                raise AssertionError(message)

    # The second tree must be exhausted too
    try:
        e2 = next(other_elements)
    except StopIteration:
        pass
    else:
        raise AssertionError("First tree ends before the second: %r." % e2)