Package xmlschema_acue :: Module resources

Source Code for Module xmlschema_acue.resources

  1  # -*- coding: utf-8 -*- 
  2  # 
  3  # Copyright (c), 2016-2019, SISSA (International School for Advanced Studies). 
  4  # All rights reserved. 
  5  # This file is distributed under the terms of the MIT License. 
  6  # See the file 'LICENSE' in the root directory of the present 
  7  # distribution, or http://opensource.org/licenses/MIT. 
  8  # 
  9  # @author Davide Brunato <brunato@sissa.it> 
 10  # 
 11  import os.path 
 12  import re 
 13  import codecs 
 14   
 15  from xmlschema_acue.compat import ( 
 16      PY3, StringIO, BytesIO, string_base_type, urlopen, urlsplit, urljoin, urlunsplit, 
 17      pathname2url, URLError, uses_relative 
 18  ) 
 19  from xmlschema_acue.exceptions import XMLSchemaTypeError, XMLSchemaValueError, XMLSchemaURLError, XMLSchemaOSError 
 20  from xmlschema_acue.qnames import XSI_SCHEMA_LOCATION, XSI_NONS_SCHEMA_LOCATION 
 21  from xmlschema_acue.helpers import get_namespace 
 22  from xmlschema_acue.etree import ElementTree, PyElementTree, SafeXMLParser, is_etree_element, etree_tostring 
 23   
 24   
# Accepted values for the 'defuse' option of XMLResource: the safe XML parser is
# applied always, only when the data comes from a remote URL, or never.
DEFUSE_MODES = ('always', 'remote', 'never')
def is_remote_url(url):
    """
    Tells whether an URL refers to a remote resource.

    :param url: an URL string or `None`.
    :return: `True` if the URL has a scheme that is neither empty nor 'file', \
    `False` otherwise (also when the argument is `None`).
    """
    if url is None:
        return False
    scheme = urlsplit(url).scheme
    return not (scheme == '' or scheme == 'file')
30
def url_path_is_directory(url):
    """
    Checks if the path component of an URL is an existing directory
    of the local filesystem.

    :param url: an URL or a file path.
    :return: the result of `os.path.isdir` applied to the URL's path part.
    """
    path = urlsplit(url).path
    return os.path.isdir(path)
34
def url_path_is_file(url):
    """
    Checks if the path component of an URL is an existing regular file
    of the local filesystem.

    :param url: an URL or a file path.
    :return: the result of `os.path.isfile` applied to the URL's path part.
    """
    path = urlsplit(url).path
    return os.path.isfile(path)
38
def normalize_url(url, base_url=None, keep_relative=False):
    """
    Builds a normalized URL, optionally joining the argument with a base URL.
    When the scheme is missing it defaults to 'file', and backslashes are
    converted to slashes. File paths are joined with os.path.join instead
    of urljoin.

    :param url: a relative or absolute URL.
    :param base_url: the reference base URL used to build the normalized URL. \
    For compatibility between "os.path.join" and "urljoin" a trailing '/' is \
    appended to non-empty base paths.
    :param keep_relative: if `True` relative file paths are kept relative, even \
    if the result is not strictly conformant to the URL format specification.
    :return: A normalized URL.
    """
    if base_url is not None:
        base_url = base_url.replace('\\', '/')
        base_parts = urlsplit(base_url)

        # Rebuild the base URL making sure that a non-empty path ends with a slash
        base_path = base_parts[2]
        if base_path and base_path[-1] != '/':
            base_path = base_path + '/'
        base_url = urlunsplit((base_parts[0], base_parts[1], base_path, base_parts[3], base_parts[4]))

        if base_parts.scheme in uses_relative:
            base_parts = urlsplit(base_url)
        else:
            # An unknown scheme is handled as a local path (e.g. a drive letter)
            base_parts = urlsplit('file:///{}'.format(base_url))

        if base_parts.scheme not in ('', 'file'):
            url = urljoin(base_url, url)
        else:
            # Local base: paths are joined with os.path.join, not with urljoin
            parts = urlsplit(url)
            if parts.scheme not in ('', 'file'):
                url = urljoin(base_url, url)
            elif not parts.netloc or base_parts.netloc == parts.netloc:
                # Paths are joined only when the host parts (netloc) don't conflict
                joined_path = os.path.normpath(os.path.join(base_parts.path, parts.path))
                url = urlunsplit(('', base_parts.netloc, joined_path, parts.query, parts.fragment))

    url = url.replace('\\', '/')
    parts = urlsplit(url, scheme='file')
    scheme, netloc, path, query, fragment = parts

    if scheme not in uses_relative:
        return 'file:///{}'.format(parts.geturl())  # Eg. k:/Python/lib/....
    if scheme != 'file':
        return urlunsplit((scheme, netloc, pathname2url(path), query, fragment))
    if os.path.isabs(path):
        return parts.geturl()
    if keep_relative:
        # urlunsplit can't be used with a scheme here because it would turn
        # the relative path into an absolute one.
        return 'file:{}'.format(urlunsplit(('', netloc, path, query, fragment)))
    return urlunsplit((scheme, netloc, os.path.abspath(path), query, fragment))
def fetch_resource(location, base_url=None, timeout=30):
    """
    Checks that a resource is accessible by opening it. On success the
    normalized URL of the resource is returned, otherwise an
    XMLSchemaURLError is raised.

    :param location: an URL or a file path.
    :param base_url: reference base URL for normalizing local and relative URLs.
    :param timeout: the timeout in seconds for the connection attempt in case of remote data.
    :return: a normalized URL.
    """
    if not location:
        raise XMLSchemaValueError("'location' argument must contains a not empty string.")

    url = normalize_url(location, base_url)
    try:
        resource = urlopen(url, timeout=timeout)
    except URLError as err:
        # Second attempt: normalize the location ignoring the base URL. If this
        # fails too, the reason of the first failure is the one reported.
        first_reason = err.reason
        url = normalize_url(location)
        try:
            resource = urlopen(url, timeout=timeout)
        except URLError:
            raise XMLSchemaURLError(reason=first_reason)

    # One of the two attempts succeeded: the resource was opened only as a probe
    resource.close()
    return url
138
def fetch_schema_locations(source, locations=None, **resource_options):
    """
    Finds the schema URL for the root of an XML data source, together with
    the list of location hints. Raises an XMLSchemaValueError if no accessible
    schema location is found.

    :param source: an Element or an Element Tree with XML data or an URL or a file-like object.
    :param locations: a dictionary or dictionary items with schema location hints.
    :param resource_options: keyword arguments for providing :class:`XMLResource` class init options.
    :return: A tuple with the URL referring to the first reachable schema resource, a list \
    of dictionary items with normalized location hints.
    """
    base_url = resource_options.pop('base_url', None)
    timeout = resource_options.pop('timeout', 30)
    resource = XMLResource(source, base_url, timeout=timeout, **resource_options)

    base_url = resource.base_url
    namespace = resource.namespace
    locations = resource.get_locations(locations)

    for hint in locations:
        # Only the hints declared for the namespace of the root are candidates
        if hint[0] != namespace:
            continue
        try:
            schema_url = fetch_resource(hint[1], base_url, timeout)
        except XMLSchemaURLError:
            continue  # not reachable, try the next hint
        return schema_url, locations

    raise XMLSchemaValueError("not found a schema for XML data resource %r (namespace=%r)." % (source, namespace))
164
def fetch_schema(source, locations=None, **resource_options):
    """
    Finds the schema URL for the root of an XML data source. Delegates to
    `fetch_schema_locations` and discards the list of location hints, so it
    raises the same error when no accessible schema location is found.

    :param source: an Element or an Element Tree with XML data or an URL or a file-like object.
    :param locations: a dictionary or dictionary items with schema location hints.
    :param resource_options: keyword arguments for providing :class:`XMLResource` class init options.
    :return: An URL referring to a reachable schema resource.
    """
    schema_url, _ = fetch_schema_locations(source, locations, **resource_options)
    return schema_url
177
def fetch_namespaces(source, **resource_options):
    """
    Extracts the namespaces, with their prefixes, from an XML data source.
    The work is delegated to :meth:`XMLResource.get_namespaces`, using a
    resource built with a default timeout of 30 seconds.

    :param source: a string containing the XML document or file path or an url \
    or a file like object or an ElementTree or Element.
    :param resource_options: keyword arguments for providing :class:`XMLResource` init options.
    :return: A dictionary for mapping namespace prefixes to full URI.
    """
    timeout = resource_options.pop('timeout', 30)
    resource = XMLResource(source, timeout=timeout, **resource_options)
    return resource.get_namespaces()
193
def load_xml_resource(source, element_only=True, **resource_options):
    """
    Loads an XML data source into an Element tree. Returns the root Element and,
    on request, also the XML text and the URL when they are available. Suited
    for XML data files of small or medium size, like XSD schemas.

    :param source: an URL, a filename path or a file-like object.
    :param element_only: if True the function returns only the root Element of the tree.
    :param resource_options: keyword arguments for providing :class:`XMLResource` init options.
    :return: a tuple with three items (root Element, XML text and XML URL) or \
    only the root Element if 'element_only' argument is True.
    """
    # Unlike XMLResource, this helper is not lazy unless explicitly requested
    lazy = resource_options.pop('lazy', False)
    resource = XMLResource(source, lazy=lazy, **resource_options)
    if not element_only:
        resource.load()
        return resource.root, resource.text, resource.url
    return resource.root
213
class XMLResource(object):
    """
    XML resource reader based on ElementTree and urllib.

    :param source: a string containing the XML document or file path or an URL or a file like \
    object or an ElementTree or an Element.
    :param base_url: is an optional base URL, used for the normalization of relative paths when \
    the URL of the resource can't be obtained from the source argument.
    :param defuse: set the usage of SafeXMLParser for XML data. Can be 'always', 'remote' or 'never'. \
    Default is 'remote' that uses the defusedxml only when loading remote data.
    :param timeout: the timeout in seconds for the connection attempt in case of remote data.
    :param lazy: if set to `False` the source is fully loaded into and processed from memory. Default is `True`.
    """
    def __init__(self, source, base_url=None, defuse='remote', timeout=300, lazy=True):
        if base_url is not None and not isinstance(base_url, string_base_type):
            raise XMLSchemaValueError(u"'base_url' argument has to be a string: {!r}".format(base_url))

        self._root = self._document = self._url = self._text = None
        self._base_url = base_url
        self.defuse = defuse
        self.timeout = timeout
        self._lazy = lazy
        # Must be the last assignment: setting 'source' triggers the loading of
        # the resource (see __setattr__), which reads the options set above.
        self.source = source

    def __str__(self):
        # noinspection PyCompatibility,PyUnresolvedReferences
        return unicode(self).encode("utf-8")  # @UndefinedVariable

    def __unicode__(self):
        return self.__repr__()

    if PY3:
        __str__ = __unicode__

    def __repr__(self):
        if self._root is None:
            return u'%s()' % self.__class__.__name__
        elif self._url is None:
            return u'%s(tag=%r)' % (self.__class__.__name__, self._root.tag)
        else:
            return u'%s(tag=%r, basename=%r)' % (
                self.__class__.__name__, self._root.tag, os.path.basename(self._url)
            )

    def __setattr__(self, name, value):
        """
        Validates the public options and (re)loads the resource when the
        'source' attribute is assigned.

        :raises XMLSchemaValueError: if the value is not valid for the attribute.
        """
        if name == 'source':
            self._root, self._document, self._text, self._url = self._fromsource(value)
        elif name == 'defuse' and value not in DEFUSE_MODES:
            raise XMLSchemaValueError(u"'defuse' attribute: {!r} is not a defuse mode.".format(value))
        elif name == 'timeout' and (not isinstance(value, int) or value <= 0):
            raise XMLSchemaValueError(u"'timeout' attribute must be a positive integer: {!r}".format(value))
        elif name == 'lazy' and not isinstance(value, bool):
            # NOTE(review): __init__ stores the flag as '_lazy', so this check applies
            # only if a caller assigns a public 'lazy' attribute.
            raise XMLSchemaValueError(u"'lazy' attribute must be a boolean: {!r}".format(value))
        super(XMLResource, self).__setattr__(name, value)

    def _fromsource(self, source):
        """
        Loads the resource from the source argument.

        :param source: an Element, a string (XML data or an URL/path), a StringIO, \
        another file-like object or an ElementTree.
        :return: a tuple with four items: root Element, ElementTree document, XML text \
        and URL. The items that are not available are `None`.
        :raises XMLSchemaTypeError: if the source type is not usable.
        """
        url, lazy = None, self._lazy
        if is_etree_element(source):
            return source, None, None, None  # Source is already an Element --> nothing to load
        elif isinstance(source, string_base_type):
            # The URL is temporarily unset because the parse helpers read self._url
            # for choosing the defuse policy, and a string of XML data has no URL.
            _url, self._url = self._url, None
            try:
                if lazy:
                    # check if source is a string containing a valid XML root
                    for _, root in self.iterparse(StringIO(source), events=('start',)):
                        return root, None, source, None
                else:
                    return self.fromstring(source), None, source, None
            except (ElementTree.ParseError, PyElementTree.ParseError, UnicodeEncodeError):
                # A single-line string that is not XML is retried below as an URL or a path
                if '\n' in source:
                    raise
            finally:
                self._url = _url
            url = normalize_url(source) if '\n' not in source else None

        elif isinstance(source, StringIO):
            _url, self._url = self._url, None
            try:
                if lazy:
                    for _, root in self.iterparse(source, events=('start',)):
                        return root, None, source.getvalue(), None
                else:
                    document = self.parse(source)
                    return document.getroot(), document, source.getvalue(), None
            finally:
                self._url = _url

        elif hasattr(source, 'read'):
            # source should be a file-like object
            try:
                if hasattr(source, 'url'):
                    url = source.url
                else:
                    url = normalize_url(source.name)
            except AttributeError:
                pass  # no URL can be derived: the type error below is raised
            else:
                _url, self._url = self._url, url
                try:
                    if lazy:
                        for _, root in self.iterparse(source, events=('start',)):
                            return root, None, None, url
                    else:
                        document = self.parse(source)
                        return document.getroot(), document, None, url
                finally:
                    self._url = _url

        else:
            # Try ElementTree object at last
            try:
                root = source.getroot()
            except (AttributeError, TypeError):
                pass
            else:
                if is_etree_element(root):
                    return root, source, None, None

        if url is None:
            raise XMLSchemaTypeError(
                "wrong type %r for 'source' attribute: an ElementTree object or an Element instance or a "
                "string containing XML data or an URL or a file-like object is required." % type(source)
            )
        else:
            resource = urlopen(url, timeout=self.timeout)
            _url, self._url = self._url, url
            try:
                if lazy:
                    for _, root in self.iterparse(resource, events=('start',)):
                        return root, None, None, url
                else:
                    document = self.parse(resource)
                    root = document.getroot()
                    return root, document, None, url
            finally:
                self._url = _url
                resource.close()

    @property
    def root(self):
        """The XML tree root Element."""
        return self._root

    @property
    def document(self):
        """
        The ElementTree document, `None` if the instance is lazy or is not created
        from another document or from an URL.
        """
        return self._document

    @property
    def text(self):
        """The XML text source, `None` if it's not available."""
        return self._text

    @property
    def url(self):
        """The source URL, `None` if the instance is created from an Element tree or from a string."""
        return self._url

    @property
    def base_url(self):
        """The base URL for completing relative locations."""
        return os.path.dirname(self._url) if self._url else self._base_url

    @property
    def namespace(self):
        """The namespace of the XML document."""
        return get_namespace(self._root.tag) if self._root is not None else None

    @staticmethod
    def defusing(source):
        """
        Defuse an XML source, raising an `ElementTree.ParseError` if the source contains entity
        definitions or remote entity loading.

        :param source: a filename or file object containing XML data.
        """
        parser = SafeXMLParser(target=PyElementTree.TreeBuilder())
        try:
            for _, _ in PyElementTree.iterparse(source, ('start',), parser):
                break
        except PyElementTree.ParseError as err:
            raise ElementTree.ParseError(str(err))

    def parse(self, source):
        """
        An equivalent of *ElementTree.parse()* that can protect from XML entities attacks. When
        protection is applied XML data are loaded and defused before building the ElementTree instance.

        :param source: a filename or file object containing XML data.
        :returns: an ElementTree instance.
        """
        if self.defuse == 'always' or (self.defuse == 'remote' and is_remote_url(self._url)):
            # The data is read once and wrapped twice: the first stream is consumed
            # by the defusing check, the second one by the real parser.
            text = source.read()
            if isinstance(text, bytes):
                self.defusing(BytesIO(text))
                return ElementTree.parse(BytesIO(text))
            else:
                self.defusing(StringIO(text))
                return ElementTree.parse(StringIO(text))
        else:
            return ElementTree.parse(source)

    def iterparse(self, source, events=None):
        """
        An equivalent of *ElementTree.iterparse()* that can protect from XML entities attacks.
        When protection is applied the iterator yields pure-Python Element instances.

        :param source: a filename or file object containing XML data.
        :param events: a list of events to report back. If omitted, only "end" events are reported.
        """
        if self.defuse == 'always' or (self.defuse == 'remote' and is_remote_url(self._url)):
            parser = SafeXMLParser(target=PyElementTree.TreeBuilder())
            try:
                return PyElementTree.iterparse(source, events, parser)
            except PyElementTree.ParseError as err:
                raise ElementTree.ParseError(str(err))
        else:
            return ElementTree.iterparse(source, events)

    def fromstring(self, text):
        """
        An equivalent of *ElementTree.fromstring()* that can protect from XML entities attacks.

        :param text: a string containing XML data.
        :returns: the root Element instance.
        """
        if self.defuse == 'always' or (self.defuse == 'remote' and is_remote_url(self._url)):
            self.defusing(StringIO(text))
        return ElementTree.fromstring(text)

    def tostring(self, indent='', max_lines=None, spaces_for_tab=4, xml_declaration=False):
        """Generates a string representation of the XML resource."""
        return etree_tostring(self._root, self.get_namespaces(), indent, max_lines, spaces_for_tab, xml_declaration)

    def copy(self, **kwargs):
        """Resource copy method. Change init parameters with keyword arguments."""
        obj = type(self)(
            source=self.source,
            base_url=kwargs.get('base_url', self.base_url),
            defuse=kwargs.get('defuse', self.defuse),
            timeout=kwargs.get('timeout', self.timeout),
            lazy=kwargs.get('lazy', self._lazy)
        )
        # Carry over an already loaded text, so the copy doesn't need another load()
        if obj._text is None and self._text is not None:
            obj._text = self._text
        return obj

    def open(self):
        """
        Returns an opened resource reader object for the instance URL.

        :raises XMLSchemaValueError: if the resource has no URL.
        :raises XMLSchemaURLError: if the URL can't be opened.
        """
        if self._url is None:
            raise XMLSchemaValueError("can't open, the resource has no URL associated.")
        try:
            return urlopen(self._url, timeout=self.timeout)
        except URLError as err:
            raise XMLSchemaURLError(reason="cannot access to resource %r: %s" % (self._url, err.reason))

    def load(self):
        """
        Loads the XML text from the data source. If the data source is an Element
        the source XML text can't be retrieved.

        :raises XMLSchemaOSError: if the data can't be read from the opened resource.
        """
        if self._url is None:
            return  # Created from Element or text source --> already loaded

        resource = self.open()
        try:
            data = resource.read()
        except (OSError, IOError) as err:
            raise XMLSchemaOSError("cannot load data from %r: %s" % (self._url, err))
        finally:
            resource.close()

        # UTF-8 is tried first, with a fallback to ISO-8859-1
        try:
            self._text = data.decode('utf-8') if PY3 else data.encode('utf-8')
        except UnicodeDecodeError:
            if PY3:
                self._text = data.decode('iso-8859-1')
            else:
                with codecs.open(urlsplit(self._url).path, mode='rb', encoding='iso-8859-1') as f:
                    self._text = f.read().encode('iso-8859-1')

    def is_lazy(self):
        """Returns `True` if the XML resource is lazy."""
        return self._lazy

    def is_loaded(self):
        """Returns `True` if the XML text of the data source is loaded."""
        return self._text is not None

    def iter(self, tag=None):
        """
        XML resource tree elements lazy iterator.

        For a lazy resource the data is parsed again from the URL, or from the XML
        text, and each element is yielded at its 'start' event and cleared at its
        'end' event. If the resource is not lazy, or if it has neither an URL nor
        a text (it was built from an Element or an ElementTree), the in-memory
        tree is iterated instead.

        :param tag: if provided only the elements with this tag are yielded.
        """
        if not self._lazy or (self._url is None and self._text is None):
            # Without an URL or a text there is nothing to parse again: building
            # a stream from a missing text would fail on empty data.
            for elem in self._root.iter(tag):
                yield elem
            return
        elif self._url is not None:
            resource = urlopen(self._url, timeout=self.timeout)
        else:
            resource = StringIO(self._text)

        try:
            for event, elem in self.iterparse(resource, events=('start', 'end')):
                if event == 'end':
                    elem.clear()  # free the memory of the elements already processed
                elif tag is None or elem.tag == tag:
                    yield elem
        finally:
            resource.close()

    def iter_location_hints(self):
        """Yields schema location hints from the XML tree."""
        for elem in self.iter():
            try:
                locations = elem.attrib[XSI_SCHEMA_LOCATION]
            except KeyError:
                pass
            else:
                # The attribute value is a whitespace separated list of (namespace, URL) pairs
                locations = locations.split()
                for ns, url in zip(locations[0::2], locations[1::2]):
                    yield ns, url

            try:
                locations = elem.attrib[XSI_NONS_SCHEMA_LOCATION]
            except KeyError:
                pass
            else:
                for url in locations.split():
                    yield '', url

    def get_namespaces(self):
        """
        Extracts namespaces with related prefixes from the XML resource. If a duplicate
        prefix declaration is encountered then adds the namespace using a different prefix,
        but only in the case if the namespace URI is not already mapped by another prefix.

        :return: A dictionary for mapping namespace prefixes to full URI.
        """
        def update_nsmap(prefix, uri):
            if prefix not in nsmap and (prefix or not local_root):
                nsmap[prefix] = uri
            elif not any(uri == ns for ns in nsmap.values()):
                if not prefix:
                    # Derive a prefix from the trailing word of the URI, if any
                    try:
                        prefix = re.search(r'(\w+)$', uri.strip()).group()
                    except AttributeError:
                        return

                # Make the prefix unique adding or incrementing a numeric suffix
                while prefix in nsmap:
                    match = re.search(r'(\d+)$', prefix)
                    if match:
                        index = int(match.group()) + 1
                        prefix = prefix[:match.span()[0]] + str(index)
                    else:
                        prefix += '2'
                nsmap[prefix] = uri

        # True if the root tag is not namespace-qualified
        local_root = self.root.tag[0] != '{'
        nsmap = {}

        if self._url is not None:
            resource = self.open()
            try:
                for event, node in self.iterparse(resource, events=('start-ns',)):
                    update_nsmap(*node)
            except (ElementTree.ParseError, PyElementTree.ParseError, UnicodeEncodeError):
                pass  # best effort: keep the namespaces collected before the error
            finally:
                resource.close()
        elif isinstance(self._text, string_base_type):
            try:
                for event, node in self.iterparse(StringIO(self._text), events=('start-ns',)):
                    update_nsmap(*node)
            except (ElementTree.ParseError, PyElementTree.ParseError, UnicodeEncodeError):
                pass
        else:
            # Warning: can extracts namespace information only from lxml etree structures
            try:
                for elem in self._root.iter():
                    for k, v in elem.nsmap.items():
                        update_nsmap(k if k is not None else '', v)
            except (AttributeError, TypeError):
                pass  # Not an lxml's tree or element

        return nsmap

    def get_locations(self, locations=None):
        """
        Returns a list of schema location hints. The locations are normalized using the
        base URL of the instance. The *locations* argument can be a dictionary or a list
        of namespace resources, that are inserted before the schema location hints extracted
        from the XML resource.
        """
        base_url = self.base_url
        location_hints = []
        if locations is not None:
            try:
                for ns, value in locations.items():
                    if isinstance(value, list):
                        location_hints.extend([(ns, normalize_url(url, base_url)) for url in value])
                    else:
                        location_hints.append((ns, normalize_url(value, base_url)))
            except AttributeError:
                # Not a dictionary: a sequence of (namespace, URL) couples is expected
                location_hints.extend([(ns, normalize_url(url, base_url)) for ns, url in locations])

        location_hints.extend([(ns, normalize_url(url, base_url)) for ns, url in self.iter_location_hints()])
        return location_hints
624