xmlschema_acue.resources

"""
XML resource reader based on ElementTree and urllib.

:param source: a string containing the XML document or file path or a URL or a file-like \
object or an ElementTree or an Element.
:param base_url: is an optional base URL, used for the normalization of relative paths when \
the URL of the resource can't be obtained from the source argument.
:param defuse: set the usage of SafeXMLParser for XML data. Can be 'always', 'remote' or 'never'. \
Default is 'remote' that uses the defusedxml only when loading remote data.
:param timeout: the timeout in seconds for the connection attempt in case of remote data.
:param lazy: if set to `False` the source is fully loaded into and processed from memory. Default is `True`.
"""

def __init__(self, source, base_url=None, defuse='remote', timeout=300, lazy=True):
    """
    Store the resource settings and load the root of *source*.

    :param source: the XML data source, see the ``source`` attribute handling \
    in ``__setattr__``.
    :param base_url: optional base URL; must be a string when provided.
    :param defuse: the defuse mode, validated on assignment.
    :param timeout: the connection timeout in seconds, validated on assignment.
    :param lazy: stored as ``_lazy`` and read back when the source is loaded.
    :raises XMLSchemaValueError: if *base_url* is provided but is not a string.
    """
    if not (base_url is None or isinstance(base_url, string_base_type)):
        raise XMLSchemaValueError(u"'base_url' argument has to be a string: {!r}".format(base_url))

    # Start from an empty state before any loading takes place.
    self._root = None
    self._document = None
    self._url = None
    self._text = None

    self._base_url = base_url
    self.defuse = defuse
    self.timeout = timeout
    self._lazy = lazy

    # Assigned last: setting 'source' triggers _fromsource(), which reads the
    # settings stored above.
    self.source = source

238

def __str__(self):
    """Return the unicode form of the instance encoded as UTF-8 (Python 2 variant)."""
    # noinspection PyCompatibility,PyUnresolvedReferences
    as_text = unicode(self)  # @UndefinedVariable
    return as_text.encode("utf-8")

242

def __unicode__(self):
    """Return the text form of the instance, which is the same string as its repr."""
    representation = self.__repr__()
    return representation

if PY3:
    # When the PY3 flag is set, __str__ is rebound to the text conversion,
    # replacing the UTF-8 encoding variant.
    __str__ = __unicode__

def __repr__(self):
    """
    Build the representation string: the class name alone when no root is set,
    plus the root tag when available, plus the URL basename when a URL is set.
    """
    class_name = self.__class__.__name__
    if self._root is None:
        return u'%s()' % class_name
    if self._url is None:
        return u'%s(tag=%r)' % (class_name, self._root.tag)
    return u'%s(tag=%r, basename=%r)' % (
        class_name, self._root.tag, os.path.basename(self._url)
    )

258

def __setattr__(self, name, value):
    """
    Validate the public settings and reload the tree when 'source' is assigned.

    Assigning ``source`` first loads it through ``_fromsource()`` and stores the
    resulting root, document, text and URL. ``defuse``, ``timeout`` and ``lazy``
    are checked before being stored; any other name is stored unchanged.

    :raises XMLSchemaValueError: if an invalid value is given for ``defuse``, \
    ``timeout`` or ``lazy``.
    """
    if name == 'source':
        loaded = self._fromsource(value)
        self._root, self._document, self._text, self._url = loaded
    elif name == 'defuse':
        if value not in DEFUSE_MODES:
            raise XMLSchemaValueError(u"'defuse' attribute: {!r} is not a defuse mode.".format(value))
    elif name == 'timeout':
        if not isinstance(value, int) or value <= 0:
            raise XMLSchemaValueError(u"'timeout' attribute must be a positive integer: {!r}".format(value))
    elif name == 'lazy':
        if not isinstance(value, bool):
            raise XMLSchemaValueError(u"'lazy' attribute must be a boolean: {!r}".format(value))
    super(XMLResource, self).__setattr__(name, value)

269

def _fromsource(self, source):
    """
    Load the XML root from *source*.

    While data is parsed ``self._url`` is temporarily replaced by the URL of the
    data being loaded (or ``None``), because the parsing helpers decide whether
    to defuse by testing ``self._url``; the previous value is always restored.

    :param source: an Element, a string (XML data or a location), a StringIO, \
    another file-like object or an ElementTree object.
    :returns: a 4-tuple ``(root, document, text, url)``; the entries that are \
    not available are `None`.
    :raises XMLSchemaTypeError: if no root and no URL can be obtained from *source*.
    """
    url = None
    lazy = self._lazy

    if is_etree_element(source):
        # Source is already an Element --> nothing to load
        return source, None, None, None

    if isinstance(source, string_base_type):
        saved_url = self._url
        self._url = None
        try:
            if lazy:
                # check if source is a string containing a valid XML root
                for _, root in self.iterparse(StringIO(source), events=('start',)):
                    return root, None, source, None
            else:
                return self.fromstring(source), None, source, None
        except (ElementTree.ParseError, PyElementTree.ParseError, UnicodeEncodeError):
            # A multi-line string is treated as XML data, so the error is real;
            # a single-line string is retried below as a location.
            if '\n' in source:
                raise
        finally:
            self._url = saved_url

        if '\n' not in source:
            url = normalize_url(source)

    elif isinstance(source, StringIO):
        saved_url = self._url
        self._url = None
        try:
            if lazy:
                for _, root in self.iterparse(source, events=('start',)):
                    return root, None, source.getvalue(), None
            else:
                document = self.parse(source)
                return document.getroot(), document, source.getvalue(), None
        finally:
            self._url = saved_url

    elif hasattr(source, 'read'):
        # source should be a file-like object
        try:
            url = source.url if hasattr(source, 'url') else normalize_url(source.name)
        except AttributeError:
            pass
        else:
            saved_url = self._url
            self._url = url
            try:
                if lazy:
                    for _, root in self.iterparse(source, events=('start',)):
                        return root, None, None, url
                else:
                    document = self.parse(source)
                    return document.getroot(), document, None, url
            finally:
                self._url = saved_url

    else:
        # Try ElementTree object at last
        try:
            root = source.getroot()
        except (AttributeError, TypeError):
            pass
        else:
            if is_etree_element(root):
                return root, source, None, None

    if url is None:
        raise XMLSchemaTypeError(
            "wrong type %r for 'source' attribute: an ElementTree object or an Element instance or a "
            "string containing XML data or an URL or a file-like object is required." % type(source)
        )

    # Nothing could be parsed directly: fetch the data from the URL.
    resource = urlopen(url, timeout=self.timeout)
    saved_url = self._url
    self._url = url
    try:
        if lazy:
            for _, root in self.iterparse(resource, events=('start',)):
                return root, None, None, url
        else:
            document = self.parse(resource)
            root = document.getroot()
            return root, document, None, url
    finally:
        self._url = saved_url
        resource.close()

@property
def root(self):
    """The root Element of the XML tree."""
    tree_root = self._root
    return tree_root

@property
def document(self):
    """
    The ElementTree document. It is `None` when the instance is lazy, or when
    it has not been created from another document or from a URL.
    """
    tree = self._document
    return tree

@property
def text(self):
    """The source XML text, or `None` when the text is not available."""
    xml_text = self._text
    return xml_text

@property
def url(self):
    """The URL of the source, or `None` for instances created from an Element tree or from a string."""
    source_url = self._url
    return source_url

@property
def base_url(self):
    """
    The base URL used for completing relative locations: the directory part of
    the source URL when one is set, otherwise the base URL given at creation.
    """
    if self._url:
        return os.path.dirname(self._url)
    return self._base_url

@property
def namespace(self):
    """The namespace of the XML document, taken from the root tag; `None` if there is no root."""
    if self._root is None:
        return None
    return get_namespace(self._root.tag)

@staticmethod
def defusing(source):
    """
    Defuse an XML source, raising an `ElementTree.ParseError` if the source contains
    entity definitions or remote entity loading.

    The source is parsed with a SafeXMLParser only up to the first 'start' event.

    :param source: a filename or file object containing XML data.
    :raises ElementTree.ParseError: in place of a `PyElementTree.ParseError` \
    raised during the check.
    """
    safe_parser = SafeXMLParser(target=PyElementTree.TreeBuilder())
    try:
        start_events = PyElementTree.iterparse(source, ('start',), safe_parser)
        for _ in start_events:
            # Reaching the first element start is enough for the check.
            break
    except PyElementTree.ParseError as err:
        raise ElementTree.ParseError(str(err))

400

def parse(self, source):
    """
    An equivalent of *ElementTree.parse()* that can protect from XML entities attacks. When
    protection is applied XML data are loaded and defused before building the ElementTree instance.

    Protection is applied when the defuse mode is 'always', or when it is 'remote'
    and the current URL of the instance is a remote one.

    :param source: a filename or file object containing XML data.
    :returns: an ElementTree instance.
    """
    if self.defuse == 'always' or self.defuse == 'remote' and is_remote_url(self._url):
        if hasattr(source, 'read'):
            text = source.read()
        else:
            # A filename: read its content here, because the data has to be
            # checked before being parsed (built-in open, binary mode).
            with open(source, 'rb') as xml_file:
                text = xml_file.read()

        # The data is wrapped twice: one stream for the check, one for the parse.
        if isinstance(text, bytes):
            self.defusing(BytesIO(text))
            return ElementTree.parse(BytesIO(text))
        else:
            self.defusing(StringIO(text))
            return ElementTree.parse(StringIO(text))
    else:
        return ElementTree.parse(source)

419

def iterparse(self, source, events=None):
    """
    An equivalent of *ElementTree.iterparse()* that can protect from XML entities attacks.
    When protection is applied the iterator yields pure-Python Element instances.

    Protection is applied when the defuse mode is 'always', or when it is 'remote'
    and the current URL of the instance is a remote one.

    :param source: a filename or file object containing XML data.
    :param events: a list of events to report back. If omitted, only "end" events are reported.
    """
    protected = self.defuse == 'always' or self.defuse == 'remote' and is_remote_url(self._url)
    if not protected:
        return ElementTree.iterparse(source, events)

    safe_parser = SafeXMLParser(target=PyElementTree.TreeBuilder())
    try:
        return PyElementTree.iterparse(source, events, safe_parser)
    except PyElementTree.ParseError as err:
        raise ElementTree.ParseError(str(err))

436

def fromstring(self, text):
    """
    An equivalent of *ElementTree.fromstring()* that can protect from XML entities attacks.

    Protection is applied when the defuse mode is 'always', or when it is 'remote'
    and the current URL of the instance is a remote one.

    :param text: a string containing XML data, either text or bytes.
    :returns: the root Element instance.
    """
    if self.defuse == 'always' or self.defuse == 'remote' and is_remote_url(self._url):
        # Wrap the data in the stream type matching its type, as parse() does:
        # a text stream cannot be built from bytes.
        if isinstance(text, bytes):
            self.defusing(BytesIO(text))
        else:
            self.defusing(StringIO(text))
    return ElementTree.fromstring(text)

447

def tostring(self, indent='', max_lines=None, spaces_for_tab=4, xml_declaration=False):
    """
    Generate a string representation of the XML resource.

    The root element and the namespace map of the resource are passed to
    ``etree_tostring()`` together with the formatting arguments, unchanged.
    """
    namespaces = self.get_namespaces()
    return etree_tostring(
        self._root, namespaces, indent, max_lines, spaces_for_tab, xml_declaration
    )

451

def copy(self, **kwargs):
    """
    Resource copy method. Change init parameters with keyword arguments.

    A new instance of the same class is built from the same source. The
    recognized keywords are 'base_url', 'defuse', 'timeout' and 'lazy'; other
    keywords are ignored. If the new instance has no text but this one has,
    the text is carried over.
    """
    init_args = {
        'base_url': self.base_url,
        'defuse': self.defuse,
        'timeout': self.timeout,
        'lazy': self._lazy,
    }
    for key in init_args:
        if key in kwargs:
            init_args[key] = kwargs[key]

    clone = type(self)(source=self.source, **init_args)
    if clone._text is None and self._text is not None:
        clone._text = self._text
    return clone

464

def open(self):
    """
    Return an opened resource reader object for the instance URL.

    :raises XMLSchemaValueError: if the resource has no URL associated.
    :raises XMLSchemaURLError: if the URL cannot be opened.
    """
    location = self._url
    if location is None:
        raise XMLSchemaValueError("can't open, the resource has no URL associated.")

    try:
        return urlopen(location, timeout=self.timeout)
    except URLError as err:
        raise XMLSchemaURLError(reason="cannot access to resource %r: %s" % (location, err.reason))

473

def load(self):
    """
    Load the XML text from the data source. If the data source is an Element
    the source XML text can't be retrieved.

    The data read from the URL is converted using UTF-8 first, with a fallback
    on ISO-8859-1 when that fails with a `UnicodeDecodeError`.

    :raises XMLSchemaOSError: if reading from the opened resource fails.
    """
    if self._url is None:
        return  # Created from Element or text source --> already loaded

    reader = self.open()
    try:
        data = reader.read()
    except (OSError, IOError) as err:
        raise XMLSchemaOSError("cannot load data from %r: %s" % (self._url, err))
    finally:
        reader.close()

    if PY3:
        try:
            self._text = data.decode('utf-8')
        except UnicodeDecodeError:
            self._text = data.decode('iso-8859-1')
    else:
        try:
            self._text = data.encode('utf-8')
        except UnicodeDecodeError:
            # Read the data again from the path part of the URL.
            with codecs.open(urlsplit(self._url).path, mode='rb', encoding='iso-8859-1') as f:
                self._text = f.read().encode('iso-8859-1')

498

def is_lazy(self):
    """Return the lazy setting of the XML resource (`True` if the resource is lazy)."""
    lazy_flag = self._lazy
    return lazy_flag

502

def is_loaded(self):
    """Return `True` if the XML text of the data source is loaded."""
    text_missing = self._text is None
    return not text_missing

506

def iter(self, tag=None):
    """
    XML resource tree elements lazy iterator.

    A non-lazy resource iterates its in-memory tree. A lazy resource parses its
    data again, from the URL if there is one, otherwise from the XML text: the
    elements are yielded on their 'start' event and cleared on their 'end' event.
    A lazy resource that has neither a URL nor a text (e.g. one created from an
    Element) has nothing to parse again, so its in-memory tree is iterated.

    :param tag: if provided, only the elements having this tag are yielded.
    """
    if not self._lazy or (self._url is None and self._text is None):
        for elem in self._root.iter(tag):
            yield elem
        return

    if self._url is not None:
        resource = urlopen(self._url, timeout=self.timeout)
    else:
        resource = StringIO(self._text)

    try:
        for event, elem in self.iterparse(resource, events=('start', 'end')):
            if event == 'end':
                elem.clear()
            elif tag is None or elem.tag == tag:
                yield elem
    finally:
        resource.close()

526

def iter_location_hints(self):
    """
    Yield schema location hints from the XML tree as ``(namespace, url)`` couples.

    For each element the namespaced hints come first: the attribute value is
    split on whitespace and read as namespace/URL pairs. Then each URL of the
    no-namespace hint attribute is yielded with an empty namespace.
    """
    for elem in self.iter():
        hints = elem.attrib.get(XSI_SCHEMA_LOCATION)
        if hints is not None:
            tokens = hints.split()
            # Even positions are namespaces, odd positions are their locations.
            for ns, url in zip(tokens[0::2], tokens[1::2]):
                yield ns, url

        nons_hints = elem.attrib.get(XSI_NONS_SCHEMA_LOCATION)
        if nons_hints is not None:
            for url in nons_hints.split():
                yield '', url

546

def get_namespaces(self):
    """
    Extracts namespaces with related prefixes from the XML resource. If a duplicate
    prefix declaration is encountered then adds the namespace using a different prefix,
    but only in the case if the namespace URI is not already mapped by another prefix.

    The declarations are read from the URL if the resource has one, otherwise from
    the XML text; as a last resort from the ``nsmap`` attribute of the tree elements.
    Parsing errors are ignored, so the returned map may be partial.

    :return: A dictionary for mapping namespace prefixes to full URI.
    """
    def update_nsmap(prefix, uri):
        # Plain registration: the prefix is free and it's not a default
        # namespace declared for a tree whose root is unqualified.
        if prefix not in nsmap and (prefix or not local_root):
            nsmap[prefix] = uri
            return

        # The URI is already mapped by another prefix --> nothing to add.
        for mapped_uri in nsmap.values():
            if uri == mapped_uri:
                return

        if not prefix:
            # Derive a prefix from the trailing word of the URI, if any.
            try:
                prefix = re.search(r'(\w+)$', uri.strip()).group()
            except AttributeError:
                return

        # Make the prefix unique, incrementing or appending a numeric suffix.
        while prefix in nsmap:
            suffix = re.search(r'(\d+)$', prefix)
            if suffix:
                number = int(suffix.group()) + 1
                prefix = prefix[:suffix.start()] + str(number)
            else:
                prefix += '2'
        nsmap[prefix] = uri

    local_root = self.root.tag[0] != '{'
    nsmap = {}

    if self._url is not None:
        resource = self.open()
        try:
            for _, ns_decl in self.iterparse(resource, events=('start-ns',)):
                update_nsmap(*ns_decl)
        except (ElementTree.ParseError, PyElementTree.ParseError, UnicodeEncodeError):
            pass
        finally:
            resource.close()
    elif isinstance(self._text, string_base_type):
        try:
            for _, ns_decl in self.iterparse(StringIO(self._text), events=('start-ns',)):
                update_nsmap(*ns_decl)
        except (ElementTree.ParseError, PyElementTree.ParseError, UnicodeEncodeError):
            pass
    else:
        # Warning: can extracts namespace information only from lxml etree structures
        try:
            for elem in self._root.iter():
                for key, value in elem.nsmap.items():
                    update_nsmap(key if key is not None else '', value)
        except (AttributeError, TypeError):
            pass  # Not an lxml's tree or element

    return nsmap

Copyright(C) 2019 Arno-Can Uestuensoez @Ingenieurbuero Arno-Can Uestuensoez	https://arnocan.wordpress.com
Generated by Epydoc 4.0.4 / Python-3.8 / fedora27 on Fri Dec 13 15:25:35 2019	http://epydoc.sourceforge.net

Source Code for Module xmlschema_acue.resources