Package xmlschema_acue ::
Module resources
1
2
3
4
5
6
7
8
9
10
11 import os.path
12 import re
13 import codecs
14
15 from xmlschema_acue.compat import (
16 PY3, StringIO, BytesIO, string_base_type, urlopen, urlsplit, urljoin, urlunsplit,
17 pathname2url, URLError, uses_relative
18 )
19 from xmlschema_acue.exceptions import XMLSchemaTypeError, XMLSchemaValueError, XMLSchemaURLError, XMLSchemaOSError
20 from xmlschema_acue.qnames import XSI_SCHEMA_LOCATION, XSI_NONS_SCHEMA_LOCATION
21 from xmlschema_acue.helpers import get_namespace
22 from xmlschema_acue.etree import ElementTree, PyElementTree, SafeXMLParser, is_etree_element, etree_tostring
23
24
25 DEFUSE_MODES = ('always', 'remote', 'never')
def is_remote_url(url):
    """
    Tells if an URL refers to a remote resource.

    :param url: an URL string or `None`.
    :return: `True` if the URL is not `None` and its scheme is neither empty \
    nor 'file', `False` otherwise.
    """
    if url is None:
        return False
    scheme = urlsplit(url).scheme
    return scheme not in ('', 'file')
30
33 return os.path.isdir(urlsplit(url).path)
34
37 return os.path.isfile(urlsplit(url).path)
38
41 """
42 Returns a normalized URL doing a join with a base URL. URL scheme defaults to 'file' and
43 backslashes are replaced with slashes. For file paths the os.path.join is used instead of
44 urljoin.
45
46 :param url: a relative or absolute URL.
47 :param base_url: the reference base URL for construct the normalized URL from the argument. \
48 For compatibility between "os.path.join" and "urljoin" a trailing '/' is added to not empty paths.
49 :param keep_relative: if set to `True` keeps relative file paths, which would not strictly \
50 conformant to URL format specification.
51 :return: A normalized URL.
52 """
53 def add_trailing_slash(r):
54 return urlunsplit((r[0], r[1], r[2] + '/' if r[2] and r[2][-1] != '/' else r[2], r[3], r[4]))
55
56 if base_url is not None:
57 base_url = base_url.replace('\\', '/')
58 base_url_parts = urlsplit(base_url)
59 base_url = add_trailing_slash(base_url_parts)
60 if base_url_parts.scheme not in uses_relative:
61 base_url_parts = urlsplit('file:///{}'.format(base_url))
62 else:
63 base_url_parts = urlsplit(base_url)
64
65 if base_url_parts.scheme not in ('', 'file'):
66 url = urljoin(base_url, url)
67 else:
68
69 url_parts = urlsplit(url)
70 if url_parts.scheme not in ('', 'file'):
71 url = urljoin(base_url, url)
72 elif not url_parts.netloc or base_url_parts.netloc == url_parts.netloc:
73
74 url = urlunsplit((
75 '',
76 base_url_parts.netloc,
77 os.path.normpath(os.path.join(base_url_parts.path, url_parts.path)),
78 url_parts.query,
79 url_parts.fragment,
80 ))
81
82 url = url.replace('\\', '/')
83 url_parts = urlsplit(url, scheme='file')
84 if url_parts.scheme not in uses_relative:
85 return 'file:///{}'.format(url_parts.geturl())
86 elif url_parts.scheme != 'file':
87 return urlunsplit((
88 url_parts.scheme,
89 url_parts.netloc,
90 pathname2url(url_parts.path),
91 url_parts.query,
92 url_parts.fragment,
93 ))
94 elif os.path.isabs(url_parts.path):
95 return url_parts.geturl()
96 elif keep_relative:
97
98 return 'file:{}'.format(urlunsplit(('',) + url_parts[1:]))
99 else:
100 return urlunsplit((
101 url_parts.scheme,
102 url_parts.netloc,
103 os.path.abspath(url_parts.path),
104 url_parts.query,
105 url_parts.fragment,
106 ))
107
110 """
111 Fetch a resource trying to accessing it. If the resource is accessible
112 returns the URL, otherwise raises an error (XMLSchemaURLError).
113
114 :param location: an URL or a file path.
115 :param base_url: reference base URL for normalizing local and relative URLs.
116 :param timeout: the timeout in seconds for the connection attempt in case of remote data.
117 :return: a normalized URL.
118 """
119 if not location:
120 raise XMLSchemaValueError("'location' argument must contains a not empty string.")
121
122 url = normalize_url(location, base_url)
123 try:
124 resource = urlopen(url, timeout=timeout)
125 except URLError as err:
126
127 url = normalize_url(location)
128 try:
129 resource = urlopen(url, timeout=timeout)
130 except URLError:
131 raise XMLSchemaURLError(reason=err.reason)
132 else:
133 resource.close()
134 return url
135 else:
136 resource.close()
137 return url
138
141 """
142 Fetches the schema URL for the source's root of an XML data source and a list of location hints.
143 If an accessible schema location is not found raises a ValueError.
144
145 :param source: an Element or an Element Tree with XML data or an URL or a file-like object.
146 :param locations: a dictionary or dictionary items with Schema location hints.
147 :param resource_options: keyword arguments for providing :class:`XMLResource` class init options.
148 :return: A tuple with the URL referring to the first reachable schema resource, a list \
149 of dictionary items with normalized location hints.
150 """
151 base_url = resource_options.pop('base_url', None)
152 timeout = resource_options.pop('timeout', 30)
153 resource = XMLResource(source, base_url, timeout=timeout, **resource_options)
154
155 base_url = resource.base_url
156 namespace = resource.namespace
157 locations = resource.get_locations(locations)
158 for ns, url in filter(lambda x: x[0] == namespace, locations):
159 try:
160 return fetch_resource(url, base_url, timeout), locations
161 except XMLSchemaURLError:
162 pass
163 raise XMLSchemaValueError("not found a schema for XML data resource %r (namespace=%r)." % (source, namespace))
164
165
def fetch_schema(source, locations=None, **resource_options):
    """
    Fetches the schema URL for the root of an XML data source.

    It's a shortcut for :func:`fetch_schema_locations` that keeps only the schema
    URL and discards the list of location hints. If an accessible schema location
    is not found the error raised by :func:`fetch_schema_locations` is propagated.

    :param source: an Element or an Element Tree with XML data or an URL or a file-like object.
    :param locations: a dictionary or dictionary items with schema location hints.
    :param resource_options: keyword arguments for providing :class:`XMLResource` class init options.
    :return: an URL referring to a reachable schema resource.
    """
    result = fetch_schema_locations(source, locations, **resource_options)
    schema_url = result[0]
    return schema_url
177
180 """
181 Extracts namespaces with related prefixes from the XML data source. If the source is
182 an lxml's ElementTree/Element returns the nsmap attribute of the root. If a duplicate
183 prefix declaration is encountered then adds the namespace using a different prefix,
184 but only in the case if the namespace URI is not already mapped by another prefix.
185
186 :param source: a string containing the XML document or file path or an url \
187 or a file like object or an ElementTree or Element.
188 :param resource_options: keyword arguments for providing :class:`XMLResource` init options.
189 :return: A dictionary for mapping namespace prefixes to full URI.
190 """
191 timeout = resource_options.pop('timeout', 30)
192 return XMLResource(source, timeout=timeout, **resource_options).get_namespaces()
193
196 """
197 Load XML data source into an Element tree, returning the root Element, the XML text and an
198 url, if available. Usable for XML data files of small or medium sizes, as XSD schemas.
199
200 :param source: an URL, a filename path or a file-like object.
201 :param element_only: if True the function returns only the root Element of the tree.
202 :param resource_options: keyword arguments for providing :class:`XMLResource` init options.
203 :return: a tuple with three items (root Element, XML text and XML URL) or \
204 only the root Element if 'element_only' argument is True.
205 """
206 lazy = resource_options.pop('lazy', False)
207 source = XMLResource(source, lazy=lazy, **resource_options)
208 if element_only:
209 return source.root
210 else:
211 source.load()
212 return source.root, source.text, source.url
213
216 """
217 XML resource reader based on ElementTree and urllib.
218
219 :param source: a string containing the XML document or file path or an URL or a file like \
220 object or an ElementTree or an Element.
221 :param base_url: is an optional base URL, used for the normalization of relative paths when \
222 the URL of the resource can't be obtained from the source argument.
223 :param defuse: set the usage of SafeXMLParser for XML data. Can be 'always', 'remote' or 'never'. \
224 Default is 'remote' that uses the defusedxml only when loading remote data.
225 :param timeout: the timeout in seconds for the connection attempt in case of remote data.
226 :param lazy: if set to `False` the source is fully loaded into and processed from memory. Default is `True`.
227 """
228 - def __init__(self, source, base_url=None, defuse='remote', timeout=300, lazy=True):
238
240
241 return unicode(self).encode("utf-8")
242
245
246 if PY3:
247 __str__ = __unicode__
248
250 if self._root is None:
251 return u'%s()' % self.__class__.__name__
252 elif self._url is None:
253 return u'%s(tag=%r)' % (self.__class__.__name__, self._root.tag)
254 else:
255 return u'%s(tag=%r, basename=%r)' % (
256 self.__class__.__name__, self._root.tag, os.path.basename(self._url)
257 )
258
def __setattr__(self, name, value):
    """
    Validates the public attributes before storing them.

    Setting 'source' processes the new source with `_fromsource` and stores the
    result into `_root`, `_document`, `_text` and `_url`. The attributes 'defuse',
    'timeout' and 'lazy' are checked and an XMLSchemaValueError is raised for an
    invalid value. Any other attribute is stored without checks.
    """
    if name == 'source':
        loaded = self._fromsource(value)
        self._root, self._document, self._text, self._url = loaded
    elif name == 'defuse':
        if value not in DEFUSE_MODES:
            raise XMLSchemaValueError(u"'defuse' attribute: {!r} is not a defuse mode.".format(value))
    elif name == 'timeout':
        if not isinstance(value, int) or value <= 0:
            raise XMLSchemaValueError(u"'timeout' attribute must be a positive integer: {!r}".format(value))
    elif name == 'lazy':
        if not isinstance(value, bool):
            raise XMLSchemaValueError(u"'lazy' attribute must be a boolean: {!r}".format(value))

    super(XMLResource, self).__setattr__(name, value)
269
def _fromsource(self, source):
    """
    Processes the *source* argument, loading the XML data when it's needed.

    The source is checked in this order: an Element, a string (XML text or,
    if it's not parsable and has no newlines, an URL or a path), a StringIO
    instance, another file-like object, an ElementTree. When the instance is
    lazy only the first 'start' event is consumed in order to get the root.
    During the parsing `self._url` is temporarily replaced with the URL of the
    data that is being parsed, because `parse` and `fromstring` check it for
    deciding if the XML data has to be defused, then the previous value is restored.

    :param source: the value assigned to the 'source' attribute.
    :return: a tuple with four items (root Element, ElementTree document, XML text, URL), \
    where the items that are not available are `None`.
    """
    lazy_mode = self._lazy
    url = None

    if is_etree_element(source):
        # Already an Element: nothing to load
        return source, None, None, None

    elif isinstance(source, string_base_type):
        saved_url, self._url = self._url, None
        try:
            if lazy_mode:
                # Checks if the string contains a parsable XML root
                for _, root in self.iterparse(StringIO(source), events=('start',)):
                    return root, None, source, None
            else:
                return self.fromstring(source), None, source, None
        except (ElementTree.ParseError, PyElementTree.ParseError, UnicodeEncodeError):
            # A string with newlines can't be an URL or a path: it's invalid XML text
            if '\n' in source:
                raise
        finally:
            self._url = saved_url

        # Not parsable as XML text: a single line string is considered an URL or a path
        if '\n' not in source:
            url = normalize_url(source)

    elif isinstance(source, StringIO):
        saved_url, self._url = self._url, None
        try:
            if lazy_mode:
                for _, root in self.iterparse(source, events=('start',)):
                    return root, None, source.getvalue(), None
            else:
                tree = self.parse(source)
                return tree.getroot(), tree, source.getvalue(), None
        finally:
            self._url = saved_url

    elif hasattr(source, 'read'):
        # A file-like object: its URL is taken from the attribute 'url' or 'name'
        try:
            if hasattr(source, 'url'):
                url = source.url
            else:
                url = normalize_url(source.name)
        except AttributeError:
            pass
        else:
            saved_url, self._url = self._url, url
            try:
                if lazy_mode:
                    for _, root in self.iterparse(source, events=('start',)):
                        return root, None, None, url
                else:
                    tree = self.parse(source)
                    return tree.getroot(), tree, None, url
            finally:
                self._url = saved_url

    else:
        # At last tries with an ElementTree object
        try:
            root = source.getroot()
        except (AttributeError, TypeError):
            pass
        else:
            if is_etree_element(root):
                return root, source, None, None

    if url is None:
        raise XMLSchemaTypeError(
            "wrong type %r for 'source' attribute: an ElementTree object or an Element instance or a "
            "string containing XML data or an URL or a file-like object is required." % type(source)
        )

    # Loads the XML data from the URL
    resource = urlopen(url, timeout=self.timeout)
    saved_url, self._url = self._url, url
    try:
        if lazy_mode:
            for _, root in self.iterparse(resource, events=('start',)):
                return root, None, None, url
        else:
            tree = self.parse(resource)
            root = tree.getroot()
            return root, tree, None, url
    finally:
        self._url = saved_url
        resource.close()
352
353 @property
355 """The XML tree root Element."""
356 return self._root
357
358 @property
360 """
361 The ElementTree document, `None` if the instance is lazy or is not created
362 from another document or from an URL.
363 """
364 return self._document
365
366 @property
368 """The XML text source, `None` if it's not available."""
369 return self._text
370
371 @property
373 """The source URL, `None` if the instance is created from an Element tree or from a string."""
374 return self._url
375
376 @property
378 """The base URL for completing relative locations."""
379 return os.path.dirname(self._url) if self._url else self._base_url
380
381 @property
383 """The namespace of the XML document."""
384 return get_namespace(self._root.tag) if self._root is not None else None
385
386 @staticmethod
400
def parse(self, source):
    """
    An equivalent of *ElementTree.parse()* that can protect from XML entities attacks. When
    protection is applied XML data are loaded and defused before building the ElementTree instance.

    Protection is applied when the defuse mode is 'always', or when it is 'remote' and
    the instance URL is a remote one. In this case the whole XML data is read into memory,
    checked with `self.defusing` and then parsed from a fresh in-memory buffer.

    :param source: a filename or file object containing XML data.
    :returns: an ElementTree instance.
    """
    protect = self.defuse == 'always' or (self.defuse == 'remote' and is_remote_url(self._url))
    if not protect:
        return ElementTree.parse(source)

    if hasattr(source, 'read'):
        text = source.read()
    else:
        # A filename: ElementTree.parse() accepts it, so the protected path
        # has to accept it too instead of failing on the missing read().
        with open(source, 'rb') as xml_file:
            text = xml_file.read()

    # The data is wrapped twice because each buffer is consumed by its reader.
    buffer_class = BytesIO if isinstance(text, bytes) else StringIO
    self.defusing(buffer_class(text))
    return ElementTree.parse(buffer_class(text))
419
436
def fromstring(self, text):
    """
    An equivalent of *ElementTree.fromstring()* that can protect from XML entities attacks.

    The text is checked with `self.defusing` before the parsing when the defuse mode
    is 'always', or when it is 'remote' and the instance URL is a remote one.

    :param text: a string containing XML data.
    :returns: the root Element instance.
    """
    mode = self.defuse
    if mode == 'always' or (mode == 'remote' and is_remote_url(self._url)):
        self.defusing(StringIO(text))
    return ElementTree.fromstring(text)
447
def tostring(self, indent='', max_lines=None, spaces_for_tab=4, xml_declaration=False):
    """
    Generates a string representation of the XML resource.

    The root Element and the namespace map of the resource are passed to
    `etree_tostring`, followed by the arguments of this method, unchanged.
    """
    root = self._root
    namespaces = self.get_namespaces()
    return etree_tostring(root, namespaces, indent, max_lines, spaces_for_tab, xml_declaration)
451
def copy(self, **kwargs):
    """
    Resource copy method. Change init parameters with keyword arguments.

    A new instance of the same class is built from the same source. The keyword
    arguments 'base_url', 'defuse', 'timeout' and 'lazy' override the values of
    this instance, other keyword arguments are ignored. If the new instance has
    no XML text but this one has it, then the text is shared with the copy.

    :return: the new resource instance.
    """
    init_options = {
        'source': self.source,
        'base_url': kwargs.get('base_url', self.base_url),
        'defuse': kwargs.get('defuse', self.defuse),
        'timeout': kwargs.get('timeout', self.timeout),
        'lazy': kwargs.get('lazy', self._lazy),
    }
    clone = type(self)(**init_options)

    text_missing = clone._text is None
    if text_missing and self._text is not None:
        clone._text = self._text
    return clone
464
def open(self):
    """
    Returns an opened resource reader object for the instance URL.

    :raises: XMLSchemaValueError if the resource has no URL, XMLSchemaURLError \
    if the URL can't be opened.
    """
    if self._url is None:
        raise XMLSchemaValueError("can't open, the resource has no URL associated.")

    try:
        reader = urlopen(self._url, timeout=self.timeout)
    except URLError as err:
        raise XMLSchemaURLError(reason="cannot access to resource %r: %s" % (self._url, err.reason))
    else:
        return reader
473
def load(self):
    """
    Loads the XML text from the data source. Nothing is done if the resource
    has no URL, as happens when the data source is an Element, because in
    this case the source XML text can't be retrieved.

    The data is read from the URL and stored in `_text`, using UTF-8 with a
    fallback to ISO-8859-1 when an UnicodeDecodeError is raised.

    :raises: XMLSchemaOSError if the read of the data fails.
    """
    if self._url is None:
        return

    resource = self.open()
    try:
        raw_data = resource.read()
    except (OSError, IOError) as err:
        raise XMLSchemaOSError("cannot load data from %r: %s" % (self._url, err))
    finally:
        resource.close()

    if PY3:
        try:
            self._text = raw_data.decode('utf-8')
        except UnicodeDecodeError:
            self._text = raw_data.decode('iso-8859-1')
    else:
        try:
            self._text = raw_data.encode('utf-8')
        except UnicodeDecodeError:
            # Python 2 fallback: reads again the data from the path of the URL
            with codecs.open(urlsplit(self._url).path, mode='rb', encoding='iso-8859-1') as f:
                self._text = f.read().encode('iso-8859-1')
498
500 """Gets `True` the XML resource is lazy."""
501 return self._lazy
502
504 """Gets `True` the XML text of the data source is loaded."""
505 return self._text is not None
506
def iter(self, tag=None):
    """
    XML resource tree elements lazy iterator.

    If the resource is not lazy the elements are taken from the tree in memory.
    A lazy resource is parsed again incrementally, from its URL or from its XML
    text: the elements are yielded at their 'start' event and are cleared at
    their 'end' event. A lazy resource that has neither an URL nor XML text, as
    happens when it's built from an Element or an ElementTree, has no data to
    parse again, so it's iterated from the tree in memory too.

    :param tag: if provided yields only the elements with this tag, otherwise \
    yields all the elements.
    """
    if not self._lazy or (self._url is None and self._text is None):
        for elem in self._root.iter(tag):
            yield elem
        return

    if self._url is not None:
        resource = urlopen(self._url, timeout=self.timeout)
    else:
        resource = StringIO(self._text)

    try:
        for event, elem in self.iterparse(resource, events=('start', 'end')):
            if event == 'end':
                # Releases the content of the elements already processed
                elem.clear()
            elif tag is None or elem.tag == tag:
                yield elem
    finally:
        resource.close()
526
def iter_location_hints(self):
    """
    Yields schema location hints from the XML tree.

    For each element of the resource the couples (namespace, URL) of the
    attribute xsi:schemaLocation are yielded first, then the URLs of the
    attribute xsi:noNamespaceSchemaLocation, coupled with an empty namespace.
    """
    for elem in self.iter():
        attributes = elem.attrib

        if XSI_SCHEMA_LOCATION in attributes:
            tokens = attributes[XSI_SCHEMA_LOCATION].split()
            # Tokens are alternated namespaces and URLs, an odd trailing token is skipped
            for ns, url in zip(tokens[0::2], tokens[1::2]):
                yield ns, url

        if XSI_NONS_SCHEMA_LOCATION in attributes:
            for url in attributes[XSI_NONS_SCHEMA_LOCATION].split():
                yield '', url
546
def get_namespaces(self):
    """
    Extracts namespaces with related prefixes from the XML resource. If a duplicate
    prefix declaration is encountered then adds the namespace using a different prefix,
    but only in the case if the namespace URI is not already mapped by another prefix.

    The declarations are collected parsing the data from the URL of the resource or,
    if there is no URL, from its XML text. Without both, the 'nsmap' attribute of the
    elements of the tree is used, if available. Parsing errors are ignored, so the map
    can be partial.

    :return: A dictionary for mapping namespace prefixes to full URI.
    """
    def update_nsmap(prefix, uri):
        # An empty prefix is mapped directly only if the root has a namespace
        if prefix not in nsmap and (prefix or not local_root):
            nsmap[prefix] = uri
            return
        if uri in nsmap.values():
            return

        if not prefix:
            # Derives a prefix from the last word of the namespace URI
            try:
                prefix = re.search(r'(\w+)$', uri.strip()).group()
            except AttributeError:
                return

        # Adds or increments a numeric suffix until the prefix is unused
        while prefix in nsmap:
            suffix = re.search(r'(\d+)$', prefix)
            if suffix:
                prefix = prefix[:suffix.start()] + str(int(suffix.group()) + 1)
            else:
                prefix += '2'
        nsmap[prefix] = uri

    local_root = self.root.tag[0] != '{'
    nsmap = {}

    if self._url is not None:
        resource = self.open()
        try:
            for _, declaration in self.iterparse(resource, events=('start-ns',)):
                update_nsmap(*declaration)
        except (ElementTree.ParseError, PyElementTree.ParseError, UnicodeEncodeError):
            pass
        finally:
            resource.close()
    elif isinstance(self._text, string_base_type):
        try:
            for _, declaration in self.iterparse(StringIO(self._text), events=('start-ns',)):
                update_nsmap(*declaration)
        except (ElementTree.ParseError, PyElementTree.ParseError, UnicodeEncodeError):
            pass
    else:
        # Works only for the elements that have the 'nsmap' attribute
        try:
            for elem in self._root.iter():
                for key, value in elem.nsmap.items():
                    update_nsmap('' if key is None else key, value)
        except (AttributeError, TypeError):
            pass

    return nsmap
602
604 """
605 Returns a list of schema location hints. The locations are normalized using the
606 base URL of the instance. The *locations* argument can be a dictionary or a list
607 of namespace resources, that are inserted before the schema location hints extracted
608 from the XML resource.
609 """
610 base_url = self.base_url
611 location_hints = []
612 if locations is not None:
613 try:
614 for ns, value in locations.items():
615 if isinstance(value, list):
616 location_hints.extend([(ns, normalize_url(url, base_url)) for url in value])
617 else:
618 location_hints.append((ns, normalize_url(value, base_url)))
619 except AttributeError:
620 location_hints.extend([(ns, normalize_url(url, base_url)) for ns, url in locations])
621
622 location_hints.extend([(ns, normalize_url(url, base_url)) for ns, url in self.iter_location_hints()])
623 return location_hints
624