Package xmlschema_acue ::
Module regex
1
2
3
4
5
6
7
8
9
10
11 """
12 Parse and translate XML regular expressions to Python regex syntax.
13 """
14 from __future__ import unicode_literals
15 from __future__ import absolute_import
16 import re
17 from sys import maxunicode
18
19 from xmlschema_acue.compat import PY3, unicode_type, string_base_type, MutableSet
20 from xmlschema_acue.exceptions import XMLSchemaValueError, XMLSchemaRegexError
21 from xmlschema_acue.codepoints import UnicodeSubset
22 from xmlschema_acue.codepoints import UNICODE_BLOCKS
23 from xmlschema_acue.codepoints import UNICODE_CATEGORIES
24
25 _RE_QUANTIFIER = re.compile(r'{\d+(,(\d+)?)?}')
26 _RE_FORBIDDEN_ESCAPES = re.compile(
27 r'(?<!\\)\\(U[0-9a-fA-F]{8}|u[0-9a-fA-F]{4}|x[0-9a-fA-F]{2}|o{\d+}|\d+|A|Z|z|B|b|o)'
28 )
29
# One lookup table holding the entries of both UNICODE_CATEGORIES and
# UNICODE_BLOCKS. The copy keeps UNICODE_CATEGORIES itself unmodified; on a key
# clash the UNICODE_BLOCKS entry wins because it is merged last.
_UNICODE_SUBSETS = UNICODE_CATEGORIES.copy()
_UNICODE_SUBSETS.update(UNICODE_BLOCKS)
32
33
39
40
# Character-class body (without the enclosing brackets) substituted for the
# \i and \I escapes by the translator.
I_SHORTCUT_REPLACE = (
    ":A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF"
    "\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD"
)

# Character-class body (without the enclosing brackets) substituted for the
# \c and \C escapes by the translator.
C_SHORTCUT_REPLACE = (
    "-.0-9:A-Z_a-z\u00B7\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u037D\u037F-\u1FFF\u200C-"
    "\u200D\u203F\u2040\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD"
)
50
# Code point subsets backing the multi-character escapes.
S_SHORTCUT_SET = UnicodeSubset(' \n\t\r')
D_SHORTCUT_SET = UnicodeSubset('0-9')
I_SHORTCUT_SET = UnicodeSubset(I_SHORTCUT_REPLACE)
C_SHORTCUT_SET = UnicodeSubset(C_SHORTCUT_REPLACE)

# The \w subset is assembled from the code points of the 'P', 'Z' and 'C'
# categories and stored straight into the private `_code_points` attribute.
# Entries are either plain integers or sequences whose first item is used as
# the sort key.
# NOTE(review): these categories look like the complement of \w - confirm how
# the consumers of W_SHORTCUT_SET apply the negation.
W_SHORTCUT_SET = UnicodeSubset()
W_SHORTCUT_SET._code_points = sorted(
    UNICODE_CATEGORIES['P'].code_points + UNICODE_CATEGORIES['Z'].code_points +
    UNICODE_CATEGORIES['C'].code_points, key=lambda x: x if isinstance(x, int) else x[0]
)
60
61
# Maps each two-character escape sequence either to the single character it
# stands for or to the code point subset it refers to.
CHARACTER_ESCAPES = {
    # Single-character escapes
    '\\n': '\n',
    '\\r': '\r',
    '\\t': '\t',
    '\\|': '|',
    '\\.': '.',
    '\\-': '-',
    '\\^': '^',
    '\\?': '?',
    '\\*': '*',
    '\\+': '+',
    '\\{': '{',
    '\\}': '}',
    '\\(': '(',
    '\\)': ')',
    '\\[': '[',
    '\\]': ']',
    '\\\\': '\\',

    # Multi-character escapes: the lowercase and uppercase forms share the
    # same subset object.
    # NOTE(review): the negation for the uppercase forms is presumably applied
    # by whoever reads this table - confirm against the character group code.
    '\\s': S_SHORTCUT_SET,
    '\\S': S_SHORTCUT_SET,
    '\\d': D_SHORTCUT_SET,
    '\\D': D_SHORTCUT_SET,
    '\\i': I_SHORTCUT_SET,
    '\\I': I_SHORTCUT_SET,
    '\\c': C_SHORTCUT_SET,
    '\\C': C_SHORTCUT_SET,
    '\\w': W_SHORTCUT_SET,
    '\\W': W_SHORTCUT_SET,
}
94
95
97 """
98 A set subclass to represent XML Schema regex character groups.
99 """
100 _re_char_group = re.compile(r'(?<!.-)(\\[nrt|.\-^?*+{}()\]sSdDiIcCwW]|\\[pP]{[a-zA-Z\-0-9]+})')
101 _re_unicode_ref = re.compile(r'\\([pP]){([\w\d-]+)}')
102
108
110 return '<%s at %d>' % (self.__class__.__name__, id(self))
111
113 return unicode(self).encode("utf-8")
114
124
125 if PY3:
126 __str__ = __unicode__
127
129 if self.negative:
130 return ord(char) not in self.negative or ord(char) in self.positive
131 return ord(char) in self.positive
132
134 if self.negative:
135 return (
136 cp for cp in range(maxunicode + 1)
137 if cp in self.positive or cp not in self.negative
138 )
139 return iter(sorted(self.positive))
140
142 return len(self.positive) + len(self.negative)
143
144
146 if self.negative:
147 self.positive |= (other.negative - self.negative)
148 if other.negative:
149 self.negative.clear()
150 elif other.negative:
151 self.positive &= other.negative
152 self.positive -= other.positive
153 return self
154
175
196
198 self.positive.clear()
199 self.negative.clear()
200
202 self.positive, self.negative = self.negative, self.positive
203
204
def parse_character_class(xml_regex, class_pos):
    """
    Parses a character class of an XML Schema regular expression.

    :param xml_regex: the source XML Schema regular expression.
    :param class_pos: the position of the character class in the source string, \
    must coincide with a '[' character.
    :return: an `XsdRegexCharGroup` instance and the position of the ']' character \
    that closes the character class (the caller has to step past it).
    :raises XMLSchemaRegexError: if `class_pos` is not on a '[', if the class is \
    empty, if it contains an unescaped '[' or if a subtraction is not followed by ']'.
    :raises IndexError: if the source string ends before the class is closed.
    """
    if xml_regex[class_pos] != '[':
        raise XMLSchemaRegexError('not a character class at position %d: %r' % (class_pos, xml_regex))

    # A leading '^' negates the group and is not part of its content.
    pos = class_pos + 1
    negative = xml_regex[pos] == '^'
    if negative:
        pos += 1

    group_pos = pos
    while True:
        if xml_regex[pos] == '[':
            raise XMLSchemaRegexError("'[' is invalid in a character class: %r" % xml_regex)
        elif xml_regex[pos] == '\\':
            # Skip the escaped character so that '\]' and '\[' do not end the scan.
            pos += 2
        elif xml_regex[pos] == ']' or xml_regex[pos:pos + 2] == '-[':
            if pos == group_pos:
                raise XMLSchemaRegexError("empty character class at position %d: %r" % (class_pos, xml_regex))
            char_group = XsdRegexCharGroup(xml_regex[group_pos:pos])
            if negative:
                char_group.complement()
            break
        else:
            pos += 1

    if xml_regex[pos] != ']':
        # The scan stopped on '-[': parse the subtracted class recursively.
        pos += 1
        subtracted_group, pos = parse_character_class(xml_regex, pos)
        # The recursive call returns the index of the inner ']': the next
        # character has to be the ']' that closes the outer class.
        pos += 1
        if xml_regex[pos] != ']':
            raise XMLSchemaRegexError("unterminated character group at position %d: %r" % (class_pos, xml_regex))
        char_group -= subtracted_group

    return char_group, pos
250
251
# NOTE(review): the `def` line of this function was lost in the listing this file
# was recovered from. The parameter name is taken from the body; the function
# name follows upstream xmlschema - confirm against the importing modules.
def get_python_regex(xml_regex):
    """
    Translates an XML regex expression to a Python compatible expression.

    The translated pattern is wrapped as '^(...)$'. The characters '^' and '$' of
    the source are emitted escaped and '.' is emitted as a class that excludes
    carriage return and line feed.

    :param xml_regex: the source XML Schema regular expression.
    :return: a string containing the translated pattern.
    :raises XMLSchemaRegexError: for forbidden escapes, misplaced or doubled \
    quantifiers, '(?' extensions, unbalanced parentheses, a stray ']', unterminated \
    character groups and truncated unicode block escapes.
    """
    regex = ['^(']
    pos = 0
    xml_regex_len = len(xml_regex)
    nested_groups = 0

    match = _RE_FORBIDDEN_ESCAPES.search(xml_regex)
    if match:
        raise XMLSchemaRegexError(
            "not allowed escape sequence %r at position %d: %r" % (match.group(), match.span()[0], xml_regex)
        )

    while pos < xml_regex_len:
        ch = xml_regex[pos]
        if ch == '.':
            regex.append('[^\r\n]')
        elif ch in ('^', '$'):
            regex.append(r'\%s' % ch)
        elif ch == '[':
            try:
                # On return `pos` is on the closing ']', stepped over at loop end.
                char_group, pos = parse_character_class(xml_regex, pos)
            except IndexError:
                raise XMLSchemaRegexError(
                    "unterminated character group at position %d: %r" % (pos, xml_regex)
                )
            else:
                char_group_repr = unicode_type(char_group)
                if char_group_repr == '[^]':
                    # '[^]' is replaced by a class that matches any character.
                    regex.append(r'[\w\W]')
                elif char_group_repr == '[]':
                    # '[]' is replaced by a class that matches no character.
                    regex.append(r'[^\w\W]')
                else:
                    regex.append(char_group_repr)

        elif ch == '{':
            if pos == 0:
                raise XMLSchemaRegexError("unexpected quantifier %r at position %d: %r" % (ch, pos, xml_regex))
            # Match in place instead of slicing the source at every quantifier.
            match = _RE_QUANTIFIER.match(xml_regex, pos)
            if match is None:
                raise XMLSchemaRegexError("invalid quantifier %r at position %d: %r" % (ch, pos, xml_regex))
            regex.append(match.group())
            pos += len(match.group())
            if pos < xml_regex_len and xml_regex[pos] in ('?', '+', '*'):
                raise XMLSchemaRegexError(
                    "unexpected meta character %r at position %d: %r" % (xml_regex[pos], pos, xml_regex)
                )
            # `pos` is already past the quantifier: skip the increment below.
            continue

        elif ch == '(':
            if xml_regex[pos:pos + 2] == '(?':
                raise XMLSchemaRegexError("'(?...)' extension notation is not allowed: %r" % xml_regex)
            nested_groups += 1
            regex.append(ch)
        elif ch == ']':
            raise XMLSchemaRegexError("unexpected meta character %r at position %d: %r" % (ch, pos, xml_regex))
        elif ch == ')':
            if nested_groups == 0:
                raise XMLSchemaRegexError("unbalanced parenthesis ')' at position %d: %r" % (pos, xml_regex))
            nested_groups -= 1
            regex.append(ch)
        elif ch in ('?', '+', '*'):
            if pos == 0:
                raise XMLSchemaRegexError("unexpected quantifier %r at position %d: %r" % (ch, pos, xml_regex))
            elif pos < xml_regex_len - 1 and xml_regex[pos+1] in ('?', '+', '*', '{'):
                # A quantifier directly followed by another one is rejected.
                raise XMLSchemaRegexError(
                    "unexpected meta character %r at position %d: %r" % (xml_regex[pos+1], pos+1, xml_regex)
                )
            regex.append(ch)
        elif ch == '\\':
            pos += 1
            if pos >= xml_regex_len:
                # A trailing backslash is copied as is; the resulting pattern
                # ends with an escaped ')'.
                regex.append('\\')
            elif xml_regex[pos] == 'i':
                regex.append('[%s]' % I_SHORTCUT_REPLACE)
            elif xml_regex[pos] == 'I':
                regex.append('[^%s]' % I_SHORTCUT_REPLACE)
            elif xml_regex[pos] == 'c':
                regex.append('[%s]' % C_SHORTCUT_REPLACE)
            elif xml_regex[pos] == 'C':
                regex.append('[^%s]' % C_SHORTCUT_REPLACE)
            elif xml_regex[pos] in 'pP':
                block_pos = pos - 1  # position of the backslash
                try:
                    if xml_regex[pos + 1] != '{':
                        raise XMLSchemaValueError("a '{' expected, found %r." % xml_regex[pos + 1])
                    # Advance to the closing brace; running off the end of
                    # the source raises IndexError.
                    while xml_regex[pos] != '}':
                        pos += 1
                except (IndexError, ValueError):
                    raise XMLSchemaRegexError(
                        "truncated unicode block escape at position %d: %r" % (block_pos, xml_regex))

                # The name lies between '\p{' (three characters) and the '}'.
                p_shortcut_set = get_unicode_subset(xml_regex[block_pos + 3:pos])
                if xml_regex[block_pos + 1] == 'p':
                    regex.append('[%s]' % p_shortcut_set)
                else:
                    regex.append('[^%s]' % p_shortcut_set)
            else:
                # Any other escape is copied unchanged.
                regex.append('\\%s' % xml_regex[pos])
        else:
            regex.append(ch)
        pos += 1

    if nested_groups > 0:
        raise XMLSchemaRegexError("unterminated subpattern in expression: %r" % xml_regex)
    regex.append(r')$')
    return ''.join(regex)
361