Package xmlschema_acue :: Module regex

Source Code for Module xmlschema_acue.regex

  1  # -*- coding: utf-8 -*- 
  2  # 
  3  # Copyright (c), 2016-2019, SISSA (International School for Advanced Studies). 
  4  # All rights reserved. 
  5  # This file is distributed under the terms of the MIT License. 
  6  # See the file 'LICENSE' in the root directory of the present 
  7  # distribution, or http://opensource.org/licenses/MIT. 
  8  # 
  9  # @author Davide Brunato <brunato@sissa.it> 
 10  # 
 11  """ 
 12  Parse and translate XML regular expressions to Python regex syntax. 
 13  """ 
 14  from __future__ import unicode_literals 
 15  from __future__ import absolute_import 
 16  import re 
 17  from sys import maxunicode 
 18   
 19  from xmlschema_acue.compat import PY3, unicode_type, string_base_type, MutableSet 
 20  from xmlschema_acue.exceptions import XMLSchemaValueError, XMLSchemaRegexError 
 21  from xmlschema_acue.codepoints import UnicodeSubset 
 22  from xmlschema_acue.codepoints import UNICODE_BLOCKS 
 23  from xmlschema_acue.codepoints import UNICODE_CATEGORIES 
 24   
 25  _RE_QUANTIFIER = re.compile(r'{\d+(,(\d+)?)?}') 
 26  _RE_FORBIDDEN_ESCAPES = re.compile( 
 27      r'(?<!\\)\\(U[0-9a-fA-F]{8}|u[0-9a-fA-F]{4}|x[0-9a-fA-F]{2}|o{\d+}|\d+|A|Z|z|B|b|o)' 
 28  ) 
 29   
 30  _UNICODE_SUBSETS = UNICODE_CATEGORIES.copy() 
 31  _UNICODE_SUBSETS.update(UNICODE_BLOCKS) 
 32   
 33   
def get_unicode_subset(key):
    """
    Return the Unicode subset registered under a category or block name.

    :param key: the name to look up in the module-level `_UNICODE_SUBSETS` table.
    :return: the value stored in `_UNICODE_SUBSETS` for the given key.
    :raises XMLSchemaRegexError: if the key is not present in the table.
    """
    try:
        return _UNICODE_SUBSETS[key]
    except KeyError:
        # Interpolate the offending key: the message carries a '%r' placeholder
        # that would otherwise be reported verbatim. The one-item tuple keeps
        # the formatting safe even if the key is itself a tuple.
        raise XMLSchemaRegexError("%r don't match to any Unicode category or block." % (key,))


# Character-class bodies that get_python_regex() substitutes for \i and \I.
I_SHORTCUT_REPLACE = (
    ":A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF"
    "\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD"
)

# Character-class bodies that get_python_regex() substitutes for \c and \C.
C_SHORTCUT_REPLACE = (
    "-.0-9:A-Z_a-z\u00B7\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u037D\u037F-\u1FFF\u200C-"
    "\u200D\u203F\u2040\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD"
)

# Subsets backing the multi-character escapes when used inside a character group.
S_SHORTCUT_SET = UnicodeSubset(' \n\t\r')
D_SHORTCUT_SET = UnicodeSubset('0-9')
I_SHORTCUT_SET = UnicodeSubset(I_SHORTCUT_REPLACE)
C_SHORTCUT_SET = UnicodeSubset(C_SHORTCUT_REPLACE)

# The \w subset is filled directly with the code points of the Unicode
# categories P, Z and C, ordered by single code point or by the first item
# of a non-integer entry.
# NOTE(review): XSD defines \w as every character *except* categories P, Z
# and C, while XsdRegexCharGroup.add() merges this set into `positive` for the
# lower-case escape -- the polarity looks inverted; confirm against UnicodeSubset.
W_SHORTCUT_SET = UnicodeSubset()
W_SHORTCUT_SET._code_points = sorted(
    UNICODE_CATEGORIES['P'].code_points
    + UNICODE_CATEGORIES['Z'].code_points
    + UNICODE_CATEGORIES['C'].code_points,
    key=lambda item: item if isinstance(item, int) else item[0]
)

# Single and Multi character escapes.
# Single-character escapes map to the literal character they stand for.
# Multi-character escapes map both letter cases to the same subset: the
# case of the escape letter is inspected by XsdRegexCharGroup.add()/discard().
CHARACTER_ESCAPES = {
    # Single-character escapes
    '\\n': '\n',
    '\\r': '\r',
    '\\t': '\t',
    '\\|': '|',
    '\\.': '.',
    '\\-': '-',
    '\\^': '^',
    '\\?': '?',
    '\\*': '*',
    '\\+': '+',
    '\\{': '{',
    '\\}': '}',
    '\\(': '(',
    '\\)': ')',
    '\\[': '[',
    '\\]': ']',
    '\\\\': '\\',

    # Multi-character escapes
    '\\s': S_SHORTCUT_SET,
    '\\S': S_SHORTCUT_SET,
    '\\d': D_SHORTCUT_SET,
    '\\D': D_SHORTCUT_SET,
    '\\i': I_SHORTCUT_SET,
    '\\I': I_SHORTCUT_SET,
    '\\c': C_SHORTCUT_SET,
    '\\C': C_SHORTCUT_SET,
    '\\w': W_SHORTCUT_SET,
    '\\W': W_SHORTCUT_SET,
}


class XsdRegexCharGroup(MutableSet):
    """
    A mutable set that models a character group of an XML Schema regex.

    The group is kept as two `UnicodeSubset` attributes: `positive` collects
    the code points that are included, `negative` collects the subsets added
    through upper-case multi-character escapes and upper-case Unicode
    references. When `negative` is not empty, membership is granted to every
    code point that is in `positive` or is not in `negative`.
    """
    _re_char_group = re.compile(r'(?<!.-)(\\[nrt|.\-^?*+{}()\]sSdDiIcCwW]|\\[pP]{[a-zA-Z\-0-9]+})')
    _re_unicode_ref = re.compile(r'\\([pP]){([\w\d-]+)}')

    def __init__(self, *args):
        """Create an empty group and add each positional argument to it."""
        self.positive = UnicodeSubset()
        self.negative = UnicodeSubset()
        for chunk in args:
            self.add(chunk)

    def __repr__(self):
        return '<%s at %d>' % (type(self).__name__, id(self))

    def __str__(self):
        # Python 2 flavour: byte string with the UTF-8 encoded representation.
        return self.__unicode__().encode("utf-8")

    def __unicode__(self):
        """Return the group rendered as a bracketed character class."""
        if not self.negative:
            return '[%s]' % unicode_type(self.positive)
        if not self.positive:
            return '[^%s]' % unicode_type(self.negative)

        # Both parts are populated: render the complement of the negative
        # part followed by the positive part inside a single class.
        allowed = UnicodeSubset(self.negative.complement())
        return '[%s%s]' % (unicode_type(allowed), unicode_type(self.positive))

    if PY3:
        __str__ = __unicode__

    def __contains__(self, char):
        code_point = ord(char)
        if self.negative:
            return code_point not in self.negative or code_point in self.positive
        return code_point in self.positive

    def __iter__(self):
        if not self.negative:
            return iter(sorted(self.positive))
        return (
            cp for cp in range(maxunicode + 1)
            if cp in self.positive or cp not in self.negative
        )

    def __len__(self):
        return len(self.positive) + len(self.negative)

    # Operators override
    def __isub__(self, other):
        """Subtract another group from this one in place and return self."""
        if self.negative:
            self.positive |= (other.negative - self.negative)
            if other.negative:
                self.negative.clear()
        elif other.negative:
            self.positive &= other.negative
        self.positive -= other.positive
        return self

    def _unicode_ref_subset(self, part):
        """Validate a Unicode category/block reference and return its subset."""
        if self._re_unicode_ref.search(part) is None:
            raise XMLSchemaValueError("wrong Unicode subset specification %r" % part)
        # Strip the leading backslash, the p/P letter and the braces.
        return get_unicode_subset(part[3:-1])

    def add(self, s):
        """
        Add to the group the characters and escapes found in a string.

        :param s: a string that may mix plain characters, character escapes \
        and Unicode category/block references.
        """
        for part in self._re_char_group.split(s):
            if part in CHARACTER_ESCAPES:
                escape_value = CHARACTER_ESCAPES[part]
                if isinstance(escape_value, string_base_type):
                    self.positive.update(escape_value)
                elif part[-1].islower():
                    self.positive |= escape_value
                else:
                    self.negative |= escape_value
                continue

            if part.startswith('\\p'):
                self.positive |= self._unicode_ref_subset(part)
            elif part.startswith('\\P'):
                self.negative |= self._unicode_ref_subset(part)
            else:
                self.positive.update(part)

    def discard(self, s):
        """
        Remove from the group the characters and escapes found in a string.

        :param s: a string that may mix plain characters, character escapes \
        and Unicode category/block references.
        """
        for part in self._re_char_group.split(s):
            if part in CHARACTER_ESCAPES:
                escape_value = CHARACTER_ESCAPES[part]
                if isinstance(escape_value, string_base_type):
                    self.positive.difference_update(escape_value)
                elif part[-1].islower():
                    self.positive -= escape_value
                else:
                    self.negative -= escape_value
                continue

            if part.startswith('\\p'):
                self.positive -= self._unicode_ref_subset(part)
            elif part.startswith('\\P'):
                self.negative -= self._unicode_ref_subset(part)
            else:
                self.positive.difference_update(part)

    def clear(self):
        """Empty both the positive and the negative part."""
        self.positive.clear()
        self.negative.clear()

    def complement(self):
        """Swap the positive and the negative part in place."""
        self.negative, self.positive = self.positive, self.negative


def parse_character_class(xml_regex, class_pos):
    """
    Parse one character class of an XML Schema regular expression.

    A class subtraction is parsed recursively and subtracted from the
    resulting group. An IndexError raised while scanning past the end of
    the string is not handled here.

    :param xml_regex: the source XML Schema regular expression.
    :param class_pos: index of the character class in the source string; \
    the character at this index must be '['.
    :return: a couple with the `XsdRegexCharGroup` instance and the index of \
    the ']' character that closes the character class.
    :raises XMLSchemaRegexError: if the class does not start with '[', contains \
    a nested '[', is empty, or a subtraction is not followed by ']'.
    """
    if xml_regex[class_pos] != '[':
        raise XMLSchemaRegexError('not a character class at position %d: %r' % (class_pos, xml_regex))

    cursor = class_pos + 1
    negative = xml_regex[cursor] == '^'
    if negative:
        cursor += 1

    # Scan up to the closing bracket or to the start of a subtraction.
    group_start = cursor
    while True:
        current = xml_regex[cursor]
        if current == '[':
            raise XMLSchemaRegexError("'[' is invalid in a character class: %r" % xml_regex)
        if current == '\\':
            # Skip the backslash together with the escaped character.
            cursor += 2
            continue
        if current == ']' or xml_regex[cursor:cursor + 2] == '-[':
            break
        cursor += 1

    if cursor == group_start:
        raise XMLSchemaRegexError("empty character class at position %d: %r" % (class_pos, xml_regex))

    char_group = XsdRegexCharGroup(xml_regex[group_start:cursor])
    if negative:
        char_group.complement()

    if xml_regex[cursor] != ']':
        # Parse a group subtraction: step over '-' and parse the nested class,
        # whose closing bracket must be followed by the outer closing bracket.
        cursor += 1
        subtracted_group, cursor = parse_character_class(xml_regex, cursor)
        cursor += 1
        if xml_regex[cursor] != ']':
            raise XMLSchemaRegexError("unterminated character group at position %d: %r" % (class_pos, xml_regex))
        char_group -= subtracted_group

    return char_group, cursor
250 251
def get_python_regex(xml_regex):
    """
    Translate an XML Schema regular expression into a Python regex string.

    The translated pattern is wrapped between '^(' and ')$'.

    :param xml_regex: the XML Schema regular expression to translate.
    :return: a string with the Python compatible expression.
    :raises XMLSchemaRegexError: for forbidden escapes, misplaced quantifiers \
    or meta characters, unbalanced parentheses, unterminated character groups \
    and truncated Unicode block escapes.
    """
    forbidden = _RE_FORBIDDEN_ESCAPES.search(xml_regex)
    if forbidden:
        raise XMLSchemaRegexError(
            "not allowed escape sequence %r at position %d: %r" % (forbidden.group(), forbidden.start(), xml_regex)
        )

    # Replacements for the XML name escapes, keyed by the escape letter.
    name_escapes = {
        'i': '[%s]' % I_SHORTCUT_REPLACE,
        'I': '[^%s]' % I_SHORTCUT_REPLACE,
        'c': '[%s]' % C_SHORTCUT_REPLACE,
        'C': '[^%s]' % C_SHORTCUT_REPLACE,
    }
    # Group renderings that are replaced by an explicit match-all / match-none class.
    degenerate_groups = {'[^]': r'[\w\W]', '[]': r'[^\w\W]'}
    quantifier_chars = ('?', '+', '*')

    chunks = ['^(']
    length = len(xml_regex)
    open_groups = 0
    idx = 0

    while idx < length:
        char = xml_regex[idx]
        if char == '.':
            chunks.append('[^\r\n]')
        elif char == '^' or char == '$':
            chunks.append(r'\%s' % char)
        elif char == '[':
            try:
                char_group, idx = parse_character_class(xml_regex, idx)
            except IndexError:
                raise XMLSchemaRegexError(
                    "unterminated character group at position %d: %r" % (idx, xml_regex)
                )
            group_repr = unicode_type(char_group)
            chunks.append(degenerate_groups.get(group_repr, group_repr))

        elif char == '{':
            if idx == 0:
                raise XMLSchemaRegexError("unexpected quantifier %r at position %d: %r" % (char, idx, xml_regex))
            found = _RE_QUANTIFIER.match(xml_regex[idx:])
            if found is None:
                raise XMLSchemaRegexError("invalid quantifier %r at position %d: %r" % (char, idx, xml_regex))
            quantifier = found.group()
            chunks.append(quantifier)
            idx += len(quantifier)
            if idx < length and xml_regex[idx] in quantifier_chars:
                raise XMLSchemaRegexError(
                    "unexpected meta character %r at position %d: %r" % (xml_regex[idx], idx, xml_regex)
                )
            # The index already points past the quantifier: skip the increment.
            continue

        elif char == '(':
            if xml_regex.startswith('(?', idx):
                raise XMLSchemaRegexError("'(?...)' extension notation is not allowed: %r" % xml_regex)
            open_groups += 1
            chunks.append(char)
        elif char == ']':
            raise XMLSchemaRegexError("unexpected meta character %r at position %d: %r" % (char, idx, xml_regex))
        elif char == ')':
            if open_groups == 0:
                raise XMLSchemaRegexError("unbalanced parenthesis ')' at position %d: %r" % (idx, xml_regex))
            open_groups -= 1
            chunks.append(char)
        elif char in quantifier_chars:
            if idx == 0:
                raise XMLSchemaRegexError("unexpected quantifier %r at position %d: %r" % (char, idx, xml_regex))
            if idx < length - 1 and xml_regex[idx + 1] in quantifier_chars + ('{',):
                raise XMLSchemaRegexError(
                    "unexpected meta character %r at position %d: %r" % (xml_regex[idx + 1], idx + 1, xml_regex)
                )
            chunks.append(char)
        elif char == '\\':
            idx += 1
            if idx >= length:
                # A trailing backslash is copied as is.
                chunks.append('\\')
            elif xml_regex[idx] in name_escapes:
                chunks.append(name_escapes[xml_regex[idx]])
            elif xml_regex[idx] in 'pP':
                block_pos = idx - 1
                try:
                    if xml_regex[idx + 1] != '{':
                        raise XMLSchemaValueError("a '{' expected, found %r." % xml_regex[idx + 1])
                    # Move to the first closing brace; a missing brace raises ValueError.
                    idx = xml_regex.index('}', idx)
                except (IndexError, ValueError):
                    raise XMLSchemaRegexError(
                        "truncated unicode block escape at position %d: %r" % (block_pos, xml_regex))

                p_shortcut_set = get_unicode_subset(xml_regex[block_pos + 3:idx])
                if xml_regex[block_pos + 1] == 'p':
                    chunks.append('[%s]' % p_shortcut_set)
                else:
                    chunks.append('[^%s]' % p_shortcut_set)
            else:
                chunks.append('\\%s' % xml_regex[idx])
        else:
            chunks.append(char)
        idx += 1

    if open_groups > 0:
        raise XMLSchemaRegexError("unterminated subpattern in expression: %r" % xml_regex)
    chunks.append(r')$')
    return ''.join(chunks)
361