Package xmlschema_acue ::
Module codepoints
1
2
3
4
5
6
7
8
9
10
11 """
12 This module defines Unicode character categories and blocks, defined as sets of code points.
13 """
14 from __future__ import unicode_literals
15 from __future__ import absolute_import
16
17 import json
18 import os
19 from sys import maxunicode
20
21 from xmlschema_acue.compat import PY3, unicode_chr, string_base_type, Iterable, MutableSet
22 from xmlschema_acue.exceptions import XMLSchemaValueError, XMLSchemaTypeError, XMLSchemaRegexError
23
# Set of ord() values for the characters - | . ^ ? * + { } ( ) [ ] and the backslash.
24 CHARACTER_GROUP_ESCAPED = {ord(c) for c in r'-|.^?*+{}()[]\\'}
25 """Code Points of escaped chars in a character group."""
26
# 1114111 == 0x10FFFF; sys.maxunicode is compared against this value further down in the module.
27 UCS4_MAXUNICODE = 1114111
31 """Ordering function for code points."""
32 return cp if isinstance(cp, int) else cp[0]
33
36 """Reverse ordering function for code points."""
37 return cp if isinstance(cp, int) else cp[1] - 1
38
41 """
42 Iterates a code points sequence. Contiguous code points are merged into ranges.
43
44 :param code_points: an iterable with code points and code point ranges.
45 :param reverse: if `True` reverses the order of the sequence.
46 :return: yields code points or code point ranges.
47 """
48 start_cp = end_cp = None
49 if reverse:
50 code_points = sorted(code_points, key=code_point_reverse_order, reverse=True)
51 else:
52 code_points = sorted(code_points, key=code_point_order)
53
54 for cp in code_points:
55 if isinstance(cp, int):
56 cp = cp, cp + 1
57
58 if start_cp is None:
59 start_cp, end_cp = cp
60 continue
61 elif reverse:
62 if start_cp <= cp[1]:
63 start_cp = min(start_cp, cp[0])
64 continue
65 elif end_cp >= cp[0]:
66 end_cp = max(end_cp, cp[1])
67 continue
68
69 if end_cp > start_cp + 1:
70 yield start_cp, end_cp
71 else:
72 yield start_cp
73 start_cp, end_cp = cp
74 else:
75 if start_cp is not None:
76 if end_cp > start_cp + 1:
77 yield start_cp, end_cp
78 else:
79 yield start_cp
80
83 """
84 Checks a code point or code point range.
85
86 :return: a valid code point range.
87 """
88 if isinstance(cp, int):
89 if not (0 <= cp <= maxunicode):
90 raise XMLSchemaValueError("not a Unicode code point: %r" % cp)
91 return cp, cp + 1
92 else:
93 if not (0 <= cp[0] < cp[1] <= maxunicode + 1) \
94 or not isinstance(cp[0], int) or not isinstance(cp[1], int):
95 raise XMLSchemaValueError("not a Unicode code point range: %r" % cp)
96 return cp
97
100 """
101 Returns the string representation of a code point.
102
103 :param cp: an integer or a tuple with at least two integers. Values must be in interval [0, sys.maxunicode].
104 """
105 if isinstance(cp, int):
106 if cp in CHARACTER_GROUP_ESCAPED:
107 return r'\%s' % unicode_chr(cp)
108 return unicode_chr(cp)
109
110 if cp[0] in CHARACTER_GROUP_ESCAPED:
111 start_char = r'\%s' % unicode_chr(cp[0])
112 else:
113 start_char = unicode_chr(cp[0])
114
115 end_cp = cp[1] - 1
116 if end_cp in CHARACTER_GROUP_ESCAPED:
117 end_char = r'\%s' % unicode_chr(end_cp)
118 else:
119 end_char = unicode_chr(end_cp)
120
121 if end_cp > cp[0] + 1:
122 return '%s-%s' % (start_char, end_char)
123 else:
124 return start_char + end_char
125
128 """
129 Parse a regex character group part, generating a sequence of code points
130 and code points ranges. An unescaped hyphen (-) that is not at the start
131 or at the end is interpreted as a range specifier.
132
133 :param s: a string representing a character group part.
134 :param expand_ranges: if set to `True` then expands character ranges.
135 :return: yields integers or couples of integers.
136 """
137 escaped = False
138 on_range = False
139 char = None
140 length = len(s)
141 string_iter = iter(range(len(s)))
142 for k in string_iter:
143 if k == 0:
144 char = s[0]
145 if char == '\\':
146 escaped = True
147 elif char in r'[]' and length > 1:
148 raise XMLSchemaRegexError("bad character %r at position 0" % char)
149 elif expand_ranges:
150 yield ord(char)
151 elif length <= 2 or s[1] != '-':
152 yield ord(char)
153 elif s[k] == '-':
154 if escaped or (k == length - 1):
155 char = s[k]
156 yield ord(char)
157 escaped = False
158 elif on_range:
159 char = s[k]
160 yield ord(char)
161 on_range = False
162 else:
163
164 on_range = True
165 try:
166 k = next(string_iter)
167 end_char = s[k]
168 if end_char == '\\' and (k < length - 1):
169 if s[k+1] in r'-|.^?*+{}()[]':
170 k = next(string_iter)
171 end_char = s[k]
172 elif s[k+1] in r'sSdDiIcCwWpP':
173 msg = "bad character range '%s-\\%s' at position %d: %r" % (char, s[k+1], k-2, s)
174 raise XMLSchemaRegexError(msg)
175 except StopIteration:
176 msg = "bad character range '%s-%s' at position %d: %r" % (char, s[-1], k-2, s)
177 raise XMLSchemaRegexError(msg)
178
179 if ord(char) > ord(end_char):
180 msg = "bad character range '%s-%s' at position %d: %r" % (char, end_char, k-2, s)
181 raise XMLSchemaRegexError(msg)
182 elif expand_ranges:
183 for cp in range(ord(char) + 1, ord(end_char) + 1):
184 yield cp
185 else:
186 yield ord(char), ord(end_char) + 1
187 elif s[k] in r'|.^?*+{}()':
188 if escaped:
189 escaped = False
190 on_range = False
191 char = s[k]
192 yield ord(char)
193 elif s[k] in r'[]':
194 if not escaped and length > 1:
195 raise XMLSchemaRegexError("bad character %r at position %d" % (s[k], k))
196 escaped = on_range = False
197 char = s[k]
198 if k >= length-1 or s[k+1] != '-':
199 yield ord(char)
200 elif s[k] == '\\':
201 if escaped:
202 escaped = on_range = False
203 char = '\\'
204 yield ord(char)
205 else:
206 escaped = True
207 else:
208 if escaped:
209 escaped = False
210 yield ord('\\')
211 on_range = False
212 char = s[k]
213 if k >= length-1 or s[k+1] != '-':
214 yield ord(char)
215 if escaped:
216 yield ord('\\')
217
220 """
221 Represent a subset of Unicode code points, implemented with an ordered list of integer values
222 and ranges. It manages character ranges for adding or for discarding elements from a string
223 and for a compressed representation.
224 """
225
227 if len(args) > 1:
228 raise XMLSchemaTypeError(
229 '%s expected at most 1 arguments, got %d' % (self.__class__.__name__, len(args))
230 )
231 if kwargs:
232 raise XMLSchemaTypeError(
233 '%s does not take keyword arguments' % self.__class__.__name__
234 )
235
236 if not args:
237 self._code_points = list()
238 elif isinstance(args[0], UnicodeSubset):
239 self._code_points = args[0].code_points.copy()
240 else:
241 self._code_points = list()
242 self.update(args[0])
243
244 @classmethod
249
250 @property
252 return self._code_points
253
255 return "<%s %r at %d>" % (self.__class__.__name__, str(self._code_points), id(self))
256
258 return unicode(self).encode("utf-8")
259
262
263 if PY3:
264 __str__ = __unicode__
265
268
271
273 for item in reversed(self._code_points):
274 if isinstance(item, int):
275 yield item
276 else:
277 for cp in reversed(range(item[0], item[1])):
278 yield cp
279
281 last_cp = 0
282 for cp in self._code_points:
283 if last_cp > maxunicode:
284 break
285 elif isinstance(cp, int):
286 cp = cp, cp + 1
287
288 diff = cp[0] - last_cp
289 if diff > 2:
290 yield last_cp, cp[0]
291 elif diff == 2:
292 yield last_cp
293 yield last_cp + 1
294 elif diff == 1:
295 yield last_cp
296 elif diff != 0:
297 raise XMLSchemaValueError("instance code points unordered")
298 last_cp = cp[1]
299
300 if last_cp < maxunicode:
301 yield last_cp, maxunicode + 1
302 elif last_cp == maxunicode:
303 yield maxunicode
304
307
308
309
311 if not isinstance(value, int):
312 try:
313 value = ord(value)
314 except TypeError:
315 raise XMLSchemaTypeError("%r: argument must be a code point or a character." % value)
316
317 for cp in self._code_points:
318 if not isinstance(cp, int):
319 if cp[0] > value:
320 return False
321 elif cp[1] <= value:
322 continue
323 else:
324 return True
325 elif cp > value:
326 return False
327 elif cp == value:
328 return True
329 return False
330
332 for cp in self._code_points:
333 if isinstance(cp, int):
334 yield cp
335 else:
336 for k in range(*cp):
337 yield k
338
340 k = 0
341 for _ in self:
342 k += 1
343 return k
344
353
# Adds a code point (int) or a code point range to the subset by editing self._code_points
# in place. The value is first passed to check_code_point(), whose result is unpacked into
# a (start_value, end_value) pair.
# NOTE(review): indentation was lost in this listing, so the nesting described in the
# comments below is inferred from the statement order and should be confirmed.
354 - def add(self, value):
355 start_value, end_value = check_code_point(value)
356 code_points = self._code_points
357 last_index = len(code_points) - 1
358 for k, cp in enumerate(code_points):
# A bare int entry is handled as the pair (cp, cp + 1).
359 if isinstance(cp, int):
360 cp = cp, cp + 1
361
# The value ends before this entry starts: insert it at position k
# (ints are stored as given, anything else is stored as a tuple).
362 if end_value < cp[0]:
363 code_points.insert(k, value if isinstance(value, int) else tuple(value))
# The value starts past this entry: try the next entry.
364 elif start_value > cp[1]:
365 continue
# The value reaches past this entry's end: widen the entry. When a following entry
# exists, the widening is capped at that entry's first code point.
366 elif end_value > cp[1]:
367 if k == last_index:
368 code_points[k] = min(cp[0], start_value), end_value
369 else:
370 next_cp = code_points[k + 1]
371 higher_bound = next_cp if isinstance(next_cp, int) else next_cp[0]
372 if end_value <= higher_bound:
373 code_points[k] = min(cp[0], start_value), end_value
374 else:
375 code_points[k] = min(cp[0], start_value), higher_bound
# What is left of the value, from higher_bound onwards, is checked against the next entry.
376 start_value = higher_bound
377 continue
# The value starts before this entry without reaching past its end: move the entry's start back.
378 elif start_value < cp[0]:
379 code_points[k] = start_value, cp[1]
# presumably at loop-body level: stops the scan once the value has been placed or absorbed.
380 break
# presumably the for-loop's else clause: no entry stopped the scan, so the value goes at the end
# (a list is converted to a tuple, any other value is appended unchanged).
381 else:
382 self._code_points.append(tuple(value) if isinstance(value, list) else value)
383
392
394 start_cp, end_cp = check_code_point(value)
395 code_points = self._code_points
396 for k in reversed(range(len(code_points))):
397 cp = code_points[k]
398 if isinstance(cp, int):
399 cp = cp, cp + 1
400
401 if start_cp >= cp[1]:
402 break
403 elif end_cp >= cp[1]:
404 if start_cp <= cp[0]:
405 del code_points[k]
406 elif start_cp - cp[0] > 1:
407 code_points[k] = cp[0], start_cp
408 else:
409 code_points[k] = cp[0]
410 elif end_cp > cp[0]:
411 if start_cp <= cp[0]:
412 if cp[1] - end_cp > 1:
413 code_points[k] = end_cp, cp[1]
414 else:
415 code_points[k] = cp[1] - 1
416 else:
417 if cp[1] - end_cp > 1:
418 code_points.insert(k + 1, (end_cp, cp[1]))
419 else:
420 code_points.insert(k + 1, cp[1] - 1)
421 if start_cp - cp[0] > 1:
422 code_points[k] = cp[0], start_cp
423 else:
424 code_points[k] = cp[0]
425
426
427
429 del self._code_points[:]
430
432 if not isinstance(other, Iterable):
433 return NotImplemented
434 elif isinstance(other, UnicodeSubset):
435 return self._code_points == other._code_points
436 else:
437 return self._code_points == other
438
440 if not isinstance(other, Iterable):
441 return NotImplemented
442 elif isinstance(other, UnicodeSubset):
443 other = reversed(other._code_points)
444 else:
445 other = iter_code_points(other, reverse=True)
446
447 for cp in other:
448 self.add(cp)
449 return self
450
452 if not isinstance(other, Iterable):
453 return NotImplemented
454 elif isinstance(other, UnicodeSubset):
455 other = reversed(other._code_points)
456 else:
457 other = iter_code_points(other, reverse=True)
458
459 for cp in other:
460 self.discard(cp)
461 return self
462
466
467 __rsub__ = __sub__
468
470 for value in (self - other):
471 self.discard(value)
472 return self
473
475 if other is self:
476 self.clear()
477 return self
478 elif not isinstance(other, Iterable):
479 return NotImplemented
480 elif not isinstance(other, UnicodeSubset):
481 other = UnicodeSubset(other)
482
483 for value in other:
484 if value in self:
485 self.discard(value)
486 else:
487 self.add(value)
488 return self
489
492 """
493 Extracts Unicode categories information from unicodedata library. Each category is
494 represented with an ordered list containing code points and code point ranges.
495
496 :return: a dictionary with category names as keys and lists as values.
497 """
498 from unicodedata import category
499
500 categories = {k: [] for k in (
501 'C', 'Cc', 'Cf', 'Cs', 'Co', 'Cn',
502 'L', 'Lu', 'Ll', 'Lt', 'Lm', 'Lo',
503 'M', 'Mn', 'Mc', 'Me',
504 'N', 'Nd', 'Nl', 'No',
505 'P', 'Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po',
506 'S', 'Sm', 'Sc', 'Sk', 'So',
507 'Z', 'Zs', 'Zl', 'Zp'
508 )}
509
510 minor_category = 'Cc'
511 start_cp, next_cp = 0, 1
512 for cp in range(maxunicode + 1):
513
514
515
516 try:
517 _cat = unicode_chr(cp)
518 except ValueError:
519
520
521 if os.name == 'java':
522 continue
523 raise
524
525 _cat = category(_cat)
526
527 if _cat != minor_category:
528 if cp > next_cp:
529 categories[minor_category].append((start_cp, cp))
530 categories[minor_category[0]].append(categories[minor_category][-1])
531 else:
532 categories[minor_category].append(start_cp)
533 categories[minor_category[0]].append(start_cp)
534
535 minor_category = _cat
536 start_cp, next_cp = cp, cp + 1
537 else:
538 if next_cp == maxunicode + 1:
539 categories[minor_category].append(start_cp)
540 categories[minor_category[0]].append(start_cp)
541 else:
542 categories[minor_category].append((start_cp, maxunicode + 1))
543 categories[minor_category[0]].append(categories[minor_category][-1])
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565 return categories
566
569 """
570 Save Unicode categories to a JSON file.
571
572 :param filename: the JSON file to save. If it's `None`, uses the predefined filename
573 'unicode_categories.json' and tries to save it in the directory of this module.
574 """
575 if filename is None:
576 filename = os.path.join(os.path.dirname(__file__), 'unicode_categories.json')
577
578 print("Saving Unicode categories to %r" % filename)
579 with open(filename, 'w') as fp:
580 json.dump(get_unicodedata_categories(), fp)
581
584 """
585 Builds the Unicode categories as `UnicodeSubset` instances. For a fast building a pre-built
586 JSON file with Unicode categories data can be used. If the JSON file is missing or is not
587 accessible the categories data is rebuild using `unicodedata.category()` API.
588
589 :param filename: the name of the JSON file to load for a fast building of the categories. \
590 If not provided the predefined filename 'unicode_categories.json' is used.
591 :return: a dictionary that associates Unicode category names with `UnicodeSubset` instances.
592 """
593 if maxunicode < UCS4_MAXUNICODE:
594 categories = get_unicodedata_categories()
595 else:
596 if filename is None:
597 filename = os.path.join(os.path.dirname(__file__), 'unicode_categories.json')
598 try:
599 with open(filename, 'r') as fp:
600 categories = json.load(fp)
601 except (IOError, SystemError, ValueError):
602 categories = get_unicodedata_categories()
603 else:
604 if any(not v for v in categories):
605 categories = get_unicodedata_categories()
606
607 return {k: UnicodeSubset.fromlist(v) for k, v in categories.items()}
608
609
# Module-level assignment: build_unicode_categories() is called with no arguments when this
# module is executed, and its result is kept as a module constant.
610 UNICODE_CATEGORIES = build_unicode_categories()
611
612
# Maps block names (every key carries the 'Is' prefix) to UnicodeSubset objects, each created
# from a character-group string -- in most entries a single 'first-last' range. Every code
# point written in this literal is at or below U+FFFF.
613 UNICODE_BLOCKS = {
614 'IsBasicLatin': UnicodeSubset('\u0000-\u007F'),
615 'IsLatin-1Supplement': UnicodeSubset('\u0080-\u00FF'),
616 'IsLatinExtended-A': UnicodeSubset('\u0100-\u017F'),
617 'IsLatinExtended-B': UnicodeSubset('\u0180-\u024F'),
618 'IsIPAExtensions': UnicodeSubset('\u0250-\u02AF'),
619 'IsSpacingModifierLetters': UnicodeSubset('\u02B0-\u02FF'),
620 'IsCombiningDiacriticalMarks': UnicodeSubset('\u0300-\u036F'),
621 'IsGreek': UnicodeSubset('\u0370-\u03FF'),
622 'IsCyrillic': UnicodeSubset('\u0400-\u04FF'),
623 'IsArmenian': UnicodeSubset('\u0530-\u058F'),
624 'IsHebrew': UnicodeSubset('\u0590-\u05FF'),
625 'IsArabic': UnicodeSubset('\u0600-\u06FF'),
626 'IsSyriac': UnicodeSubset('\u0700-\u074F'),
627 'IsThaana': UnicodeSubset('\u0780-\u07BF'),
628 'IsDevanagari': UnicodeSubset('\u0900-\u097F'),
629 'IsBengali': UnicodeSubset('\u0980-\u09FF'),
630 'IsGurmukhi': UnicodeSubset('\u0A00-\u0A7F'),
631 'IsGujarati': UnicodeSubset('\u0A80-\u0AFF'),
632 'IsOriya': UnicodeSubset('\u0B00-\u0B7F'),
633 'IsTamil': UnicodeSubset('\u0B80-\u0BFF'),
634 'IsTelugu': UnicodeSubset('\u0C00-\u0C7F'),
635 'IsKannada': UnicodeSubset('\u0C80-\u0CFF'),
636 'IsMalayalam': UnicodeSubset('\u0D00-\u0D7F'),
637 'IsSinhala': UnicodeSubset('\u0D80-\u0DFF'),
638 'IsThai': UnicodeSubset('\u0E00-\u0E7F'),
639 'IsLao': UnicodeSubset('\u0E80-\u0EFF'),
640 'IsTibetan': UnicodeSubset('\u0F00-\u0FFF'),
641 'IsMyanmar': UnicodeSubset('\u1000-\u109F'),
642 'IsGeorgian': UnicodeSubset('\u10A0-\u10FF'),
643 'IsHangulJamo': UnicodeSubset('\u1100-\u11FF'),
644 'IsEthiopic': UnicodeSubset('\u1200-\u137F'),
645 'IsCherokee': UnicodeSubset('\u13A0-\u13FF'),
646 'IsUnifiedCanadianAboriginalSyllabics': UnicodeSubset('\u1400-\u167F'),
647 'IsOgham': UnicodeSubset('\u1680-\u169F'),
648 'IsRunic': UnicodeSubset('\u16A0-\u16FF'),
649 'IsKhmer': UnicodeSubset('\u1780-\u17FF'),
650 'IsMongolian': UnicodeSubset('\u1800-\u18AF'),
651 'IsLatinExtendedAdditional': UnicodeSubset('\u1E00-\u1EFF'),
652 'IsGreekExtended': UnicodeSubset('\u1F00-\u1FFF'),
653 'IsGeneralPunctuation': UnicodeSubset('\u2000-\u206F'),
654 'IsSuperscriptsandSubscripts': UnicodeSubset('\u2070-\u209F'),
655 'IsCurrencySymbols': UnicodeSubset('\u20A0-\u20CF'),
656 'IsCombiningMarksforSymbols': UnicodeSubset('\u20D0-\u20FF'),
657 'IsLetterlikeSymbols': UnicodeSubset('\u2100-\u214F'),
658 'IsNumberForms': UnicodeSubset('\u2150-\u218F'),
659 'IsArrows': UnicodeSubset('\u2190-\u21FF'),
660 'IsMathematicalOperators': UnicodeSubset('\u2200-\u22FF'),
661 'IsMiscellaneousTechnical': UnicodeSubset('\u2300-\u23FF'),
662 'IsControlPictures': UnicodeSubset('\u2400-\u243F'),
663 'IsOpticalCharacterRecognition': UnicodeSubset('\u2440-\u245F'),
664 'IsEnclosedAlphanumerics': UnicodeSubset('\u2460-\u24FF'),
665 'IsBoxDrawing': UnicodeSubset('\u2500-\u257F'),
666 'IsBlockElements': UnicodeSubset('\u2580-\u259F'),
667 'IsGeometricShapes': UnicodeSubset('\u25A0-\u25FF'),
668 'IsMiscellaneousSymbols': UnicodeSubset('\u2600-\u26FF'),
669 'IsDingbats': UnicodeSubset('\u2700-\u27BF'),
670 'IsBraillePatterns': UnicodeSubset('\u2800-\u28FF'),
671 'IsCJKRadicalsSupplement': UnicodeSubset('\u2E80-\u2EFF'),
672 'IsKangxiRadicals': UnicodeSubset('\u2F00-\u2FDF'),
673 'IsIdeographicDescriptionCharacters': UnicodeSubset('\u2FF0-\u2FFF'),
674 'IsCJKSymbolsandPunctuation': UnicodeSubset('\u3000-\u303F'),
675 'IsHiragana': UnicodeSubset('\u3040-\u309F'),
676 'IsKatakana': UnicodeSubset('\u30A0-\u30FF'),
677 'IsBopomofo': UnicodeSubset('\u3100-\u312F'),
678 'IsHangulCompatibilityJamo': UnicodeSubset('\u3130-\u318F'),
679 'IsKanbun': UnicodeSubset('\u3190-\u319F'),
680 'IsBopomofoExtended': UnicodeSubset('\u31A0-\u31BF'),
681 'IsEnclosedCJKLettersandMonths': UnicodeSubset('\u3200-\u32FF'),
682 'IsCJKCompatibility': UnicodeSubset('\u3300-\u33FF'),
683 'IsCJKUnifiedIdeographsExtensionA': UnicodeSubset('\u3400-\u4DB5'),
684 'IsCJKUnifiedIdeographs': UnicodeSubset('\u4E00-\u9FFF'),
685 'IsYiSyllables': UnicodeSubset('\uA000-\uA48F'),
686
687
688
689
690
691
692 'IsPrivateUse': UnicodeSubset('\uE000-\uF8FF'),
693 'IsCJKCompatibilityIdeographs': UnicodeSubset('\uF900-\uFAFF'),
694 'IsAlphabeticPresentationForms': UnicodeSubset('\uFB00-\uFB4F'),
695 'IsArabicPresentationForms-A': UnicodeSubset('\uFB50-\uFDFF'),
696 'IsCombiningHalfMarks': UnicodeSubset('\uFE20-\uFE2F'),
697 'IsCJKCompatibilityForms': UnicodeSubset('\uFE30-\uFE4F'),
698 'IsSmallFormVariants': UnicodeSubset('\uFE50-\uFE6F'),
699 'IsArabicPresentationForms-B': UnicodeSubset('\uFE70-\uFEFE'),
700 'IsSpecials': UnicodeSubset('\uFEFF\uFFF0-\uFFFD'),
701 'IsHalfwidthandFullwidthForms': UnicodeSubset('\uFF00-\uFFEF')
702 }
703
704 if maxunicode == UCS4_MAXUNICODE:
705 UNICODE_BLOCKS['IsPrivateUse'].update('\U000F0000-\U0010FFFD'),
706 UNICODE_BLOCKS.update({
707 'IsOldItalic': UnicodeSubset('\U00010300-\U0001032F'),
708 'IsGothic': UnicodeSubset('\U00010330-\U0001034F'),
709 'IsDeseret': UnicodeSubset('\U00010400-\U0001044F'),
710 'IsByzantineMusicalSymbols': UnicodeSubset('\U0001D000-\U0001D0FF'),
711 'IsMusicalSymbols': UnicodeSubset('\U0001D100-\U0001D1FF'),
712 'IsMathematicalAlphanumericSymbols': UnicodeSubset('\U0001D400-\U0001D7FF'),
713 'IsCJKUnifiedIdeographsExtensionB': UnicodeSubset('\U00020000-\U0002A6D6'),
714 'IsCJKCompatibilityIdeographsSupplement': UnicodeSubset('\U0002F800-\U0002FA1F'),
715 'IsTags': UnicodeSubset('\U000E0000-\U000E007F')
716 })
717