Package xmlschema_acue :: Module codepoints

Source Code for Module xmlschema_acue.codepoints

  1  # -*- coding: utf-8 -*- 
  2  # 
  3  # Copyright (c), 2016-2019, SISSA (International School for Advanced Studies). 
  4  # All rights reserved. 
  5  # This file is distributed under the terms of the MIT License. 
  6  # See the file 'LICENSE' in the root directory of the present 
  7  # distribution, or http://opensource.org/licenses/MIT. 
  8  # 
  9  # @author Davide Brunato <brunato@sissa.it> 
 10  # 
 11  """ 
 12  This module defines Unicode character categories and blocks, defined as sets of code points. 
 13  """ 
 14  from __future__ import unicode_literals 
 15  from __future__ import absolute_import 
 16   
 17  import json 
 18  import os 
 19  from sys import maxunicode 
 20   
 21  from xmlschema_acue.compat import PY3, unicode_chr, string_base_type, Iterable, MutableSet 
 22  from xmlschema_acue.exceptions import XMLSchemaValueError, XMLSchemaTypeError, XMLSchemaRegexError 
 23   
 24  CHARACTER_GROUP_ESCAPED = {ord(c) for c in r'-|.^?*+{}()[]\\'} 
 25  """Code Points of escaped chars in a character group.""" 
 26   
 27  UCS4_MAXUNICODE = 1114111 
def code_point_order(cp):
    """
    Sort key for code points: an integer is ordered by its own value,
    a code point range by its lower bound.
    """
    if isinstance(cp, int):
        return cp
    return cp[0]
33
def code_point_reverse_order(cp):
    """
    Sort key for reverse ordering of code points: an integer is ordered by its
    own value, a code point range by its last included code point.
    """
    if isinstance(cp, int):
        return cp
    upper_bound = cp[1]  # the upper bound of a range is excluded
    return upper_bound - 1
38
def iter_code_points(code_points, reverse=False):
    """
    Iterates a collection of code points, merging the items that overlap or
    that are contiguous into code point ranges.

    :param code_points: an iterable with code points (integers) and code point \
    ranges (couples of integers with the upper bound excluded).
    :param reverse: if `True` the items are generated in descending order.
    :return: yields code points or code point ranges.
    """
    if reverse:
        ordered = sorted(code_points, key=code_point_reverse_order, reverse=True)
    else:
        ordered = sorted(code_points, key=code_point_order)

    # Bounds of the range that is being accumulated (None until the first item).
    low = high = None
    for item in ordered:
        if isinstance(item, int):
            item = item, item + 1

        if low is None:
            low, high = item
            continue

        if reverse:
            # Descending scan: the accumulated range grows downwards.
            if low <= item[1]:
                low = min(low, item[0])
                continue
        elif high >= item[0]:
            # Ascending scan: the accumulated range grows upwards.
            high = max(high, item[1])
            continue

        # The item is detached from the accumulated range: emit it and restart.
        yield (low, high) if high > low + 1 else low
        low, high = item

    if low is not None:
        yield (low, high) if high > low + 1 else low
80
def check_code_point(cp):
    """
    Checks a code point or code point range.

    :param cp: an integer code point or a couple of integers that represents \
    a code point range with the upper bound excluded.
    :return: a valid code point range. An integer is converted to the couple \
    `(cp, cp + 1)`, a range is returned unchanged.
    :raises XMLSchemaValueError: if the argument is not a code point in \
    [0, sys.maxunicode] or is not a valid, non-empty, range of integers.
    """
    if isinstance(cp, int):
        if not (0 <= cp <= maxunicode):
            raise XMLSchemaValueError("not a Unicode code point: %r" % cp)
        return cp, cp + 1

    # Check the types before comparing the bounds, so that non-integer bounds are
    # reported with the module's exception instead of failing on the comparison.
    if not isinstance(cp[0], int) or not isinstance(cp[1], int) \
            or not (0 <= cp[0] < cp[1] <= maxunicode + 1):
        # The argument is wrapped in a 1-tuple because a bare couple would be
        # expanded by the % operator as two distinct format arguments.
        raise XMLSchemaValueError("not a Unicode code point range: %r" % (cp,))
    return cp
97
def _escaped_char(code_point):
    """Returns the character of a code point, backslash-escaped if it's special in a character group."""
    char = unicode_chr(code_point)
    if code_point in CHARACTER_GROUP_ESCAPED:
        return '\\' + char
    return char


def code_point_repr(cp):
    """
    Returns the string representation of a code point or of a code point range,
    as used inside a regex character group.

    :param cp: an integer or a tuple with at least two integers. Values must be \
    in interval [0, sys.maxunicode].
    :return: the character for an integer, otherwise the range in the form \
    'start-end'. A range of two code points is rendered as the two characters \
    without the hyphen and a range of only one code point as that single character.
    """
    if isinstance(cp, int):
        return _escaped_char(cp)

    start_char = _escaped_char(cp[0])
    end_cp = cp[1] - 1  # Character ranges include the right bound
    if end_cp == cp[0]:
        # Degenerate range (n, n + 1): render the character only once.
        return start_char

    end_char = _escaped_char(end_cp)
    if end_cp > cp[0] + 1:
        return '%s-%s' % (start_char, end_char)
    return start_char + end_char
125
def iterparse_character_group(s, expand_ranges=False):
    """
    Parse a regex character group part, generating a sequence of code points
    and code points ranges. An unescaped hyphen (-) that is not at the start
    or at the end is interpreted as range specifier.

    :param s: a string representing a character group part.
    :param expand_ranges: if set to `True` then expands character ranges, \
    yielding each code point of a range as a single integer.
    :return: yields integers or couples of integers.
    :raises XMLSchemaRegexError: for an unescaped square bracket, for a range \
    with reversed bounds or for a range that ends with a character class escape.
    """
    escaped = False     # the previous character is an unconsumed backslash
    on_range = False    # the previous item is a range: a following hyphen is literal
    char = None         # last parsed character, the candidate start of a range
    length = len(s)
    string_iter = iter(range(length))
    for k in string_iter:
        if k == 0:
            char = s[0]
            if char == '\\':
                escaped = True
            elif char in r'[]' and length > 1:
                raise XMLSchemaRegexError("bad character %r at position 0" % char)
            elif expand_ranges or length <= 2 or s[1] != '-':
                yield ord(char)
        elif s[k] == '-':
            if escaped or (k == length - 1):
                char = s[k]
                yield ord(char)
                escaped = False
            elif on_range:
                char = s[k]
                yield ord(char)
                on_range = False
            else:
                # Parse character range
                on_range = True
                try:
                    k = next(string_iter)
                    end_char = s[k]
                    if end_char == '\\' and (k < length - 1):
                        if s[k+1] in r'-|.^?*+{}()[]':
                            k = next(string_iter)
                            end_char = s[k]
                        elif s[k+1] in r'sSdDiIcCwWpP':
                            msg = "bad character range '%s-\\%s' at position %d: %r" % (char, s[k+1], k-2, s)
                            raise XMLSchemaRegexError(msg)
                except StopIteration:
                    # Defensive: the guards on k keep both next() calls inside the string.
                    msg = "bad character range '%s-%s' at position %d: %r" % (char, s[-1], k-2, s)
                    raise XMLSchemaRegexError(msg)

                if ord(char) > ord(end_char):
                    msg = "bad character range '%s-%s' at position %d: %r" % (char, end_char, k-2, s)
                    raise XMLSchemaRegexError(msg)
                elif expand_ranges:
                    # The start character has already been yielded when it was parsed.
                    for cp in range(ord(char) + 1, ord(end_char) + 1):
                        yield cp
                else:
                    yield ord(char), ord(end_char) + 1
        elif s[k] in r'|.^?*+{}()':
            if escaped:
                escaped = False
            on_range = False
            char = s[k]
            yield ord(char)
        elif s[k] in r'[]':
            if not escaped and length > 1:
                raise XMLSchemaRegexError("bad character %r at position %d" % (s[k], k))
            escaped = on_range = False
            char = s[k]
            # The character is deferred to the range branch only if the following
            # hyphen is not the last character, because a trailing hyphen is a literal.
            if expand_ranges or k >= length - 2 or s[k+1] != '-':
                yield ord(char)
        elif s[k] == '\\':
            if escaped:
                escaped = on_range = False
                char = '\\'
                yield ord(char)
            else:
                escaped = True
        else:
            if escaped:
                escaped = False
                yield ord('\\')
            on_range = False
            char = s[k]
            # Same deferral rule of the square brackets branch.
            if expand_ranges or k >= length - 2 or s[k+1] != '-':
                yield ord(char)
    if escaped:
        yield ord('\\')
217
class UnicodeSubset(MutableSet):
    """
    Represent a subset of Unicode code points, implemented with an ordered list of integer values
    and ranges. It manages character ranges for adding or for discarding elements from a string
    and for a compressed representation.

    Ranges are stored as couples of integers with the upper bound excluded.
    """

    def __init__(self, *args, **kwargs):
        """
        :param args: at most one argument, that can be another `UnicodeSubset`, \
        a string representing a character group part or an iterable of code points \
        and code point ranges.
        :raises XMLSchemaTypeError: if more than one positional argument or any \
        keyword argument is provided.
        """
        if len(args) > 1:
            raise XMLSchemaTypeError(
                '%s expected at most 1 arguments, got %d' % (self.__class__.__name__, len(args))
            )
        if kwargs:
            raise XMLSchemaTypeError(
                '%s does not take keyword arguments' % self.__class__.__name__
            )

        if not args:
            self._code_points = list()
        elif isinstance(args[0], UnicodeSubset):
            # list() instead of list.copy(), that is not available with Python 2.
            self._code_points = list(args[0].code_points)
        else:
            self._code_points = list()
            self.update(args[0])

    @classmethod
    def fromlist(cls, code_points):
        """
        Builds an instance from a list of code points and ranges. The items are
        only sorted by their lower bound, they are not checked nor merged.
        """
        subset = cls()
        subset._code_points = sorted(code_points, key=code_point_order)
        return subset

    @property
    def code_points(self):
        """The internal list of code points and code point ranges."""
        return self._code_points

    def __repr__(self):
        return "<%s %r at %d>" % (self.__class__.__name__, str(self._code_points), id(self))

    def __str__(self):
        # Python 2 variant, replaced by __unicode__ when running on Python 3.
        return unicode(self).encode("utf-8")  # @UndefinedVariable

    def __unicode__(self):
        return ''.join(code_point_repr(cp) for cp in self._code_points)

    if PY3:
        __str__ = __unicode__

    def copy(self):
        return self.__copy__()

    def __copy__(self):
        return UnicodeSubset(self._code_points)

    def __reversed__(self):
        for item in reversed(self._code_points):
            if isinstance(item, int):
                yield item
            else:
                for cp in reversed(range(item[0], item[1])):
                    yield cp

    def complement(self):
        """
        Generates the code points and the code point ranges that are not in the subset.

        :raises XMLSchemaValueError: if the internal list is found unordered.
        """
        last_cp = 0
        for cp in self._code_points:
            if last_cp > maxunicode:
                break
            elif isinstance(cp, int):
                cp = cp, cp + 1

            # Size of the gap between the previous item and the current one.
            diff = cp[0] - last_cp
            if diff > 2:
                yield last_cp, cp[0]
            elif diff == 2:
                yield last_cp
                yield last_cp + 1
            elif diff == 1:
                yield last_cp
            elif diff != 0:
                raise XMLSchemaValueError("instance code points unordered")
            last_cp = cp[1]

        if last_cp < maxunicode:
            yield last_cp, maxunicode + 1
        elif last_cp == maxunicode:
            yield maxunicode

    def iter_characters(self):
        """Iterates the subset yielding characters instead of code points."""
        # unicode_chr is the function that the rest of the module uses for
        # converting a code point to a character.
        return map(unicode_chr, self)

    #
    # MutableSet's abstract methods implementation
    def __contains__(self, value):
        if not isinstance(value, int):
            try:
                value = ord(value)
            except TypeError:
                # 1-tuple wrapping: a tuple value would be expanded by the % operator.
                raise XMLSchemaTypeError(
                    "%r: argument must be a code point or a character." % (value,)
                )

        # The list is ordered, so the scan stops at the first item above the value.
        for cp in self._code_points:
            if not isinstance(cp, int):
                if cp[0] > value:
                    return False
                elif cp[1] <= value:
                    continue
                else:
                    return True
            elif cp > value:
                return False
            elif cp == value:
                return True
        return False

    def __iter__(self):
        for cp in self._code_points:
            if isinstance(cp, int):
                yield cp
            else:
                for k in range(*cp):
                    yield k

    def __len__(self):
        # Computed from the bounds of the ranges, without enumerating the code points.
        return sum(
            1 if isinstance(cp, int) else max(0, cp[1] - cp[0])
            for cp in self._code_points
        )

    def update(self, *others):
        """
        Adds to the subset the code points of each argument, that can be a string
        representing a character group part or an iterable of code points and ranges.
        """
        for value in others:
            if isinstance(value, string_base_type):
                for cp in iter_code_points(iterparse_character_group(value), reverse=True):
                    self.add(cp)
            else:
                for cp in iter_code_points(value, reverse=True):
                    self.add(cp)

    def add(self, value):
        """
        Adds a code point or a code point range, merging it with the items
        of the list that it overlaps or that it touches.
        """
        start_value, end_value = check_code_point(value)
        code_points = self._code_points
        last_index = len(code_points) - 1
        for k, cp in enumerate(code_points):
            if isinstance(cp, int):
                cp = cp, cp + 1

            if end_value < cp[0]:
                # Completely before the current item: insert it here.
                code_points.insert(k, value if isinstance(value, int) else tuple(value))
            elif start_value > cp[1]:
                continue
            elif end_value > cp[1]:
                if k == last_index:
                    code_points[k] = min(cp[0], start_value), end_value
                else:
                    next_cp = code_points[k + 1]
                    higher_bound = next_cp if isinstance(next_cp, int) else next_cp[0]
                    if end_value <= higher_bound:
                        code_points[k] = min(cp[0], start_value), end_value
                    else:
                        # Extend the item up to the next one and go on with the remaining part.
                        code_points[k] = min(cp[0], start_value), higher_bound
                        start_value = higher_bound
                        continue
            elif start_value < cp[0]:
                code_points[k] = start_value, cp[1]
            break
        else:
            self._code_points.append(tuple(value) if isinstance(value, list) else value)

    def difference_update(self, *others):
        """
        Removes from the subset the code points of each argument, that can be a string
        representing a character group part or an iterable of code points and ranges.
        """
        for value in others:
            if isinstance(value, string_base_type):
                for cp in iter_code_points(iterparse_character_group(value), reverse=True):
                    self.discard(cp)
            else:
                for cp in iter_code_points(value, reverse=True):
                    self.discard(cp)

    def discard(self, value):
        """
        Removes a code point or a code point range, trimming or splitting
        the items of the list that are partially covered.
        """
        start_cp, end_cp = check_code_point(value)
        code_points = self._code_points
        # Backward scan, so deletions and insertions don't shift the items still to visit.
        for k in reversed(range(len(code_points))):
            cp = code_points[k]
            if isinstance(cp, int):
                cp = cp, cp + 1

            if start_cp >= cp[1]:
                break
            elif end_cp >= cp[1]:
                # The upper part of the item is covered.
                if start_cp <= cp[0]:
                    del code_points[k]
                elif start_cp - cp[0] > 1:
                    code_points[k] = cp[0], start_cp
                else:
                    code_points[k] = cp[0]
            elif end_cp > cp[0]:
                if start_cp <= cp[0]:
                    # The lower part of the item is covered.
                    if cp[1] - end_cp > 1:
                        code_points[k] = end_cp, cp[1]
                    else:
                        code_points[k] = cp[1] - 1
                else:
                    # An inner part of the item is covered: split it in two parts.
                    if cp[1] - end_cp > 1:
                        code_points.insert(k + 1, (end_cp, cp[1]))
                    else:
                        code_points.insert(k + 1, cp[1] - 1)
                    if start_cp - cp[0] > 1:
                        code_points[k] = cp[0], start_cp
                    else:
                        code_points[k] = cp[0]

    #
    # MutableSet's mixin methods override
    def clear(self):
        del self._code_points[:]

    def __eq__(self, other):
        if not isinstance(other, Iterable):
            return NotImplemented
        elif isinstance(other, UnicodeSubset):
            return self._code_points == other._code_points
        else:
            return self._code_points == other

    def __ior__(self, other):
        if not isinstance(other, Iterable):
            return NotImplemented
        elif isinstance(other, UnicodeSubset):
            other = reversed(other._code_points)
        else:
            other = iter_code_points(other, reverse=True)

        for cp in other:
            self.add(cp)
        return self

    def __isub__(self, other):
        if not isinstance(other, Iterable):
            return NotImplemented
        elif isinstance(other, UnicodeSubset):
            other = reversed(other._code_points)
        else:
            other = iter_code_points(other, reverse=True)

        for cp in other:
            self.discard(cp)
        return self

    def __sub__(self, other):
        obj = self.copy()
        return obj.__isub__(other)

    def __rsub__(self, other):
        # Reflected operation: the result is `other - self`, not `self - other`.
        if not isinstance(other, Iterable):
            return NotImplemented
        obj = UnicodeSubset(other)
        return obj.__isub__(self)

    def __iand__(self, other):
        if not isinstance(other, Iterable):
            return NotImplemented
        # Removes what is not in other, range by range instead of one code point at a time.
        return self.__isub__(self - other)

    def __ixor__(self, other):
        if other is self:
            self.clear()
            return self
        elif not isinstance(other, Iterable):
            return NotImplemented
        elif not isinstance(other, UnicodeSubset):
            other = UnicodeSubset(other)

        for value in other:
            if value in self:
                self.discard(value)
            else:
                self.add(value)
        return self
489
def get_unicodedata_categories():
    """
    Extracts Unicode categories information from unicodedata library. Each category is
    represented with an ordered list containing code points and code point ranges.
    Each item is registered both in its minor category (e.g. 'Lu') and in the
    major category named by the first letter (e.g. 'L').

    :return: a dictionary with category names as keys and lists as values.
    """
    from unicodedata import category

    categories = {k: [] for k in (
        'C', 'Cc', 'Cf', 'Cs', 'Co', 'Cn',
        'L', 'Lu', 'Ll', 'Lt', 'Lm', 'Lo',
        'M', 'Mn', 'Mc', 'Me',
        'N', 'Nd', 'Nl', 'No',
        'P', 'Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po',
        'S', 'Sm', 'Sc', 'Sk', 'So',
        'Z', 'Zs', 'Zl', 'Zp'
    )}

    def close_run(name, start_cp, stop_cp):
        # Registers the run [start_cp, stop_cp) as a range or as a single code point.
        item = (start_cp, stop_cp) if stop_cp > start_cp + 1 else start_cp
        categories[name].append(item)
        categories[name[0]].append(item)

    minor_category = None   # category of the run in progress, None if there isn't one
    start_cp = 0
    for cp in range(maxunicode + 1):
        try:
            char = unicode_chr(cp)
        except ValueError:
            # Jython has no surrogates @0xD800 - which are prohibited by the XML
            # specs anyhow - so these code points are skipped on that platform.
            if os.name != 'java':
                raise
            # Close the run in progress, otherwise the skipped code points
            # would be included in the range of the preceding category.
            if minor_category is not None:
                close_run(minor_category, start_cp, cp)
                minor_category = None
            continue

        char_category = category(char)
        if char_category != minor_category:
            if minor_category is not None:
                close_run(minor_category, start_cp, cp)
            minor_category = char_category
            start_cp = cp

    if minor_category is not None:
        close_run(minor_category, start_cp, maxunicode + 1)

    return categories
566
def save_unicode_categories(filename=None):
    """
    Dumps the Unicode categories to a JSON file.

    :param filename: the path of the JSON file to write. If it's `None` the \
    predefined filename 'unicode_categories.json' is used, located in the \
    directory of this module.
    """
    if filename is None:
        module_dir = os.path.dirname(__file__)
        filename = os.path.join(module_dir, 'unicode_categories.json')

    print("Saving Unicode categories to %r" % filename)
    with open(filename, 'w') as json_file:
        json.dump(get_unicodedata_categories(), json_file)
581
def build_unicode_categories(filename=None):
    """
    Builds the Unicode categories as `UnicodeSubset` instances. For a fast building a pre-built
    JSON file with Unicode categories data can be used. If the JSON file is missing, is not
    accessible, is not valid JSON or contains an empty category, the categories data is
    rebuilt using `unicodedata.category()` API.

    :param filename: the name of the JSON file to load for a fast building of the categories. \
    If not provided the predefined filename 'unicode_categories.json' is used.
    :return: a dictionary that associates Unicode category names with `UnicodeSubset` instances.
    """
    if maxunicode < UCS4_MAXUNICODE:
        # Narrow builds (Python 2.7): the pre-built data is not used.
        categories = get_unicodedata_categories()
    else:
        if filename is None:
            filename = os.path.join(os.path.dirname(__file__), 'unicode_categories.json')
        try:
            with open(filename, 'r') as fp:
                categories = json.load(fp)
        except (IOError, SystemError, ValueError):
            categories = get_unicodedata_categories()
        else:
            # Check the lists of code points: iterating the dictionary itself
            # would test the category names, that are never empty.
            if any(not v for v in categories.values()):
                categories = get_unicodedata_categories()

    return {k: UnicodeSubset.fromlist(v) for k, v in categories.items()}
# Unicode categories, built once at import time.
UNICODE_CATEGORIES = build_unicode_categories()


# Unicode blocks of the Basic Multilingual Plane, keyed by the block name prefixed with 'Is'.
UNICODE_BLOCKS = {
    'IsBasicLatin': UnicodeSubset('\u0000-\u007F'),
    'IsLatin-1Supplement': UnicodeSubset('\u0080-\u00FF'),
    'IsLatinExtended-A': UnicodeSubset('\u0100-\u017F'),
    'IsLatinExtended-B': UnicodeSubset('\u0180-\u024F'),
    'IsIPAExtensions': UnicodeSubset('\u0250-\u02AF'),
    'IsSpacingModifierLetters': UnicodeSubset('\u02B0-\u02FF'),
    'IsCombiningDiacriticalMarks': UnicodeSubset('\u0300-\u036F'),
    'IsGreek': UnicodeSubset('\u0370-\u03FF'),
    'IsCyrillic': UnicodeSubset('\u0400-\u04FF'),
    'IsArmenian': UnicodeSubset('\u0530-\u058F'),
    'IsHebrew': UnicodeSubset('\u0590-\u05FF'),
    'IsArabic': UnicodeSubset('\u0600-\u06FF'),
    'IsSyriac': UnicodeSubset('\u0700-\u074F'),
    'IsThaana': UnicodeSubset('\u0780-\u07BF'),
    'IsDevanagari': UnicodeSubset('\u0900-\u097F'),
    'IsBengali': UnicodeSubset('\u0980-\u09FF'),
    'IsGurmukhi': UnicodeSubset('\u0A00-\u0A7F'),
    'IsGujarati': UnicodeSubset('\u0A80-\u0AFF'),
    'IsOriya': UnicodeSubset('\u0B00-\u0B7F'),
    'IsTamil': UnicodeSubset('\u0B80-\u0BFF'),
    'IsTelugu': UnicodeSubset('\u0C00-\u0C7F'),
    'IsKannada': UnicodeSubset('\u0C80-\u0CFF'),
    'IsMalayalam': UnicodeSubset('\u0D00-\u0D7F'),
    'IsSinhala': UnicodeSubset('\u0D80-\u0DFF'),
    'IsThai': UnicodeSubset('\u0E00-\u0E7F'),
    'IsLao': UnicodeSubset('\u0E80-\u0EFF'),
    'IsTibetan': UnicodeSubset('\u0F00-\u0FFF'),
    'IsMyanmar': UnicodeSubset('\u1000-\u109F'),
    'IsGeorgian': UnicodeSubset('\u10A0-\u10FF'),
    'IsHangulJamo': UnicodeSubset('\u1100-\u11FF'),
    'IsEthiopic': UnicodeSubset('\u1200-\u137F'),
    'IsCherokee': UnicodeSubset('\u13A0-\u13FF'),
    'IsUnifiedCanadianAboriginalSyllabics': UnicodeSubset('\u1400-\u167F'),
    'IsOgham': UnicodeSubset('\u1680-\u169F'),
    'IsRunic': UnicodeSubset('\u16A0-\u16FF'),
    'IsKhmer': UnicodeSubset('\u1780-\u17FF'),
    'IsMongolian': UnicodeSubset('\u1800-\u18AF'),
    'IsLatinExtendedAdditional': UnicodeSubset('\u1E00-\u1EFF'),
    'IsGreekExtended': UnicodeSubset('\u1F00-\u1FFF'),
    'IsGeneralPunctuation': UnicodeSubset('\u2000-\u206F'),
    'IsSuperscriptsandSubscripts': UnicodeSubset('\u2070-\u209F'),
    'IsCurrencySymbols': UnicodeSubset('\u20A0-\u20CF'),
    'IsCombiningMarksforSymbols': UnicodeSubset('\u20D0-\u20FF'),
    'IsLetterlikeSymbols': UnicodeSubset('\u2100-\u214F'),
    'IsNumberForms': UnicodeSubset('\u2150-\u218F'),
    'IsArrows': UnicodeSubset('\u2190-\u21FF'),
    'IsMathematicalOperators': UnicodeSubset('\u2200-\u22FF'),
    'IsMiscellaneousTechnical': UnicodeSubset('\u2300-\u23FF'),
    'IsControlPictures': UnicodeSubset('\u2400-\u243F'),
    'IsOpticalCharacterRecognition': UnicodeSubset('\u2440-\u245F'),
    'IsEnclosedAlphanumerics': UnicodeSubset('\u2460-\u24FF'),
    'IsBoxDrawing': UnicodeSubset('\u2500-\u257F'),
    'IsBlockElements': UnicodeSubset('\u2580-\u259F'),
    'IsGeometricShapes': UnicodeSubset('\u25A0-\u25FF'),
    'IsMiscellaneousSymbols': UnicodeSubset('\u2600-\u26FF'),
    'IsDingbats': UnicodeSubset('\u2700-\u27BF'),
    'IsBraillePatterns': UnicodeSubset('\u2800-\u28FF'),
    'IsCJKRadicalsSupplement': UnicodeSubset('\u2E80-\u2EFF'),
    'IsKangxiRadicals': UnicodeSubset('\u2F00-\u2FDF'),
    'IsIdeographicDescriptionCharacters': UnicodeSubset('\u2FF0-\u2FFF'),
    'IsCJKSymbolsandPunctuation': UnicodeSubset('\u3000-\u303F'),
    'IsHiragana': UnicodeSubset('\u3040-\u309F'),
    'IsKatakana': UnicodeSubset('\u30A0-\u30FF'),
    'IsBopomofo': UnicodeSubset('\u3100-\u312F'),
    'IsHangulCompatibilityJamo': UnicodeSubset('\u3130-\u318F'),
    'IsKanbun': UnicodeSubset('\u3190-\u319F'),
    'IsBopomofoExtended': UnicodeSubset('\u31A0-\u31BF'),
    'IsEnclosedCJKLettersandMonths': UnicodeSubset('\u3200-\u32FF'),
    'IsCJKCompatibility': UnicodeSubset('\u3300-\u33FF'),
    'IsCJKUnifiedIdeographsExtensionA': UnicodeSubset('\u3400-\u4DB5'),
    'IsCJKUnifiedIdeographs': UnicodeSubset('\u4E00-\u9FFF'),
    'IsYiSyllables': UnicodeSubset('\uA000-\uA48F'),

    # NOTE(review): the following blocks are disabled. The three surrogate blocks are
    # presumably disabled for the same Jython limitation that is handled by
    # get_unicodedata_categories() - confirm if 'IsYiRadicals' and 'IsHangulSyllables',
    # that don't include surrogates, have to stay disabled too.
    # 'IsYiRadicals': UnicodeSubset('\uA490-\uA4CF'),
    # 'IsHangulSyllables': UnicodeSubset('\uAC00-\uD7A3'),
    # 'IsHighSurrogates': UnicodeSubset('\uD800-\uDB7F'),
    # 'IsHighPrivateUseSurrogates': UnicodeSubset('\uDB80-\uDBFF'),
    # 'IsLowSurrogates': UnicodeSubset('\uDC00-\uDFFF'),

    'IsPrivateUse': UnicodeSubset('\uE000-\uF8FF'),
    'IsCJKCompatibilityIdeographs': UnicodeSubset('\uF900-\uFAFF'),
    'IsAlphabeticPresentationForms': UnicodeSubset('\uFB00-\uFB4F'),
    'IsArabicPresentationForms-A': UnicodeSubset('\uFB50-\uFDFF'),
    'IsCombiningHalfMarks': UnicodeSubset('\uFE20-\uFE2F'),
    'IsCJKCompatibilityForms': UnicodeSubset('\uFE30-\uFE4F'),
    'IsSmallFormVariants': UnicodeSubset('\uFE50-\uFE6F'),
    'IsArabicPresentationForms-B': UnicodeSubset('\uFE70-\uFEFE'),
    'IsSpecials': UnicodeSubset('\uFEFF\uFFF0-\uFFFD'),
    'IsHalfwidthandFullwidthForms': UnicodeSubset('\uFF00-\uFFEF')
}

# The blocks over the Basic Multilingual Plane are added only on builds
# where sys.maxunicode covers the whole Unicode range.
if maxunicode == UCS4_MAXUNICODE:
    UNICODE_BLOCKS['IsPrivateUse'].update('\U000F0000-\U0010FFFD')
    UNICODE_BLOCKS.update({
        'IsOldItalic': UnicodeSubset('\U00010300-\U0001032F'),
        'IsGothic': UnicodeSubset('\U00010330-\U0001034F'),
        'IsDeseret': UnicodeSubset('\U00010400-\U0001044F'),
        'IsByzantineMusicalSymbols': UnicodeSubset('\U0001D000-\U0001D0FF'),
        'IsMusicalSymbols': UnicodeSubset('\U0001D100-\U0001D1FF'),
        'IsMathematicalAlphanumericSymbols': UnicodeSubset('\U0001D400-\U0001D7FF'),
        'IsCJKUnifiedIdeographsExtensionB': UnicodeSubset('\U00020000-\U0002A6D6'),
        'IsCJKCompatibilityIdeographsSupplement': UnicodeSubset('\U0002F800-\U0002FA1F'),
        'IsTags': UnicodeSubset('\U000E0000-\U000E007F')
    })