GenerateCaseConvert.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. # Script to generate CaseConvert.cxx from Python's Unicode data
  2. # Should be run rarely when a Python with a new version of Unicode data is available.
  3. # Requires Python 3.3 or later
  4. # Should not be run with old versions of Python.
  5. # Current best approach divides case conversions into two cases:
  6. # simple symmetric and complex.
  7. # Simple symmetric is where a lower and upper case pair convert to each
  8. # other and the folded form is the same as the lower case.
  9. # There are 1006 symmetric pairs.
  10. # These are further divided into ranges (stored as lower, upper, range length,
  11. # range pitch) and singletons (stored as lower, upper).
  12. # Complex is for cases that don't fit the above: where there are multiple
  13. # characters in one of the forms or fold is different to lower or
  14. # lower(upper(x)) or upper(lower(x)) are not x. These are represented as UTF-8
  15. # strings with original, folded, upper, and lower separated by '|'.
  16. # There are 126 complex cases.
  17. import codecs, itertools, os, string, sys, unicodedata
  18. from FileGenerator import Regenerate
  19. def contiguousRanges(l, diff):
  20. # l is s list of lists
  21. # group into lists where first element of each element differs by diff
  22. out = [[l[0]]]
  23. for s in l[1:]:
  24. if s[0] != out[-1][-1][0] + diff:
  25. out.append([])
  26. out[-1].append(s)
  27. return out
  28. def flatten(listOfLists):
  29. "Flatten one level of nesting"
  30. return itertools.chain.from_iterable(listOfLists)
  31. def conversionSets():
  32. # For all Unicode characters, see whether they have case conversions
  33. # Return 2 sets: one of simple symmetric conversion cases and another
  34. # with complex cases.
  35. complexes = []
  36. symmetrics = []
  37. for ch in range(sys.maxunicode):
  38. if ch >= 0xd800 and ch <= 0xDBFF:
  39. continue
  40. if ch >= 0xdc00 and ch <= 0xDFFF:
  41. continue
  42. uch = chr(ch)
  43. fold = uch.casefold()
  44. upper = uch.upper()
  45. lower = uch.lower()
  46. symmetric = False
  47. if uch != upper and len(upper) == 1 and uch == lower and uch == fold:
  48. lowerUpper = upper.lower()
  49. foldUpper = upper.casefold()
  50. if lowerUpper == foldUpper and lowerUpper == uch:
  51. symmetric = True
  52. symmetrics.append((ch, ord(upper), ch - ord(upper)))
  53. if uch != lower and len(lower) == 1 and uch == upper and lower == fold:
  54. upperLower = lower.upper()
  55. if upperLower == uch:
  56. symmetric = True
  57. if fold == uch:
  58. fold = ""
  59. if upper == uch:
  60. upper = ""
  61. if lower == uch:
  62. lower = ""
  63. if (fold or upper or lower) and not symmetric:
  64. complexes.append((uch, fold, upper, lower))
  65. return symmetrics, complexes
  66. def groupRanges(symmetrics):
  67. # Group the symmetrics into groups where possible, returning a list
  68. # of ranges and a list of symmetrics that didn't fit into a range
  69. def distance(s):
  70. return s[2]
  71. groups = []
  72. uniquekeys = []
  73. for k, g in itertools.groupby(symmetrics, distance):
  74. groups.append(list(g)) # Store group iterator as a list
  75. uniquekeys.append(k)
  76. contiguousGroups = flatten([contiguousRanges(g, 1) for g in groups])
  77. longGroups = [(x[0][0], x[0][1], len(x), 1) for x in contiguousGroups if len(x) > 4]
  78. oneDiffs = [s for s in symmetrics if s[2] == 1]
  79. contiguousOnes = flatten([contiguousRanges(g, 2) for g in [oneDiffs]])
  80. longOneGroups = [(x[0][0], x[0][1], len(x), 2) for x in contiguousOnes if len(x) > 4]
  81. rangeGroups = sorted(longGroups+longOneGroups, key=lambda s: s[0])
  82. rangeCoverage = list(flatten([range(r[0], r[0]+r[2]*r[3], r[3]) for r in rangeGroups]))
  83. nonRanges = [(l, u) for l, u, d in symmetrics if l not in rangeCoverage]
  84. return rangeGroups, nonRanges
  85. def escape(s):
  86. return "".join((chr(c) if chr(c) in string.ascii_letters else "\\x%x" % c) for c in s.encode('utf-8'))
  87. def updateCaseConvert():
  88. symmetrics, complexes = conversionSets()
  89. rangeGroups, nonRanges = groupRanges(symmetrics)
  90. print(len(rangeGroups), "ranges")
  91. rangeLines = ["%d,%d,%d,%d, " % x for x in rangeGroups]
  92. print(len(nonRanges), "non ranges")
  93. nonRangeLines = ["%d,%d, " % x for x in nonRanges]
  94. print(len(symmetrics), "symmetric")
  95. complexLines = ['"%s|%s|%s|%s|"' % tuple(escape(t) for t in x) for x in complexes]
  96. print(len(complexLines), "complex")
  97. Regenerate("../src/CaseConvert.cxx", "//", rangeLines, nonRangeLines, complexLines)
  98. updateCaseConvert()