build_folding_map.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
  1. #!/usr/bin/env python3
  2. import os
  3. import sys
  4. import textwrap
  5. self_path = os.path.dirname(os.path.realpath(__file__));
  6. f = open(self_path + "/unicode/CaseFolding.txt", "r")
  7. status_list = [ "C", "F" ]
  8. folding_list = [ dict(), dict(), dict() ]
  9. # Filter the foldings for "full" folding.
  10. for line in f:
  11. comment_off = line.find("#")
  12. if comment_off >= 0:
  13. line = line[:comment_off]
  14. line = line.strip()
  15. if not line:
  16. continue
  17. raw_codepoint, status, raw_mapping, ignored_tail = line.split(";", 3)
  18. if not status.strip() in status_list:
  19. continue
  20. codepoint = int(raw_codepoint.strip(), 16)
  21. mapping = [int(it, 16) for it in raw_mapping.strip().split(" ")]
  22. mapping_len = len(mapping)
  23. if mapping_len in range(1, 4):
  24. folding_list[mapping_len-1][codepoint] = mapping
  25. else:
  26. assert(False)
  27. f.close()
  28. # If we assume that (index0 ... index-1) makes a range (as defined below),
  29. # check that the newly provided index is compatible with the range too; i.e.
  30. # verify that the range can be extended without breaking its properties.
  31. #
  32. # Currently, we can handle ranges which:
  33. #
  34. # (1) either form consecutive sequence of codepoints and which map that range
  35. # to other consecutive range of codepoints (of the same length);
  36. #
  37. # (2) or a consecutive sequence of codepoints with step 2 where each codepoint
  38. # CP is mapped to the codepoint CP+1
  39. # (e.g. 0x1234 -> 0x1235; 0x1236 -> 0x1237; 0x1238 -> 0x1239; ...).
  40. #
  41. # Note: When the codepoints in the range are mapped to multiple codepoints,
  42. # only the 1st mapped codepoint is considered. All the other ones have to be
  43. # shared by all the mappings covered by the range.
  44. def is_range_compatible(folding, codepoint_list, index0, index):
  45. N = index - index0
  46. codepoint0 = codepoint_list[index0]
  47. codepoint1 = codepoint_list[index0+1]
  48. codepointN = codepoint_list[index]
  49. mapping0 = folding[codepoint0]
  50. mapping1 = folding[codepoint1]
  51. mappingN = folding[codepointN]
  52. # Check the range type (1):
  53. if codepoint1 - codepoint0 == 1 and codepointN - codepoint0 == N \
  54. and mapping1[0] - mapping0[0] == 1 and mapping1[1:] == mapping0[1:] \
  55. and mappingN[0] - mapping0[0] == N and mappingN[1:] == mapping0[1:]:
  56. return True
  57. # Check the range type (2):
  58. if codepoint1 - codepoint0 == 2 and codepointN - codepoint0 == 2 * N \
  59. and mapping0[0] - codepoint0 == 1 \
  60. and mapping1[0] - codepoint1 == 1 and mapping1[1:] == mapping0[1:] \
  61. and mappingN[0] - codepointN == 1 and mappingN[1:] == mapping0[1:]:
  62. return True
  63. return False
  64. def mapping_str(list, mapping):
  65. return ",".join("0x{:04x}".format(x) for x in mapping)
  66. for mapping_len in range(1, 4):
  67. folding = folding_list[mapping_len-1]
  68. codepoint_list = list(folding)
  69. index0 = 0
  70. count = len(folding)
  71. records = list()
  72. data_records = list()
  73. while index0 < count:
  74. index1 = index0 + 1
  75. while index1 < count and is_range_compatible(folding, codepoint_list, index0, index1):
  76. index1 += 1
  77. if index1 - index0 > 2:
  78. # Range of codepoints
  79. records.append("R(0x{:04x},0x{:04x})".format(codepoint_list[index0], codepoint_list[index1-1]))
  80. data_records.append(mapping_str(data_records, folding[codepoint_list[index0]]))
  81. data_records.append(mapping_str(data_records, folding[codepoint_list[index1-1]]))
  82. index0 = index1
  83. else:
  84. # Single codepoint
  85. records.append("S(0x{:04x})".format(codepoint_list[index0]))
  86. data_records.append(mapping_str(data_records, folding[codepoint_list[index0]]))
  87. index0 += 1
  88. sys.stdout.write("static const unsigned FOLD_MAP_{}[] = {{\n".format(mapping_len))
  89. sys.stdout.write("\n".join(textwrap.wrap(", ".join(records), 110,
  90. initial_indent = " ", subsequent_indent=" ")))
  91. sys.stdout.write("\n};\n")
  92. sys.stdout.write("static const unsigned FOLD_MAP_{}_DATA[] = {{\n".format(mapping_len))
  93. sys.stdout.write("\n".join(textwrap.wrap(", ".join(data_records), 110,
  94. initial_indent = " ", subsequent_indent=" ")))
  95. sys.stdout.write("\n};\n")