build_punct_map.py 1.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. #!/usr/bin/env python3
  2. import os
  3. import sys
  4. import textwrap
  5. self_path = os.path.dirname(os.path.realpath(__file__));
  6. f = open(self_path + "/unicode/DerivedGeneralCategory.txt", "r")
  7. codepoint_list = []
  8. category_list = [ "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" ]
  9. # Filter codepoints falling in the right category:
  10. for line in f:
  11. comment_off = line.find("#")
  12. if comment_off >= 0:
  13. line = line[:comment_off]
  14. line = line.strip()
  15. if not line:
  16. continue
  17. char_range, category = line.split(";")
  18. char_range = char_range.strip()
  19. category = category.strip()
  20. if not category in category_list:
  21. continue
  22. delim_off = char_range.find("..")
  23. if delim_off >= 0:
  24. codepoint0 = int(char_range[:delim_off], 16)
  25. codepoint1 = int(char_range[delim_off+2:], 16)
  26. for codepoint in range(codepoint0, codepoint1 + 1):
  27. codepoint_list.append(codepoint)
  28. else:
  29. codepoint = int(char_range, 16)
  30. codepoint_list.append(codepoint)
  31. f.close()
  32. codepoint_list.sort()
  33. index0 = 0
  34. count = len(codepoint_list)
  35. records = list()
  36. while index0 < count:
  37. index1 = index0 + 1
  38. while index1 < count and codepoint_list[index1] == codepoint_list[index1-1] + 1:
  39. index1 += 1
  40. if index1 - index0 > 1:
  41. # Range of codepoints
  42. records.append("R(0x{:04x},0x{:04x})".format(codepoint_list[index0], codepoint_list[index1-1]))
  43. else:
  44. # Single codepoint
  45. records.append("S(0x{:04x})".format(codepoint_list[index0]))
  46. index0 = index1
  47. sys.stdout.write("static const unsigned PUNCT_MAP[] = {\n")
  48. sys.stdout.write("\n".join(textwrap.wrap(", ".join(records), 110,
  49. initial_indent = " ", subsequent_indent=" ")))
  50. sys.stdout.write("\n};\n\n")