char_range_fetch.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138
  1. #!/usr/bin/env python3
  2. # Script used to dump char ranges for specific properties from
  3. # the Unicode Character Database to the `char_range.inc` file.
  4. # NOTE: This script is deliberately not integrated into the build system;
  5. # you should run it manually whenever you want to update the data.
  6. from __future__ import annotations
  7. import os
  8. import sys
  9. from typing import Final
  10. from urllib.request import urlopen
  11. if __name__ == "__main__":
  12. sys.path.insert(1, os.path.join(os.path.dirname(__file__), "../../"))
  13. from methods import generate_copyright_header
# Unicode Character Database file the ranges are read from; the Unicode
# version is pinned in the path.
URL: Final[str] = "https://www.unicode.org/Public/16.0.0/ucd/DerivedCoreProperties.txt"

# Inclusive (start, end) code point ranges, one list per property. They are
# filled by `parse_unicode_data()` and emitted by `generate_char_range_inc()`.
xid_start: list[tuple[int, int]] = []  # "XID_Start" entries, plus the underscore.
xid_continue: list[tuple[int, int]] = []  # "XID_Continue" entries.
uppercase_letter: list[tuple[int, int]] = []  # "Uppercase" entries.
lowercase_letter: list[tuple[int, int]] = []  # "Lowercase" entries.
unicode_letter: list[tuple[int, int]] = []  # "Alphabetic" entries.
  20. def merge_ranges(ranges: list[tuple[int, int]]) -> None:
  21. if len(ranges) < 2:
  22. return
  23. last_start: int = ranges[0][0]
  24. last_end: int = ranges[0][1]
  25. original_ranges: list[tuple[int, int]] = ranges[1:]
  26. ranges.clear()
  27. for curr_range in original_ranges:
  28. curr_start: int = curr_range[0]
  29. curr_end: int = curr_range[1]
  30. if last_end + 1 != curr_start:
  31. ranges.append((last_start, last_end))
  32. last_start = curr_start
  33. last_end = curr_end
  34. ranges.append((last_start, last_end))
  35. def parse_unicode_data() -> None:
  36. lines: list[str] = [line.decode("utf-8") for line in urlopen(URL)]
  37. for line in lines:
  38. if line.startswith("#") or not line.strip():
  39. continue
  40. split_line: list[str] = line.split(";")
  41. char_range: str = split_line[0].strip()
  42. char_property: str = split_line[1].strip().split("#")[0].strip()
  43. range_start: str = char_range
  44. range_end: str = char_range
  45. if ".." in char_range:
  46. range_start, range_end = char_range.split("..")
  47. range_tuple: tuple[int, int] = (int(range_start, 16), int(range_end, 16))
  48. if char_property == "XID_Start":
  49. xid_start.append(range_tuple)
  50. elif char_property == "XID_Continue":
  51. xid_continue.append(range_tuple)
  52. elif char_property == "Uppercase":
  53. uppercase_letter.append(range_tuple)
  54. elif char_property == "Lowercase":
  55. lowercase_letter.append(range_tuple)
  56. elif char_property == "Alphabetic":
  57. unicode_letter.append(range_tuple)
  58. # Underscore technically isn't in XID_Start, but for our purposes it's included.
  59. xid_start.append((0x005F, 0x005F))
  60. xid_start.sort(key=lambda x: x[0])
  61. merge_ranges(xid_start)
  62. merge_ranges(xid_continue)
  63. merge_ranges(uppercase_letter)
  64. merge_ranges(lowercase_letter)
  65. merge_ranges(unicode_letter)
  66. def make_array(array_name: str, range_list: list[tuple[int, int]]) -> str:
  67. result: str = f"\n\nconstexpr inline CharRange {array_name}[] = {{\n"
  68. for start, end in range_list:
  69. result += f"\t{{ 0x{start:x}, 0x{end:x} }},\n"
  70. result += "};"
  71. return result
  72. def generate_char_range_inc() -> None:
  73. parse_unicode_data()
  74. source: str = generate_copyright_header("char_range.inc")
  75. source += f"""
  76. // This file was generated using the `misc/scripts/char_range_fetch.py` script.
  77. #pragma once
  78. #include "core/typedefs.h"
  79. // Unicode Derived Core Properties
  80. // Source: {URL}
  81. struct CharRange {{
  82. \tchar32_t start;
  83. \tchar32_t end;
  84. }};"""
  85. source += make_array("xid_start", xid_start)
  86. source += make_array("xid_continue", xid_continue)
  87. source += make_array("uppercase_letter", uppercase_letter)
  88. source += make_array("lowercase_letter", lowercase_letter)
  89. source += make_array("unicode_letter", unicode_letter)
  90. source += "\n"
  91. char_range_path: str = os.path.join(os.path.dirname(__file__), "../../core/string/char_range.inc")
  92. with open(char_range_path, "w", newline="\n") as f:
  93. f.write(source)
  94. print("`char_range.inc` generated successfully.")
# Regenerate `char_range.inc` only when executed as a script, not on import.
if __name__ == "__main__":
    generate_char_range_inc()