unicode_ranges_fetch.py 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
#!/usr/bin/env python3
# Script used to dump character block ranges from
# the Unicode Character Database to the `unicode_ranges.inc` file.
# NOTE: This script is deliberately not integrated into the build system;
# you should run it manually whenever you want to update the data.
  6. from __future__ import annotations
  7. import os
  8. import sys
  9. from typing import Final
  10. from urllib.request import urlopen
if __name__ == "__main__":
    # When run as a script, put the directory two levels above this file on
    # the module search path; presumably this is what lets the `methods`
    # import below resolve (verify against the repository layout).
    sys.path.insert(1, os.path.join(os.path.dirname(__file__), "../../"))

from methods import generate_copyright_header
  14. URL: Final[str] = "https://www.unicode.org/Public/16.0.0/ucd/Blocks.txt"
  15. ranges: list[tuple[str, str, str]] = []
  16. exclude_blocks: set[str] = {
  17. "High Surrogates",
  18. "High Private Use Surrogates",
  19. "Low Surrogates",
  20. "Variation Selectors",
  21. "Specials",
  22. "Egyptian Hieroglyph Format Controls",
  23. "Tags",
  24. "Variation Selectors Supplement",
  25. }
  26. def parse_unicode_data() -> None:
  27. lines: list[str] = [line.decode("utf-8") for line in urlopen(URL)]
  28. for line in lines:
  29. if line.startswith("#") or not line.strip():
  30. continue
  31. split_line: list[str] = line.split(";")
  32. char_range: str = split_line[0].strip()
  33. block: str = split_line[1].strip()
  34. if block in exclude_blocks:
  35. continue
  36. range_start, range_end = char_range.split("..")
  37. ranges.append((f"0x{range_start}", f"0x{range_end}", block))
  38. def make_array(array_name: str, ranges: list[tuple[str, str, str]]) -> str:
  39. result: str = f"static UniRange {array_name}[] = {{\n"
  40. for start, end, block in ranges:
  41. result += f'\t{{ {start}, {end}, U"{block}" }},\n'
  42. result += """\t{ 0x10FFFF, 0x10FFFF, String() }
  43. };\n\n"""
  44. return result
  45. def generate_unicode_ranges_inc() -> None:
  46. parse_unicode_data()
  47. source: str = generate_copyright_header("unicode_ranges.inc")
  48. source += f"""
  49. // This file was generated using the `misc/scripts/unicode_ranges_fetch.py` script.
  50. #ifndef UNICODE_RANGES_INC
  51. #define UNICODE_RANGES_INC
  52. // Unicode Character Blocks
  53. // Source: {URL}
  54. struct UniRange {{
  55. \tint32_t start;
  56. \tint32_t end;
  57. \tString name;
  58. }};\n\n"""
  59. source += make_array("unicode_ranges", ranges)
  60. source += "#endif // UNICODE_RANGES_INC\n"
  61. unicode_ranges_path: str = os.path.join(os.path.dirname(__file__), "../../editor/import/unicode_ranges.inc")
  62. with open(unicode_ranges_path, "w", newline="\n") as f:
  63. f.write(source)
  64. print("`unicode_ranges.inc` generated successfully.")
# Regenerate the file only when this script is executed directly.
if __name__ == "__main__":
    generate_unicode_ranges_inc()