ucaps_fetch.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. #!/usr/bin/env python3
  2. # Script used to dump case mappings from
  3. # the Unicode Character Database to the `ucaps.h` file.
  4. # NOTE: This script is deliberately not integrated into the build system;
  5. # you should run it manually whenever you want to update the data.
  6. from __future__ import annotations
  7. import os
  8. import sys
  9. from typing import Final
  10. from urllib.request import urlopen
  11. if __name__ == "__main__":
  12. sys.path.insert(1, os.path.join(os.path.dirname(__file__), "../../"))
  13. from methods import generate_copyright_header
# Location of the UnicodeData.txt file (Unicode 16.0.0 release) that the case mappings are read from.
URL: Final[str] = "https://www.unicode.org/Public/16.0.0/ucd/UnicodeData.txt"
# (code value, uppercase mapping) pairs as "0x"-prefixed hex strings; appended to by `parse_unicode_data()`.
lower_to_upper: list[tuple[str, str]] = []
# (code value, lowercase mapping) pairs as "0x"-prefixed hex strings; appended to by `parse_unicode_data()`.
upper_to_lower: list[tuple[str, str]] = []
  17. def parse_unicode_data() -> None:
  18. lines: list[str] = [line.decode("utf-8") for line in urlopen(URL)]
  19. for line in lines:
  20. split_line: list[str] = line.split(";")
  21. code_value: str = split_line[0].strip()
  22. uppercase_mapping: str = split_line[12].strip()
  23. lowercase_mapping: str = split_line[13].strip()
  24. if uppercase_mapping:
  25. lower_to_upper.append((f"0x{code_value}", f"0x{uppercase_mapping}"))
  26. if lowercase_mapping:
  27. upper_to_lower.append((f"0x{code_value}", f"0x{lowercase_mapping}"))
  28. def make_cap_table(table_name: str, len_name: str, table: list[tuple[str, str]]) -> str:
  29. result: str = f"static const int {table_name}[{len_name}][2] = {{\n"
  30. for first, second in table:
  31. result += f"\t{{ {first}, {second} }},\n"
  32. result += "};\n\n"
  33. return result
  34. def generate_ucaps_fetch() -> None:
  35. parse_unicode_data()
  36. source: str = generate_copyright_header("ucaps.h")
  37. source += f"""
  38. #pragma once
  39. // This file was generated using the `misc/scripts/ucaps_fetch.py` script.
  40. #define LTU_LEN {len(lower_to_upper)}
  41. #define UTL_LEN {len(upper_to_lower)}\n\n"""
  42. source += make_cap_table("caps_table", "LTU_LEN", lower_to_upper)
  43. source += make_cap_table("reverse_caps_table", "UTL_LEN", upper_to_lower)
  44. source += """static int _find_upper(int ch) {
  45. \tint low = 0;
  46. \tint high = LTU_LEN - 1;
  47. \tint middle;
  48. \twhile (low <= high) {
  49. \t\tmiddle = (low + high) / 2;
  50. \t\tif (ch < caps_table[middle][0]) {
  51. \t\t\thigh = middle - 1; // Search low end of array.
  52. \t\t} else if (caps_table[middle][0] < ch) {
  53. \t\t\tlow = middle + 1; // Search high end of array.
  54. \t\t} else {
  55. \t\t\treturn caps_table[middle][1];
  56. \t\t}
  57. \t}
  58. \treturn ch;
  59. }
  60. static int _find_lower(int ch) {
  61. \tint low = 0;
  62. \tint high = UTL_LEN - 1;
  63. \tint middle;
  64. \twhile (low <= high) {
  65. \t\tmiddle = (low + high) / 2;
  66. \t\tif (ch < reverse_caps_table[middle][0]) {
  67. \t\t\thigh = middle - 1; // Search low end of array.
  68. \t\t} else if (reverse_caps_table[middle][0] < ch) {
  69. \t\t\tlow = middle + 1; // Search high end of array.
  70. \t\t} else {
  71. \t\t\treturn reverse_caps_table[middle][1];
  72. \t\t}
  73. \t}
  74. \treturn ch;
  75. }
  76. """
  77. ucaps_path: str = os.path.join(os.path.dirname(__file__), "../../core/string/ucaps.h")
  78. with open(ucaps_path, "w", newline="\n") as f:
  79. f.write(source)
  80. print("`ucaps.h` generated successfully.")
# Regenerate `ucaps.h` only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    generate_ucaps_fetch()