ExtractCJKCharacters.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148
  1. #!/usr/bin/env python3
  2. """
  3. Extract unique CJK characters from Japanese and Chinese resource files.
  4. This creates a minimal character set for SpriteFont generation.
  5. """
  6. import xml.etree.ElementTree as ET
  7. import sys
  8. from pathlib import Path
  9. def extract_characters_from_resx(file_path):
  10. """Extract all text values from a .resx file and return unique characters."""
  11. tree = ET.parse(file_path)
  12. root = tree.getroot()
  13. all_text = []
  14. # Find all <data> elements and extract their <value> content
  15. for data in root.findall(".//data"):
  16. value = data.find("value")
  17. if value is not None and value.text:
  18. all_text.append(value.text)
  19. # Combine all text and get unique characters
  20. combined_text = ''.join(all_text)
  21. unique_chars = sorted(set(combined_text))
  22. return unique_chars
  23. def is_cjk_character(char):
  24. """Check if a character is CJK (Chinese/Japanese/Korean)."""
  25. code_point = ord(char)
  26. # CJK Unified Ideographs
  27. if 0x4E00 <= code_point <= 0x9FFF:
  28. return True
  29. # Hiragana
  30. if 0x3040 <= code_point <= 0x309F:
  31. return True
  32. # Katakana
  33. if 0x30A0 <= code_point <= 0x30FF:
  34. return True
  35. # Katakana Phonetic Extensions
  36. if 0x31F0 <= code_point <= 0x31FF:
  37. return True
  38. # CJK Symbols and Punctuation
  39. if 0x3000 <= code_point <= 0x303F:
  40. return True
  41. return False
  42. def generate_character_region_xml(chars):
  43. """Generate XML character region entries for MonoGame .spritefont files."""
  44. regions = []
  45. current_start = None
  46. current_end = None
  47. for char in chars:
  48. code_point = ord(char)
  49. if current_start is None:
  50. current_start = code_point
  51. current_end = code_point
  52. elif code_point == current_end + 1:
  53. current_end = code_point
  54. else:
  55. # End current region and start new one
  56. regions.append(f" <Start>&#x{current_start:04X};</Start>\n <End>&#x{current_end:04X};</End>")
  57. current_start = code_point
  58. current_end = code_point
  59. # Add final region
  60. if current_start is not None:
  61. regions.append(f" <Start>&#x{current_start:04X};</Start>\n <End>&#x{current_end:04X};</End>")
  62. return '\n'.join(regions)
  63. def main():
  64. # Get the script directory
  65. script_dir = Path(__file__).parent
  66. resources_dir = script_dir.parent / "Core" / "Game"
  67. # Process Japanese
  68. ja_file = resources_dir / "Resources.ja.resx"
  69. if ja_file.exists():
  70. print(f"Processing {ja_file}...")
  71. ja_chars = extract_characters_from_resx(ja_file)
  72. ja_cjk_chars = [c for c in ja_chars if is_cjk_character(c)]
  73. print(f"\nJapanese Statistics:")
  74. print(f" Total unique characters: {len(ja_chars)}")
  75. print(f" CJK characters: {len(ja_cjk_chars)}")
  76. print(f" ASCII/Latin: {len(ja_chars) - len(ja_cjk_chars)}")
  77. # Write Japanese character list
  78. ja_output = script_dir / "japanese_characters.txt"
  79. with open(ja_output, 'w', encoding='utf-8') as f:
  80. f.write(''.join(ja_cjk_chars))
  81. print(f" Saved to: {ja_output}")
  82. # Generate XML for .spritefont
  83. ja_xml_output = script_dir / "japanese_character_regions.xml"
  84. with open(ja_xml_output, 'w', encoding='utf-8') as f:
  85. f.write(generate_character_region_xml(ja_cjk_chars))
  86. print(f" XML regions saved to: {ja_xml_output}")
  87. # Process Chinese
  88. zh_file = resources_dir / "Resources.zh.resx"
  89. if zh_file.exists():
  90. print(f"\nProcessing {zh_file}...")
  91. zh_chars = extract_characters_from_resx(zh_file)
  92. zh_cjk_chars = [c for c in zh_chars if is_cjk_character(c)]
  93. print(f"\nChinese Statistics:")
  94. print(f" Total unique characters: {len(zh_chars)}")
  95. print(f" CJK characters: {len(zh_cjk_chars)}")
  96. print(f" ASCII/Latin: {len(zh_chars) - len(zh_cjk_chars)}")
  97. # Write Chinese character list
  98. zh_output = script_dir / "chinese_characters.txt"
  99. with open(zh_output, 'w', encoding='utf-8') as f:
  100. f.write(''.join(zh_cjk_chars))
  101. print(f" Saved to: {zh_output}")
  102. # Generate XML for .spritefont
  103. zh_xml_output = script_dir / "chinese_character_regions.xml"
  104. with open(zh_xml_output, 'w', encoding='utf-8') as f:
  105. f.write(generate_character_region_xml(zh_cjk_chars))
  106. print(f" XML regions saved to: {zh_xml_output}")
  107. # Combine both for unified CJK font
  108. if ja_file.exists() and zh_file.exists():
  109. print(f"\nCombining Japanese + Chinese...")
  110. combined_cjk = sorted(set(ja_cjk_chars + zh_cjk_chars))
  111. print(f"\nCombined CJK Statistics:")
  112. print(f" Total unique CJK characters: {len(combined_cjk)}")
  113. # Write combined character list
  114. combined_output = script_dir / "cjk_characters.txt"
  115. with open(combined_output, 'w', encoding='utf-8') as f:
  116. f.write(''.join(combined_cjk))
  117. print(f" Saved to: {combined_output}")
  118. # Generate XML for .spritefont
  119. combined_xml_output = script_dir / "cjk_character_regions.xml"
  120. with open(combined_xml_output, 'w', encoding='utf-8') as f:
  121. f.write(generate_character_region_xml(combined_cjk))
  122. print(f" XML regions saved to: {combined_xml_output}")
# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()