6 mesi fa · bb8ef4e052
--- a/core/string/char_range.inc
+++ b/core/string/char_range.inc
@@ -28,6 +28,8 @@
 
				 /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
			
 
				 /**************************************************************************/
			
 
				 
			
 
				+// This file was generated using the `misc/scripts/char_range_fetch.py` script.
			
 
				+
			
 
				 #ifndef CHAR_RANGE_INC
			
 
				 #define CHAR_RANGE_INC
			
 
				 
			
@@ -43,7 +45,7 @@ struct CharRange {
 
				 
			
 
				 constexpr inline CharRange xid_start[] = {
			
 
				 	{ 0x41, 0x5a },
			
 
				-	{ 0x5f, 0x5f }, // Underscore technically isn't in XID_Start, but for our purposes it's included.
			
 
				+	{ 0x5f, 0x5f },
			
 
				 	{ 0x61, 0x7a },
			
 
				 	{ 0xaa, 0xaa },
			
 
				 	{ 0xb5, 0xb5 },
			
--- a/misc/scripts/char_range_fetch.py
+++ b/misc/scripts/char_range_fetch.py
@@ -0,0 +1,138 @@
 
				+#!/usr/bin/env python3
			
 
				+
			
 
				+# Script used to dump char ranges for specific properties from
			
 
				+# the Unicode Character Database to the `char_range.inc` file.
			
 
				+# NOTE: This script is deliberately not integrated into the build system;
			
 
				+# you should run it manually whenever you want to update the data.
			
 
				+
			
 
				+import os
			
 
				+import sys
			
 
				+from typing import Final, List, Tuple
			
 
				+from urllib.request import urlopen
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    sys.path.insert(1, os.path.join(os.path.dirname(__file__), "../../"))
			
 
				+
			
 
				+from methods import generate_copyright_header
			
 
				+
			
 
				+URL: Final[str] = "https://www.unicode.org/Public/16.0.0/ucd/DerivedCoreProperties.txt"
			
 
				+
			
 
				+
			
 
				+xid_start: List[Tuple[int, int]] = []
			
 
				+xid_continue: List[Tuple[int, int]] = []
			
 
				+uppercase_letter: List[Tuple[int, int]] = []
			
 
				+lowercase_letter: List[Tuple[int, int]] = []
			
 
				+unicode_letter: List[Tuple[int, int]] = []
			
 
				+
			
 
				+
			
 
				+def merge_ranges(ranges: List[Tuple[int, int]]) -> None:
			
 
				+    if len(ranges) < 2:
			
 
				+        return
			
 
				+
			
 
				+    last_start: int = ranges[0][0]
			
 
				+    last_end: int = ranges[0][1]
			
 
				+    original_ranges: List[Tuple[int, int]] = ranges[1:]
			
 
				+
			
 
				+    ranges.clear()
			
 
				+
			
 
				+    for curr_range in original_ranges:
			
 
				+        curr_start: int = curr_range[0]
			
 
				+        curr_end: int = curr_range[1]
			
 
				+        if last_end + 1 != curr_start:
			
 
				+            ranges.append((last_start, last_end))
			
 
				+            last_start = curr_start
			
 
				+        last_end = curr_end
			
 
				+
			
 
				+    ranges.append((last_start, last_end))
			
 
				+
			
 
				+
			
 
				+def parse_unicode_data() -> None:
			
 
				+    lines: List[str] = [line.decode("utf-8") for line in urlopen(URL)]
			
 
				+
			
 
				+    for line in lines:
			
 
				+        if line.startswith("#") or not line.strip():
			
 
				+            continue
			
 
				+
			
 
				+        split_line: List[str] = line.split(";")
			
 
				+
			
 
				+        char_range: str = split_line[0].strip()
			
 
				+        char_property: str = split_line[1].strip().split("#")[0].strip()
			
 
				+
			
 
				+        range_start: str = char_range
			
 
				+        range_end: str = char_range
			
 
				+        if ".." in char_range:
			
 
				+            range_start, range_end = char_range.split("..")
			
 
				+
			
 
				+        range_tuple: Tuple[int, int] = (int(range_start, 16), int(range_end, 16))
			
 
				+
			
 
				+        if char_property == "XID_Start":
			
 
				+            xid_start.append(range_tuple)
			
 
				+        elif char_property == "XID_Continue":
			
 
				+            xid_continue.append(range_tuple)
			
 
				+        elif char_property == "Uppercase":
			
 
				+            uppercase_letter.append(range_tuple)
			
 
				+        elif char_property == "Lowercase":
			
 
				+            lowercase_letter.append(range_tuple)
			
 
				+        elif char_property == "Alphabetic":
			
 
				+            unicode_letter.append(range_tuple)
			
 
				+
			
 
				+    # Underscore technically isn't in XID_Start, but for our purposes it's included.
			
 
				+    xid_start.append((0x005F, 0x005F))
			
 
				+    xid_start.sort(key=lambda x: x[0])
			
 
				+
			
 
				+    merge_ranges(xid_start)
			
 
				+    merge_ranges(xid_continue)
			
 
				+    merge_ranges(uppercase_letter)
			
 
				+    merge_ranges(lowercase_letter)
			
 
				+    merge_ranges(unicode_letter)
			
 
				+
			
 
				+
			
 
				+def make_array(array_name: str, range_list: List[Tuple[int, int]]) -> str:
			
 
				+    result: str = f"constexpr inline CharRange {array_name}[] = {{\n"
			
 
				+
			
 
				+    for start, end in range_list:
			
 
				+        result += f"\t{{ 0x{start:x}, 0x{end:x} }},\n"
			
 
				+
			
 
				+    result += "};\n\n"
			
 
				+
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+def generate_char_range_inc() -> None:
			
 
				+    parse_unicode_data()
			
 
				+
			
 
				+    source: str = generate_copyright_header("char_range.inc")
			
 
				+
			
 
				+    source += f"""
			
 
				+// This file was generated using the `misc/scripts/char_range_fetch.py` script.
			
 
				+
			
 
				+#ifndef CHAR_RANGE_INC
			
 
				+#define CHAR_RANGE_INC
			
 
				+
			
 
				+#include "core/typedefs.h"
			
 
				+
			
 
				+// Unicode Derived Core Properties
			
 
				+// Source: {URL}
			
 
				+
			
 
				+struct CharRange {{
			
 
				+\tchar32_t start;
			
 
				+\tchar32_t end;
			
 
				+}};\n\n"""
			
 
				+
			
 
				+    source += make_array("xid_start", xid_start)
			
 
				+    source += make_array("xid_continue", xid_continue)
			
 
				+    source += make_array("uppercase_letter", uppercase_letter)
			
 
				+    source += make_array("lowercase_letter", lowercase_letter)
			
 
				+    source += make_array("unicode_letter", unicode_letter)
			
 
				+
			
 
				+    source += "#endif // CHAR_RANGE_INC\n"
			
 
				+
			
 
				+    char_range_path: str = os.path.join(os.path.dirname(__file__), "../../core/string/char_range.inc")
			
 
				+    with open(char_range_path, "w", newline="\n") as f:
			
 
				+        f.write(source)
			
 
				+
			
 
				+    print("`char_range.inc` generated successfully.")
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    generate_char_range_inc()