123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120 |
- #!/usr/bin/env python3
- import os
- import sys
- import textwrap
# Directory containing this script; the Unicode data file is looked up
# relative to it.
self_path = os.path.dirname(os.path.realpath(__file__))

# Statuses kept for "full" case folding (per the CaseFolding.txt header,
# C = common, F = full).
status_list = ["C", "F"]

# folding_list[n-1] maps a codepoint to its folding when that folding consists
# of exactly n codepoints (n = 1, 2 or 3).
folding_list = [dict(), dict(), dict()]

# Filter the foldings for "full" folding.
#
# The file is opened explicitly as UTF-8 (the encoding of the Unicode data
# files) so the result does not depend on the locale, and via `with` so the
# handle is released even if a malformed line raises.
with open(os.path.join(self_path, "unicode", "CaseFolding.txt"), "r",
          encoding="utf-8") as f:
    for line in f:
        # Drop the trailing comment, then skip lines that are empty.
        comment_off = line.find("#")
        if comment_off >= 0:
            line = line[:comment_off]
        line = line.strip()
        if not line:
            continue

        # Data lines have the shape "<code>; <status>; <mapping>; ".
        raw_codepoint, status, raw_mapping, ignored_tail = line.split(";", 3)
        if status.strip() not in status_list:
            continue

        codepoint = int(raw_codepoint.strip(), 16)
        mapping = [int(it, 16) for it in raw_mapping.strip().split(" ")]
        mapping_len = len(mapping)

        # Raise instead of `assert` so the check survives `python -O`.
        if not 1 <= mapping_len <= 3:
            raise ValueError(
                "unsupported folding length {} for codepoint 0x{:04x}".format(
                    mapping_len, codepoint))
        folding_list[mapping_len - 1][codepoint] = mapping
def is_range_compatible(folding, codepoint_list, index0, index):
    """Check whether the entry at ``index`` can extend the range at ``index0``.

    Assuming that entries ``index0 .. index-1`` of ``codepoint_list`` already
    form a range (as defined below), verify that the newly provided ``index``
    is compatible with that range too, i.e. that the range can be extended
    without breaking its properties.

    Two kinds of ranges are handled:

    (1) a sequence of consecutive codepoints which is mapped to another
        sequence of consecutive codepoints (of the same length);

    (2) a sequence of codepoints with step 2 where each codepoint CP is
        mapped to the codepoint CP+1
        (e.g. 0x1234 -> 0x1235; 0x1236 -> 0x1237; 0x1238 -> 0x1239; ...).

    Note: when the codepoints in the range are mapped to multiple codepoints,
    only the 1st mapped codepoint is considered. All the other ones have to
    be shared by all the mappings covered by the range.

    Only the entries at ``index0``, ``index0 + 1`` and ``index`` are
    inspected; ``codepoint_list[index0 + 1]`` is read unconditionally.

    Args:
        folding: Mapping from a codepoint to the list of codepoints it folds to.
        codepoint_list: Sequence of the codepoints (keys of ``folding``).
        index0: Index in ``codepoint_list`` of the first entry of the range.
        index: Index in ``codepoint_list`` of the candidate entry.

    Returns:
        True if the candidate fits a range of type (1) or (2), else False.
    """
    span = index - index0

    first_cp = codepoint_list[index0]
    second_cp = codepoint_list[index0 + 1]
    candidate_cp = codepoint_list[index]

    first_map = folding[first_cp]
    second_map = folding[second_cp]
    candidate_map = folding[candidate_cp]

    stride = second_cp - first_cp
    offset = candidate_cp - first_cp

    # Range type (1): consecutive sources mapped onto consecutive targets.
    if stride == 1 and offset == span:
        targets_consecutive = (
            second_map[0] - first_map[0] == 1
            and second_map[1:] == first_map[1:]
            and candidate_map[0] - first_map[0] == span
            and candidate_map[1:] == first_map[1:]
        )
        if targets_consecutive:
            return True

    # Range type (2): every other codepoint, each mapped to its successor.
    if stride == 2 and offset == 2 * span:
        targets_are_successors = (
            first_map[0] - first_cp == 1
            and second_map[0] - second_cp == 1
            and second_map[1:] == first_map[1:]
            and candidate_map[0] - candidate_cp == 1
            and candidate_map[1:] == first_map[1:]
        )
        if targets_are_successors:
            return True

    return False
def mapping_str(list, mapping):
    """Format a folding as comma-separated hexadecimal literals.

    Args:
        list: Not used by this function.
        mapping: Iterable of integer codepoints.

    Returns:
        A string such as ``"0x0073,0x0073"``; each value is rendered in
        lowercase hex, zero-padded to at least four digits, with no spaces.
    """
    hex_literals = ["0x" + format(value, "04x") for value in mapping]
    return ",".join(hex_literals)
# For each folding length (1, 2 and 3 codepoints) emit two C arrays on stdout:
#   FOLD_MAP_<n>       -- S(cp) records for single codepoints and R(first,last)
#                         records for compressed ranges of codepoints;
#   FOLD_MAP_<n>_DATA  -- the mappings: one entry per S record, two entries
#                         (mapping of the first and of the last codepoint) per
#                         R record.
for mapping_len in range(1, 4):
    folding = folding_list[mapping_len - 1]

    # is_range_compatible() only recognises ascending runs of codepoints, so
    # order them explicitly instead of relying on dict insertion order (and
    # thereby on the ordering of the input file).
    codepoint_list = sorted(folding)
    count = len(codepoint_list)

    records = []
    data_records = []

    index0 = 0
    while index0 < count:
        # Grow the half-open run [index0, index1) for as long as the next
        # entry still fits the range started at index0.
        index1 = index0 + 1
        while index1 < count and is_range_compatible(folding, codepoint_list, index0, index1):
            index1 += 1

        if index1 - index0 > 2:
            # Range of codepoints
            first_cp = codepoint_list[index0]
            last_cp = codepoint_list[index1 - 1]
            records.append("R(0x{:04x},0x{:04x})".format(first_cp, last_cp))
            data_records.append(mapping_str(data_records, folding[first_cp]))
            data_records.append(mapping_str(data_records, folding[last_cp]))
            index0 = index1
        else:
            # Single codepoint. A run of only two entries is not emitted as a
            # range: the first entry is written as a single and scanning
            # restarts at the second one.
            single_cp = codepoint_list[index0]
            records.append("S(0x{:04x})".format(single_cp))
            data_records.append(mapping_str(data_records, folding[single_cp]))
            index0 += 1

    # Both arrays share the same layout; only the name suffix and the
    # contents differ.
    for suffix, items in (("", records), ("_DATA", data_records)):
        sys.stdout.write("static const unsigned FOLD_MAP_{}{}[] = {{\n".format(mapping_len, suffix))
        sys.stdout.write("\n".join(textwrap.wrap(", ".join(items), 110,
                         initial_indent=" ", subsequent_indent=" ")))
        sys.stdout.write("\n};\n")
|