  1. #!/usr/bin/env python
  2. #
  3. # Copyright 2011-2018 The Rust Project Developers. See the COPYRIGHT
  4. # file at the top-level directory of this distribution and at
  5. # http://rust-lang.org/COPYRIGHT.
  6. #
  7. # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
  8. # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
  9. # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
  10. # option. This file may not be copied, modified, or distributed
  11. # except according to those terms.
  12. # This script uses the following Unicode tables:
  13. # - DerivedNormalizationProps.txt
  14. # - NormalizationTest.txt
  15. # - UnicodeData.txt
  16. # - StandardizedVariants.txt
  17. #
  18. # Since this should not require frequent updates, we just store this
  19. # out-of-line and check the tables.rs and normalization_tests.rs files into git.
import collections
import sys
import urllib.request
# Version of the Unicode Character Database whose data files are fetched;
# bump this (and regenerate) when updating to a new Unicode release.
UNICODE_VERSION = "13.0.0"
# Base URL of the published UCD data files for that version.
UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION
# License/provenance boilerplate written verbatim at the top of every
# generated .rs file (tables.rs and normalization_tests.rs).
PREAMBLE = """// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
#![allow(missing_docs)]
"""
  36. NormalizationTest = collections.namedtuple(
  37. "NormalizationTest",
  38. ["source", "nfc", "nfd", "nfkc", "nfkd"],
  39. )
  40. # Mapping taken from Table 12 from:
  41. # http://www.unicode.org/reports/tr44/#General_Category_Values
  42. expanded_categories = {
  43. 'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
  44. 'Lm': ['L'], 'Lo': ['L'],
  45. 'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
  46. 'Nd': ['N'], 'Nl': ['N'], 'No': ['No'],
  47. 'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
  48. 'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
  49. 'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
  50. 'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
  51. 'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
  52. }
# Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
# S_BASE: first precomposed Hangul syllable (U+AC00); L/V/T counts are the
# numbers of leading consonants, vowels, and trailing consonants.
S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
# Total number of precomposed Hangul syllables (U+AC00..U+D7A3).
S_COUNT = L_COUNT * V_COUNT * T_COUNT
class UnicodeData(object):
    """Downloads and parses the UCD files, then derives the lookup tables
    (combining classes, decompositions, compositions, quick-check props,
    stream-safe counts) that the gen_* functions below emit as Rust code.
    """

    def __init__(self):
        self._load_unicode_data()
        self.norm_props = self._load_norm_props()
        self.norm_tests = self._load_norm_tests()
        self.canon_comp = self._compute_canonical_comp()
        self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()

        # Filled in by _load_cjk_compat_ideograph_variants below.
        self.cjk_compat_variants_fully_decomp = {}
        self._load_cjk_compat_ideograph_variants()

        def stats(name, table):
            # Report how many chars a table maps and the total decomposed length.
            count = sum(len(v) for v in table.values())
            print("%s: %d chars => %d decomposed chars" % (name, len(table), count))

        print("Decomposition table stats:")
        stats("Canonical decomp", self.canon_decomp)
        stats("Compatible decomp", self.compat_decomp)
        stats("Canonical fully decomp", self.canon_fully_decomp)
        stats("Compatible fully decomp", self.compat_fully_decomp)
        stats("CJK Compat Variants fully decomp", self.cjk_compat_variants_fully_decomp)

        self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()

    def _fetch(self, filename):
        # Download one UCD data file and return its contents as text.
        resp = urllib.request.urlopen(UCD_URL + filename)
        return resp.read().decode('utf-8')

    def _load_unicode_data(self):
        """Parse UnicodeData.txt into:
        - name_to_char_int: codepoint name -> int codepoint
        - combining_classes: codepoint -> canonical combining class (as string)
        - canon_decomp / compat_decomp: codepoint -> list of decomposed codepoints
        - general_category_mark: codepoints with General_Category in group M
        - general_category_public_assigned: (first, last) ranges of assigned,
          non-private-use, non-surrogate codepoints
        """
        self.name_to_char_int = {}
        self.combining_classes = {}
        self.compat_decomp = {}
        self.canon_decomp = {}
        self.general_category_mark = []
        self.general_category_public_assigned = []

        # State for merging consecutive assigned codepoints into ranges.
        assigned_start = 0;
        prev_char_int = -1;
        prev_name = "";

        for line in self._fetch("UnicodeData.txt").splitlines():
            # See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
            pieces = line.split(';')
            assert len(pieces) == 15
            char, name, category, cc, decomp = pieces[0], pieces[1], pieces[2], pieces[3], pieces[5]
            char_int = int(char, 16)

            name = pieces[1].strip()
            self.name_to_char_int[name] = char_int

            if cc != '0':
                self.combining_classes[char_int] = cc

            # A decomposition starting with '<' is a compatibility decomposition
            # tagged with its formatting kind (e.g. "<super> 0031").
            if decomp.startswith('<'):
                self.compat_decomp[char_int] = [int(c, 16) for c in decomp.split()[1:]]
            elif decomp != '':
                self.canon_decomp[char_int] = [int(c, 16) for c in decomp.split()]

            if category == 'M' or 'M' in expanded_categories.get(category, []):
                self.general_category_mark.append(char_int)

            assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
            # Skip private-use (Co) and surrogate (Cs) codepoints; everything
            # else is "public assigned". A gap in codepoints ends the current
            # range — unless it is a "<..., First>"/"<..., Last>" pair, which
            # denotes one contiguous range listed as two lines.
            if category not in ['Co', 'Cs']:
                if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name):
                    self.general_category_public_assigned.append((assigned_start, prev_char_int))
                    assigned_start = char_int
                prev_char_int = char_int
                prev_name = name;

        # Flush the final open range.
        self.general_category_public_assigned.append((assigned_start, prev_char_int))

    def _load_cjk_compat_ideograph_variants(self):
        """Parse StandardizedVariants.txt and record, for each CJK
        Compatibility Ideograph, the variation sequence that round-trips it
        through normalization (codepoint -> [base char, variation selector]).
        """
        for line in self._fetch("StandardizedVariants.txt").splitlines():
            strip_comments = line.split('#', 1)[0].strip()
            if not strip_comments:
                continue
            variation_sequence, description, differences = strip_comments.split(';')
            description = description.strip()
            # Don't use variations that only apply in particular shaping environments.
            if differences:
                continue
            # Look for entries where the description field is a codepoint name.
            if description not in self.name_to_char_int:
                continue
            # Only consider the CJK Compatibility Ideographs.
            if not description.startswith('CJK COMPATIBILITY IDEOGRAPH-'):
                continue
            char_int = self.name_to_char_int[description]
            assert not char_int in self.combining_classes, "Unexpected: CJK compat variant with a combining class"
            assert not char_int in self.compat_decomp, "Unexpected: CJK compat variant and compatibility decomposition"
            assert len(self.canon_decomp[char_int]) == 1, "Unexpected: CJK compat variant and non-singleton canonical decomposition"
            # If we ever need to handle Hangul here, we'll need to handle it separately.
            assert not (S_BASE <= char_int < S_BASE + S_COUNT)
            cjk_compat_variant_parts = [int(c, 16) for c in variation_sequence.split()]
            for c in cjk_compat_variant_parts:
                assert not c in self.canon_decomp, "Unexpected: CJK compat variant is unnormalized (canon)"
                assert not c in self.compat_decomp, "Unexpected: CJK compat variant is unnormalized (compat)"
            self.cjk_compat_variants_fully_decomp[char_int] = cjk_compat_variant_parts

    def _load_norm_props(self):
        """Parse DerivedNormalizationProps.txt into a dict mapping each
        property name to a list of (low, high, data) hex-string tuples,
        where high is "" for single-codepoint entries and data is the
        optional third field (e.g. the quick-check value) or None.
        """
        props = collections.defaultdict(list)

        for line in self._fetch("DerivedNormalizationProps.txt").splitlines():
            (prop_data, _, _) = line.partition("#")
            prop_pieces = prop_data.split(";")

            if len(prop_pieces) < 2:
                continue

            assert len(prop_pieces) <= 3
            (low, _, high) = prop_pieces[0].strip().partition("..")

            prop = prop_pieces[1].strip()

            data = None
            if len(prop_pieces) == 3:
                data = prop_pieces[2].strip()

            props[prop].append((low, high, data))

        return props

    def _load_norm_tests(self):
        """Parse NormalizationTest.txt into a list of NormalizationTest
        tuples; each column becomes a list of hex codepoint strings.
        """
        tests = []
        for line in self._fetch("NormalizationTest.txt").splitlines():
            (test_data, _, _) = line.partition("#")
            test_pieces = test_data.split(";")

            if len(test_pieces) < 5:
                continue

            source, nfc, nfd, nfkc, nfkd = [[c.strip() for c in p.split()] for p in test_pieces[:5]]
            tests.append(NormalizationTest(source, nfc, nfd, nfkc, nfkd))

        return tests

    def _compute_canonical_comp(self):
        """Invert the (non-excluded) canonical decompositions into a
        composition map: (first, second) codepoint pair -> composed codepoint.
        """
        canon_comp = {}
        comp_exclusions = [
            (int(low, 16), int(high or low, 16))
            for low, high, _ in self.norm_props["Full_Composition_Exclusion"]
        ]
        for char_int, decomp in self.canon_decomp.items():
            if any(lo <= char_int <= hi for lo, hi in comp_exclusions):
                continue
            # Non-excluded canonical decompositions are always pairs.
            assert len(decomp) == 2
            assert (decomp[0], decomp[1]) not in canon_comp
            canon_comp[(decomp[0], decomp[1])] = char_int

        return canon_comp

    def _compute_fully_decomposed(self):
        """
        Even though the decomposition algorithm is recursive, it is possible
        to precompute the recursion at table generation time with modest
        increase to the table size. Then, for these precomputed tables, we
        note that 1) compatible decomposition is a subset of canonical
        decomposition and 2) they mostly agree on their intersection.
        Therefore, we don't store entries in the compatible table for
        characters that decompose the same way under canonical decomposition.

        Decomposition table stats:
        Canonical decomp: 2060 chars => 3085 decomposed chars
        Compatible decomp: 3662 chars => 5440 decomposed chars
        Canonical fully decomp: 2060 chars => 3404 decomposed chars
        Compatible fully decomp: 3678 chars => 5599 decomposed chars

        The upshot is that decomposition code is very simple and easy to inline
        at mild code size cost.
        """
        def _decompose(char_int, compatible):
            # Recursively yield the full (flattened) decomposition of char_int.
            # 7-bit ASCII never decomposes
            if char_int <= 0x7f:
                yield char_int
                return

            # Assert that we're handling Hangul separately.
            assert not (S_BASE <= char_int < S_BASE + S_COUNT)

            decomp = self.canon_decomp.get(char_int)
            if decomp is not None:
                for decomposed_ch in decomp:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            if compatible and char_int in self.compat_decomp:
                for decomposed_ch in self.compat_decomp[char_int]:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            yield char_int
            return

        end_codepoint = max(
            max(self.canon_decomp.keys()),
            max(self.compat_decomp.keys()),
        )

        canon_fully_decomp = {}
        compat_fully_decomp = {}

        for char_int in range(0, end_codepoint + 1):
            # Always skip Hangul, since it's more efficient to represent its
            # decomposition programmatically.
            if S_BASE <= char_int < S_BASE + S_COUNT:
                continue

            # Only store entries that actually change under decomposition.
            canon = list(_decompose(char_int, False))
            if not (len(canon) == 1 and canon[0] == char_int):
                canon_fully_decomp[char_int] = canon

            compat = list(_decompose(char_int, True))
            if not (len(compat) == 1 and compat[0] == char_int):
                compat_fully_decomp[char_int] = compat

        # Since canon_fully_decomp is a subset of compat_fully_decomp, we don't
        # need to store their overlap when they agree. When they don't agree,
        # store the decomposition in the compatibility table since we'll check
        # that first when normalizing to NFKD.
        assert set(canon_fully_decomp) <= set(compat_fully_decomp)

        for ch in set(canon_fully_decomp) & set(compat_fully_decomp):
            if canon_fully_decomp[ch] == compat_fully_decomp[ch]:
                del compat_fully_decomp[ch]

        return canon_fully_decomp, compat_fully_decomp

    def _compute_stream_safe_tables(self):
        """
        To make a text stream-safe with the Stream-Safe Text Process (UAX15-D4),
        we need to be able to know the number of contiguous non-starters *after*
        applying compatibility decomposition to each character.

        We can do this incrementally by computing the number of leading and
        trailing non-starters for each character's compatibility decomposition
        with the following rules:

        1) If a character is not affected by compatibility decomposition, look
           up its canonical combining class to find out if it's a non-starter.
        2) All Hangul characters are starters, even under decomposition.
        3) Otherwise, very few decomposing characters have a nonzero count
           of leading or trailing non-starters, so store these characters
           with their associated counts in a separate table.
        """
        leading_nonstarters = {}
        trailing_nonstarters = {}

        for c in set(self.canon_fully_decomp) | set(self.compat_fully_decomp):
            # Prefer the compatibility decomposition; fall back to canonical.
            decomposed = self.compat_fully_decomp.get(c) or self.canon_fully_decomp[c]

            # Count non-starters (chars with a nonzero combining class) from
            # the front...
            num_leading = 0
            for d in decomposed:
                if d not in self.combining_classes:
                    break
                num_leading += 1

            # ...and from the back of the decomposition.
            num_trailing = 0
            for d in reversed(decomposed):
                if d not in self.combining_classes:
                    break
                num_trailing += 1

            if num_leading > 0:
                leading_nonstarters[c] = num_leading
            if num_trailing > 0:
                trailing_nonstarters[c] = num_trailing

        return leading_nonstarters, trailing_nonstarters
  275. hexify = lambda c: '{:04X}'.format(c)
  276. # Test whether `first` and `last` are corresponding "<..., First>" and
  277. # "<..., Last>" markers.
  278. def is_first_and_last(first, last):
  279. if not first.startswith('<') or not first.endswith(', First>'):
  280. return False
  281. if not last.startswith('<') or not last.endswith(', Last>'):
  282. return False
  283. return first[1:-8] == last[1:-7]
  284. def gen_mph_data(name, d, kv_type, kv_callback):
  285. (salt, keys) = minimal_perfect_hash(d)
  286. out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper())
  287. for s in salt:
  288. out.write(" 0x{:x},\n".format(s))
  289. out.write("];\n")
  290. out.write("pub(crate) const {}_KV: &[{}] = &[\n".format(name.upper(), kv_type))
  291. for k in keys:
  292. out.write(" {},\n".format(kv_callback(k)))
  293. out.write("];\n\n")
  294. def gen_combining_class(combining_classes, out):
  295. gen_mph_data('canonical_combining_class', combining_classes, 'u32',
  296. lambda k: "0x{:X}".format(int(combining_classes[k]) | (k << 8)))
  297. def gen_composition_table(canon_comp, out):
  298. table = {}
  299. for (c1, c2), c3 in canon_comp.items():
  300. if c1 < 0x10000 and c2 < 0x10000:
  301. table[(c1 << 16) | c2] = c3
  302. (salt, keys) = minimal_perfect_hash(table)
  303. gen_mph_data('COMPOSITION_TABLE', table, '(u32, char)',
  304. lambda k: "(0x%s, '\\u{%s}')" % (hexify(k), hexify(table[k])))
  305. out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
  306. out.write(" match (c1, c2) {\n")
  307. for (c1, c2), c3 in sorted(canon_comp.items()):
  308. if c1 >= 0x10000 and c2 >= 0x10000:
  309. out.write(" ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))
  310. out.write(" _ => None,\n")
  311. out.write(" }\n")
  312. out.write("}\n")
  313. def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_decomp, out):
  314. tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (cjk_compat_variants_decomp, 'cjk_compat_variants')]
  315. for table, name in tables:
  316. gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
  317. lambda k: "(0x{:x}, &[{}])".format(k,
  318. ", ".join("'\\u{%s}'" % hexify(c) for c in table[k])))
  319. def gen_qc_match(prop_table, out):
  320. out.write(" match c {\n")
  321. for low, high, data in prop_table:
  322. assert data in ('N', 'M')
  323. result = "No" if data == 'N' else "Maybe"
  324. if high:
  325. out.write(r" '\u{%s}'...'\u{%s}' => %s," % (low, high, result))
  326. else:
  327. out.write(r" '\u{%s}' => %s," % (low, result))
  328. out.write("\n")
  329. out.write(" _ => Yes,\n")
  330. out.write(" }\n")
  331. def gen_nfc_qc(prop_tables, out):
  332. out.write("#[inline]\n")
  333. out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
  334. out.write("pub fn qc_nfc(c: char) -> IsNormalized {\n")
  335. gen_qc_match(prop_tables['NFC_QC'], out)
  336. out.write("}\n")
  337. def gen_nfkc_qc(prop_tables, out):
  338. out.write("#[inline]\n")
  339. out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
  340. out.write("pub fn qc_nfkc(c: char) -> IsNormalized {\n")
  341. gen_qc_match(prop_tables['NFKC_QC'], out)
  342. out.write("}\n")
  343. def gen_nfd_qc(prop_tables, out):
  344. out.write("#[inline]\n")
  345. out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
  346. out.write("pub fn qc_nfd(c: char) -> IsNormalized {\n")
  347. gen_qc_match(prop_tables['NFD_QC'], out)
  348. out.write("}\n")
  349. def gen_nfkd_qc(prop_tables, out):
  350. out.write("#[inline]\n")
  351. out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
  352. out.write("pub fn qc_nfkd(c: char) -> IsNormalized {\n")
  353. gen_qc_match(prop_tables['NFKD_QC'], out)
  354. out.write("}\n")
  355. def gen_combining_mark(general_category_mark, out):
  356. gen_mph_data('combining_mark', general_category_mark, 'u32',
  357. lambda k: '0x{:04x}'.format(k))
  358. def gen_public_assigned(general_category_public_assigned, out):
  359. # This could be done as a hash but the table is somewhat small.
  360. out.write("#[inline]\n")
  361. out.write("pub fn is_public_assigned(c: char) -> bool {\n")
  362. out.write(" match c {\n")
  363. start = True
  364. for first, last in general_category_public_assigned:
  365. if start:
  366. out.write(" ")
  367. start = False
  368. else:
  369. out.write(" | ")
  370. if first == last:
  371. out.write("'\\u{%s}'\n" % hexify(first))
  372. else:
  373. out.write("'\\u{%s}'..='\\u{%s}'\n" % (hexify(first), hexify(last)))
  374. out.write(" => true,\n")
  375. out.write(" _ => false,\n")
  376. out.write(" }\n")
  377. out.write("}\n")
  378. out.write("\n")
  379. def gen_stream_safe(leading, trailing, out):
  380. # This could be done as a hash but the table is very small.
  381. out.write("#[inline]\n")
  382. out.write("pub fn stream_safe_leading_nonstarters(c: char) -> usize {\n")
  383. out.write(" match c {\n")
  384. for char, num_leading in sorted(leading.items()):
  385. out.write(" '\\u{%s}' => %d,\n" % (hexify(char), num_leading))
  386. out.write(" _ => 0,\n")
  387. out.write(" }\n")
  388. out.write("}\n")
  389. out.write("\n")
  390. gen_mph_data('trailing_nonstarters', trailing, 'u32',
  391. lambda k: "0x{:X}".format(int(trailing[k]) | (k << 8)))
  392. def gen_tests(tests, out):
  393. out.write("""#[derive(Debug)]
  394. pub struct NormalizationTest {
  395. pub source: &'static str,
  396. pub nfc: &'static str,
  397. pub nfd: &'static str,
  398. pub nfkc: &'static str,
  399. pub nfkd: &'static str,
  400. }
  401. """)
  402. out.write("pub const NORMALIZATION_TESTS: &[NormalizationTest] = &[\n")
  403. str_literal = lambda s: '"%s"' % "".join("\\u{%s}" % c for c in s)
  404. for test in tests:
  405. out.write(" NormalizationTest {\n")
  406. out.write(" source: %s,\n" % str_literal(test.source))
  407. out.write(" nfc: %s,\n" % str_literal(test.nfc))
  408. out.write(" nfd: %s,\n" % str_literal(test.nfd))
  409. out.write(" nfkc: %s,\n" % str_literal(test.nfkc))
  410. out.write(" nfkd: %s,\n" % str_literal(test.nfkd))
  411. out.write(" },\n")
  412. out.write("];\n")
  413. # Guaranteed to be less than n.
  414. def my_hash(x, salt, n):
  415. # This is hash based on the theory that multiplication is efficient
  416. mask_32 = 0xffffffff
  417. y = ((x + salt) * 2654435769) & mask_32
  418. y ^= (x * 0x31415926) & mask_32
  419. return (y * n) >> 32
  420. # Compute minimal perfect hash function, d can be either a dict or list of keys.
  421. def minimal_perfect_hash(d):
  422. n = len(d)
  423. buckets = dict((h, []) for h in range(n))
  424. for key in d:
  425. h = my_hash(key, 0, n)
  426. buckets[h].append(key)
  427. bsorted = [(len(buckets[h]), h) for h in range(n)]
  428. bsorted.sort(reverse = True)
  429. claimed = [False] * n
  430. salts = [0] * n
  431. keys = [0] * n
  432. for (bucket_size, h) in bsorted:
  433. # Note: the traditional perfect hashing approach would also special-case
  434. # bucket_size == 1 here and assign any empty slot, rather than iterating
  435. # until rehash finds an empty slot. But we're not doing that so we can
  436. # avoid the branch.
  437. if bucket_size == 0:
  438. break
  439. else:
  440. for salt in range(1, 32768):
  441. rehashes = [my_hash(key, salt, n) for key in buckets[h]]
  442. # Make sure there are no rehash collisions within this bucket.
  443. if all(not claimed[hash] for hash in rehashes):
  444. if len(set(rehashes)) < bucket_size:
  445. continue
  446. salts[h] = salt
  447. for key in buckets[h]:
  448. rehash = my_hash(key, salt, n)
  449. claimed[rehash] = True
  450. keys[rehash] = key
  451. break
  452. if salts[h] == 0:
  453. print("minimal perfect hashing failed")
  454. # Note: if this happens (because of unfortunate data), then there are
  455. # a few things that could be done. First, the hash function could be
  456. # tweaked. Second, the bucket order could be scrambled (especially the
  457. # singletons). Right now, the buckets are sorted, which has the advantage
  458. # of being deterministic.
  459. #
  460. # As a more extreme approach, the singleton bucket optimization could be
  461. # applied (give the direct address for singleton buckets, rather than
  462. # relying on a rehash). That is definitely the more standard approach in
  463. # the minimal perfect hashing literature, but in testing the branch was a
  464. # significant slowdown.
  465. exit(1)
  466. return (salts, keys)
if __name__ == '__main__':
    # Fetch and derive all tables (network access happens here).
    data = UnicodeData()

    # Write the main lookup tables. NOTE: `out` is deliberately a module-level
    # binding here, because gen_mph_data writes to it as a global.
    with open("tables.rs", "w", newline = "\n") as out:
        out.write(PREAMBLE)

        out.write("use crate::quick_check::IsNormalized;\n")
        out.write("use crate::quick_check::IsNormalized::*;\n")
        out.write("\n")

        # Emit the Unicode version as a (major, minor, micro) tuple constant.
        version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))
        out.write("#[allow(unused)]\n")
        out.write("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n\n" % version)

        gen_combining_class(data.combining_classes, out)
        out.write("\n")

        gen_composition_table(data.canon_comp, out)
        out.write("\n")

        gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.cjk_compat_variants_fully_decomp, out)

        gen_combining_mark(data.general_category_mark, out)
        out.write("\n")

        gen_public_assigned(data.general_category_public_assigned, out)
        out.write("\n")

        gen_nfc_qc(data.norm_props, out)
        out.write("\n")

        gen_nfkc_qc(data.norm_props, out)
        out.write("\n")

        gen_nfd_qc(data.norm_props, out)
        out.write("\n")

        gen_nfkd_qc(data.norm_props, out)
        out.write("\n")

        gen_stream_safe(data.ss_leading, data.ss_trailing, out)
        out.write("\n")

    # Write the conformance-test fixtures from NormalizationTest.txt.
    with open("normalization_tests.rs", "w", newline = "\n") as out:
        out.write(PREAMBLE)
        gen_tests(data.norm_tests, out)