unicode.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321
  1. #!/usr/bin/env python3
  2. #
  3. # Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT
  4. # file at the top-level directory of this distribution and at
  5. # http://rust-lang.org/COPYRIGHT.
  6. #
  7. # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
  8. # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
  9. # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
  10. # option. This file may not be copied, modified, or distributed
  11. # except according to those terms.
  12. # This script uses the following Unicode tables:
  13. # - EastAsianWidth.txt
  14. # - ReadMe.txt
  15. # - UnicodeData.txt
  16. #
  17. # Since this should not require frequent updates, we just store this
  18. # out-of-line and check the unicode.rs file into git.
  19. import fileinput, re, os, sys, operator
# Rust source header written verbatim at the top of the generated tables.rs:
# license block plus a marker that the file is machine-generated.
preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
'''
  32. # Mapping taken from Table 12 from:
  33. # http://www.unicode.org/reports/tr44/#General_Category_Values
  34. expanded_categories = {
  35. 'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
  36. 'Lm': ['L'], 'Lo': ['L'],
  37. 'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
  38. 'Nd': ['N'], 'Nl': ['N'], 'No': ['No'],
  39. 'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
  40. 'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
  41. 'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
  42. 'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
  43. 'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
  44. }
# these are the surrogate codepoints, which are not valid rust characters
# (inclusive bounds U+D800..U+DFFF; consumed by is_surrogate below)
surrogate_codepoints = (0xd800, 0xdfff)
  47. def fetch(f):
  48. if not os.path.exists(os.path.basename(f)):
  49. os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
  50. % f)
  51. if not os.path.exists(os.path.basename(f)):
  52. sys.stderr.write("cannot load %s" % f)
  53. exit(1)
  54. def is_surrogate(n):
  55. return surrogate_codepoints[0] <= n <= surrogate_codepoints[1]
  56. def load_unicode_data(f):
  57. fetch(f)
  58. gencats = {}
  59. udict = {}
  60. range_start = -1
  61. for line in fileinput.input(f):
  62. data = line.split(';')
  63. if len(data) != 15:
  64. continue
  65. cp = int(data[0], 16)
  66. if is_surrogate(cp):
  67. continue
  68. if range_start >= 0:
  69. for i in range(range_start, cp):
  70. udict[i] = data
  71. range_start = -1
  72. if data[1].endswith(", First>"):
  73. range_start = cp
  74. continue
  75. udict[cp] = data
  76. for code in udict:
  77. [code_org, name, gencat, combine, bidi,
  78. decomp, deci, digit, num, mirror,
  79. old, iso, upcase, lowcase, titlecase ] = udict[code]
  80. # place letter in categories as appropriate
  81. for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []):
  82. if cat not in gencats:
  83. gencats[cat] = []
  84. gencats[cat].append(code)
  85. gencats = group_cats(gencats)
  86. return gencats
  87. def group_cats(cats):
  88. cats_out = {}
  89. for cat in cats:
  90. cats_out[cat] = group_cat(cats[cat])
  91. return cats_out
  92. def group_cat(cat):
  93. cat_out = []
  94. letters = sorted(set(cat))
  95. cur_start = letters.pop(0)
  96. cur_end = cur_start
  97. for letter in letters:
  98. assert letter > cur_end, \
  99. "cur_end: %s, letter: %s" % (hex(cur_end), hex(letter))
  100. if letter == cur_end + 1:
  101. cur_end = letter
  102. else:
  103. cat_out.append((cur_start, cur_end))
  104. cur_start = cur_end = letter
  105. cat_out.append((cur_start, cur_end))
  106. return cat_out
  107. def format_table_content(f, content, indent):
  108. line = " "*indent
  109. first = True
  110. for chunk in content.split(","):
  111. if len(line) + len(chunk) < 98:
  112. if first:
  113. line += chunk
  114. else:
  115. line += ", " + chunk
  116. first = False
  117. else:
  118. f.write(line + ",\n")
  119. line = " "*indent + chunk
  120. f.write(line)
  121. # load all widths of want_widths, except those in except_cats
  122. def load_east_asian_width(want_widths, except_cats):
  123. f = "EastAsianWidth.txt"
  124. fetch(f)
  125. widths = {}
  126. re1 = re.compile("^([0-9A-F]+);(\w+) +# (\w+)")
  127. re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+);(\w+) +# (\w+)")
  128. for line in fileinput.input(f):
  129. width = None
  130. d_lo = 0
  131. d_hi = 0
  132. cat = None
  133. m = re1.match(line)
  134. if m:
  135. d_lo = m.group(1)
  136. d_hi = m.group(1)
  137. width = m.group(2)
  138. cat = m.group(3)
  139. else:
  140. m = re2.match(line)
  141. if m:
  142. d_lo = m.group(1)
  143. d_hi = m.group(2)
  144. width = m.group(3)
  145. cat = m.group(4)
  146. else:
  147. continue
  148. if cat in except_cats or width not in want_widths:
  149. continue
  150. d_lo = int(d_lo, 16)
  151. d_hi = int(d_hi, 16)
  152. if width not in widths:
  153. widths[width] = []
  154. widths[width].append((d_lo, d_hi))
  155. return widths
  156. def escape_char(c):
  157. return "'\\u{%x}'" % c
  158. def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
  159. pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
  160. pub_string = "const"
  161. if not is_const:
  162. pub_string = "let"
  163. if is_pub:
  164. pub_string = "pub " + pub_string
  165. f.write(" %s %s: %s = &[\n" % (pub_string, name, t_type))
  166. data = ""
  167. first = True
  168. for dat in t_data:
  169. if not first:
  170. data += ","
  171. first = False
  172. data += pfun(dat)
  173. format_table_content(f, data, 8)
  174. f.write("\n ];\n\n")
def emit_charwidth_module(f, width_table):
    """Write the Rust `charwidth` module to `f`.

    Emits a binary-search helper, the public `width()` function, and the
    private `charwidth_table` built from `width_table`, whose entries are
    (lo, hi, width, width_cjk) tuples.
    """
    # NOTE(review): the exact leading whitespace inside the emitted Rust
    # strings only affects the formatting of the generated tables.rs,
    # not its meaning — confirm against the checked-in tables.rs.
    f.write("pub mod charwidth {")
    f.write("""
    use core::option::Option::{self, Some, None};
    use core::result::Result::{Ok, Err};

    #[inline]
    fn bsearch_range_value_table(c: char, is_cjk: bool, r: &'static [(char, char, u8, u8)]) -> u8 {
        use core::cmp::Ordering::{Equal, Less, Greater};
        match r.binary_search_by(|&(lo, hi, _, _)| {
            if lo <= c && c <= hi { Equal }
            else if hi < c { Less }
            else { Greater }
        }) {
            Ok(idx) => {
                let (_, _, r_ncjk, r_cjk) = r[idx];
                if is_cjk { r_cjk } else { r_ncjk }
            }
            Err(_) => 1
        }
    }
""")
    f.write("""
    #[inline]
    pub fn width(c: char, is_cjk: bool) -> Option<usize> {
        match c as usize {
            _c @ 0 => Some(0),          // null is zero width
            cu if cu < 0x20 => None,    // control sequences have no width
            cu if cu < 0x7F => Some(1), // ASCII
            cu if cu < 0xA0 => None,    // more control sequences
            _ => Some(bsearch_range_value_table(c, is_cjk, charwidth_table) as usize)
        }
    }
""")
    f.write("    // character width table. Based on Markus Kuhn's free wcwidth() implementation,\n")
    f.write("    // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n")
    emit_table(f, "charwidth_table", width_table, "&'static [(char, char, u8, u8)]", is_pub=False,
            pfun=lambda x: "(%s,%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2], x[3]))
    f.write("}\n\n")
  213. def remove_from_wtable(wtable, val):
  214. wtable_out = []
  215. while wtable:
  216. if wtable[0][1] < val:
  217. wtable_out.append(wtable.pop(0))
  218. elif wtable[0][0] > val:
  219. break
  220. else:
  221. (wt_lo, wt_hi, width, width_cjk) = wtable.pop(0)
  222. if wt_lo == wt_hi == val:
  223. continue
  224. elif wt_lo == val:
  225. wtable_out.append((wt_lo+1, wt_hi, width, width_cjk))
  226. elif wt_hi == val:
  227. wtable_out.append((wt_lo, wt_hi-1, width, width_cjk))
  228. else:
  229. wtable_out.append((wt_lo, val-1, width, width_cjk))
  230. wtable_out.append((val+1, wt_hi, width, width_cjk))
  231. if wtable:
  232. wtable_out.extend(wtable)
  233. return wtable_out
  234. def optimize_width_table(wtable):
  235. wtable_out = []
  236. w_this = wtable.pop(0)
  237. while wtable:
  238. if w_this[1] == wtable[0][0] - 1 and w_this[2:3] == wtable[0][2:3]:
  239. w_tmp = wtable.pop(0)
  240. w_this = (w_this[0], w_tmp[1], w_tmp[2], w_tmp[3])
  241. else:
  242. wtable_out.append(w_this)
  243. w_this = wtable.pop(0)
  244. wtable_out.append(w_this)
  245. return wtable_out
  246. if __name__ == "__main__":
  247. r = "tables.rs"
  248. if os.path.exists(r):
  249. os.remove(r)
  250. with open(r, "w") as rf:
  251. # write the file's preamble
  252. rf.write(preamble)
  253. # download and parse all the data
  254. fetch("ReadMe.txt")
  255. with open("ReadMe.txt") as readme:
  256. pattern = "for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
  257. unicode_version = re.search(pattern, readme.read()).groups()
  258. rf.write("""
  259. /// The version of [Unicode](http://www.unicode.org/)
  260. /// that this version of unicode-width is based on.
  261. pub const UNICODE_VERSION: (u8, u8, u8) = (%s, %s, %s);
  262. """ % unicode_version)
  263. gencats = load_unicode_data("UnicodeData.txt")
  264. ### character width module
  265. width_table = []
  266. for zwcat in ["Me", "Mn", "Cf"]:
  267. width_table.extend([(lo_hi[0], lo_hi[1], 0, 0) for lo_hi in gencats[zwcat]])
  268. width_table.append((4448, 4607, 0, 0))
  269. # get widths, except those that are explicitly marked zero-width above
  270. ea_widths = load_east_asian_width(["W", "F", "A"], ["Me", "Mn", "Cf"])
  271. # these are doublewidth
  272. for dwcat in ["W", "F"]:
  273. width_table.extend([(lo_hi1[0], lo_hi1[1], 2, 2) for lo_hi1 in ea_widths[dwcat]])
  274. width_table.extend([(lo_hi2[0], lo_hi2[1], 1, 2) for lo_hi2 in ea_widths["A"]])
  275. width_table.sort(key=lambda w: w[0])
  276. # soft hyphen is not zero width in preformatted text; it's used to indicate
  277. # a hyphen inserted to facilitate a linebreak.
  278. width_table = remove_from_wtable(width_table, 173)
  279. # optimize the width table by collapsing adjacent entities when possible
  280. width_table = optimize_width_table(width_table)
  281. emit_charwidth_module(rf, width_table)