generate-encoding-data.py

#!/usr/bin/python
# Copyright Mozilla Foundation. See the COPYRIGHT
# file at the top-level directory of this distribution.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

import json
import subprocess
import sys
import os.path

if (not os.path.isfile("../encoding/encodings.json")) or (not os.path.isfile("../encoding/indexes.json")):
    sys.stderr.write("This script needs a clone of https://github.com/whatwg/encoding/ (preferably at revision f381389) next to the encoding_rs directory.\n")
    sys.exit(-1)

if not os.path.isfile("../encoding_c/src/lib.rs"):
    sys.stderr.write("This script also writes the generated parts of the encoding_c crate and needs a clone of https://github.com/hsivonen/encoding_c next to the encoding_rs directory.\n")
    sys.exit(-1)

if not os.path.isfile("../codepage/src/lib.rs"):
    sys.stderr.write("This script also writes the generated parts of the codepage crate and needs a clone of https://github.com/hsivonen/codepage next to the encoding_rs directory.\n")
    sys.exit(-1)

def cmp_from_end(one, other):
    c = cmp(len(one), len(other))
    if c != 0:
        return c
    i = len(one) - 1
    while i >= 0:
        c = cmp(one[i], other[i])
        if c != 0:
            return c
        i -= 1
    return 0
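
# Illustrative sanity checks (added; not part of the upstream script).
# cmp_from_end sorts shorter strings first and otherwise compares from the
# last character backwards, which clusters strings that share a suffix.
assert cmp_from_end("abc", "abd") < 0  # 'c' < 'd' decides, compared from the end
assert cmp_from_end("xz", "ya") > 0    # 'z' > 'a' decides before 'x' vs. 'y' is reached
assert cmp_from_end("zz", "aaa") < 0   # length is compared first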

class Label:
    def __init__(self, label, preferred):
        self.label = label
        self.preferred = preferred
    def __cmp__(self, other):
        return cmp_from_end(self.label, other.label)

class CodePage:
    def __init__(self, code_page, preferred):
        self.code_page = code_page
        self.preferred = preferred
    def __cmp__(self, other):
        # __cmp__ must return an int, not a tuple
        return cmp(self.code_page, other.code_page)

def static_u16_table(name, data):
    data_file.write('''pub static %s: [u16; %d] = [
''' % (name, len(data)))
    for i in xrange(len(data)):
        data_file.write('0x%04X,\n' % data[i])
    data_file.write('''];
''')

def static_u16_table_from_indexable(name, data, item, feature):
    data_file.write('''#[cfg(all(
feature = "less-slow-%s",
not(feature = "fast-%s")
))]
static %s: [u16; %d] = [
''' % (feature, feature, name, len(data)))
    for i in xrange(len(data)):
        data_file.write('0x%04X,\n' % data[i][item])
    data_file.write('''];
''')

def static_u8_pair_table_from_indexable(name, data, item, feature):
    data_file.write('''#[cfg(all(
feature = "less-slow-%s",
not(feature = "fast-%s")
))]
static %s: [[u8; 2]; %d] = [
''' % (feature, feature, name, len(data)))
    for i in xrange(len(data)):
        data_file.write('[0x%02X, 0x%02X],\n' % data[i][item])
    data_file.write('''];
''')

def static_u8_pair_table(name, data, feature):
    data_file.write('''#[cfg(feature = "%s")]
static %s: [[u8; 2]; %d] = [
''' % (feature, name, len(data)))
    for i in xrange(len(data)):
        pair = data[i]
        if not pair:
            pair = (0, 0)
        data_file.write('[0x%02X, 0x%02X],\n' % pair)
    data_file.write('''];
''')
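
# For illustration (added comment; not in the upstream script):
# static_u16_table("FOO", [1, 2]) appends the following Rust to data_file:
#
#   pub static FOO: [u16; 2] = [
#   0x0001,
#   0x0002,
#   ];
#
# The *_from_indexable variants emit the same shape but gated behind the
# "less-slow-*"/"fast-*" Cargo features, and the u8-pair variants emit
# [[u8; 2]; N] arrays of lead/trail byte pairs.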

preferred = []
dom = []
labels = []
data = json.load(open("../encoding/encodings.json", "r"))
indexes = json.load(open("../encoding/indexes.json", "r"))
single_byte = []
multi_byte = []

def to_camel_name(name):
    if name == u"iso-8859-8-i":
        return u"Iso8I"
    if name.startswith(u"iso-8859-"):
        return name.replace(u"iso-8859-", u"Iso")
    return name.title().replace(u"X-", u"").replace(u"-", u"").replace(u"_", u"")

def to_constant_name(name):
    return name.replace(u"-", u"_").upper()

def to_snake_name(name):
    return name.replace(u"-", u"_").lower()

def to_dom_name(name):
    return name

# Guestimate based on
# https://w3techs.com/technologies/overview/character_encoding/all
# whose methodology is known to be bogus, but the results are credible for
# this purpose. UTF-16LE lifted up due to prevalence on Windows and
# "ANSI codepages" prioritized.
encodings_by_code_page_frequency = [
    "UTF-8",
    "UTF-16LE",
    "windows-1252",
    "windows-1251",
    "GBK",
    "Shift_JIS",
    "EUC-KR",
    "windows-1250",
    "windows-1256",
    "windows-1254",
    "Big5",
    "windows-874",
    "windows-1255",
    "windows-1253",
    "windows-1257",
    "windows-1258",
    "EUC-JP",
    "ISO-8859-2",
    "ISO-8859-15",
    "ISO-8859-7",
    "KOI8-R",
    "gb18030",
    "ISO-8859-5",
    "ISO-8859-8-I",
    "ISO-8859-4",
    "ISO-8859-6",
    "ISO-2022-JP",
    "KOI8-U",
    "ISO-8859-13",
    "ISO-8859-3",
    "UTF-16BE",
    "IBM866",
    "ISO-8859-10",
    "ISO-8859-8",
    "macintosh",
    "x-mac-cyrillic",
    "ISO-8859-14",
    "ISO-8859-16",
]

encodings_by_code_page = {
    932: "Shift_JIS",
    936: "GBK",
    949: "EUC-KR",
    950: "Big5",
    866: "IBM866",
    874: "windows-874",
    1200: "UTF-16LE",
    1201: "UTF-16BE",
    1250: "windows-1250",
    1251: "windows-1251",
    1252: "windows-1252",
    1253: "windows-1253",
    1254: "windows-1254",
    1255: "windows-1255",
    1256: "windows-1256",
    1257: "windows-1257",
    1258: "windows-1258",
    10000: "macintosh",
    10017: "x-mac-cyrillic",
    20866: "KOI8-R",
    20932: "EUC-JP",
    21866: "KOI8-U",
    28592: "ISO-8859-2",
    28593: "ISO-8859-3",
    28594: "ISO-8859-4",
    28595: "ISO-8859-5",
    28596: "ISO-8859-6",
    28597: "ISO-8859-7",
    28598: "ISO-8859-8",
    28600: "ISO-8859-10",
    28603: "ISO-8859-13",
    28604: "ISO-8859-14",
    28605: "ISO-8859-15",
    28606: "ISO-8859-16",
    38598: "ISO-8859-8-I",
    50221: "ISO-2022-JP",
    54936: "gb18030",
    65001: "UTF-8",
}

code_pages_by_encoding = {}
for code_page, encoding in encodings_by_code_page.iteritems():
    code_pages_by_encoding[encoding] = code_page

encoding_by_alias_code_page = {
    951: "Big5",
    10007: "x-mac-cyrillic",
    20936: "GBK",
    20949: "EUC-KR",
    21010: "UTF-16LE", # Undocumented; needed by calamine for Excel compat
    28591: "windows-1252",
    28599: "windows-1254",
    28601: "windows-874",
    50220: "ISO-2022-JP",
    50222: "ISO-2022-JP",
    50225: "replacement", # ISO-2022-KR
    50227: "replacement", # ISO-2022-CN
    51932: "EUC-JP",
    51936: "GBK",
    51949: "EUC-KR",
    52936: "replacement", # HZ
}

code_pages = []
for name in encodings_by_code_page_frequency:
    code_pages.append(code_pages_by_encoding[name])

encodings_by_code_page.update(encoding_by_alias_code_page)

temp_keys = encodings_by_code_page.keys()
temp_keys.sort()
for code_page in temp_keys:
    if code_page not in code_pages:
        code_pages.append(code_page)

# The position in the index (0 is the first index entry,
# i.e. byte value 0x80) that starts the longest run of
# consecutive code points. Must not be in the first
# quadrant. If the character to be encoded is not in this
# run, the part of the index after the run is searched
# forward. Then the part of the index from 32 to the start
# of the run. The first quadrant is searched last.
#
# If there is no obviously most useful longest run,
# the index here is just used to affect the search order.
start_of_longest_run_in_single_byte = {
    "IBM866": 96, # 0 would be longest, but we don't want to start in the first quadrant
    "windows-874": 33,
    "windows-1250": 92,
    "windows-1251": 64,
    "windows-1252": 32,
    "windows-1253": 83,
    "windows-1254": 95,
    "windows-1255": 96,
    "windows-1256": 65,
    "windows-1257": 95, # not actually longest
    "windows-1258": 95, # not actually longest
    "macintosh": 106, # useless
    "x-mac-cyrillic": 96,
    "KOI8-R": 64, # not actually longest
    "KOI8-U": 64, # not actually longest
    "ISO-8859-2": 95, # not actually longest
    "ISO-8859-3": 95, # not actually longest
    "ISO-8859-4": 95, # not actually longest
    "ISO-8859-5": 46,
    "ISO-8859-6": 65,
    "ISO-8859-7": 83,
    "ISO-8859-8": 96,
    "ISO-8859-10": 90, # not actually longest
    "ISO-8859-13": 95, # not actually longest
    "ISO-8859-14": 95,
    "ISO-8859-15": 63,
    "ISO-8859-16": 95, # not actually longest
}
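
# Worked example (added comment): the value 32 for "windows-1252" above
# means the run starts at index position 32, i.e. at byte 0x80 + 32 = 0xA0;
# from there windows-1252 maps bytes 0xA0...0xFF straight to
# U+00A0...U+00FF, so the most common non-ASCII characters are found in the
# run itself before any of the fallback searching described above.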

for group in data:
    if group["heading"] == "Legacy single-byte encodings":
        single_byte = group["encodings"]
    else:
        multi_byte.extend(group["encodings"])
    for encoding in group["encodings"]:
        preferred.append(encoding["name"])
        for label in encoding["labels"]:
            labels.append(Label(label, encoding["name"]))

for name in preferred:
    dom.append(to_dom_name(name))

preferred.sort()
labels.sort()
dom.sort(cmp=cmp_from_end)

longest_label_length = 0
longest_name_length = 0
longest_label = None
longest_name = None

for name in preferred:
    if len(name) > longest_name_length:
        longest_name_length = len(name)
        longest_name = name

for label in labels:
    if len(label.label) > longest_label_length:
        longest_label_length = len(label.label)
        longest_label = label.label

def longest_run_for_single_byte(name):
    if name == u"ISO-8859-8-I":
        name = u"ISO-8859-8"
    index = indexes[name.lower()]
    run_byte_offset = start_of_longest_run_in_single_byte[name]
    run_bmp_offset = index[run_byte_offset]
    previous_code_point = run_bmp_offset
    run_length = 1
    while True:
        i = run_byte_offset + run_length
        if i == len(index):
            break
        code_point = index[i]
        if previous_code_point + 1 != code_point:
            break
        previous_code_point = code_point
        run_length += 1
    return (run_bmp_offset, run_byte_offset, run_length)
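
# Illustrative sanity check (added; relies on the WHATWG windows-1252
# index): the run found for windows-1252 should be the 96-entry identity
# run starting at position 32 (byte 0xA0, code point U+00A0).
assert longest_run_for_single_byte(u"windows-1252") == (0xA0, 32, 96)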

def is_single_byte(name):
    for encoding in single_byte:
        if name == encoding["name"]:
            return True
    return False

def read_non_generated(path):
    partially_generated_file = open(path, "r")
    full = partially_generated_file.read()
    partially_generated_file.close()
    generated_begin = "// BEGIN GENERATED CODE. PLEASE DO NOT EDIT."
    generated_end = "// END GENERATED CODE"
    generated_begin_index = full.find(generated_begin)
    if generated_begin_index < 0:
        sys.stderr.write("Can't find generated code start marker in %s. Exiting.\n" % path)
        sys.exit(-1)
    generated_end_index = full.find(generated_end)
    if generated_end_index < 0:
        sys.stderr.write("Can't find generated code end marker in %s. Exiting.\n" % path)
        sys.exit(-1)
    return (full[0:generated_begin_index + len(generated_begin)],
            full[generated_end_index:])
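
# Added note: given a file laid out as
#   <hand-written prefix>
#   // BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
#   <previously generated code>
#   // END GENERATED CODE
#   <hand-written suffix>
# read_non_generated returns the prefix (up to and including the BEGIN
# marker) and the suffix (from the END marker onwards), so each writer
# below can splice freshly generated code between the two while leaving
# the hand-written parts untouched.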

(lib_rs_begin, lib_rs_end) = read_non_generated("src/lib.rs")
label_file = open("src/lib.rs", "w")
label_file.write(lib_rs_begin)
label_file.write("""
// Instead, please regenerate using generate-encoding-data.py
const LONGEST_LABEL_LENGTH: usize = %d; // %s
""" % (longest_label_length, longest_label))

for name in preferred:
    variant = None
    if is_single_byte(name):
        (run_bmp_offset, run_byte_offset, run_length) = longest_run_for_single_byte(name)
        variant = "SingleByte(&data::SINGLE_BYTE_DATA.%s, 0x%04X, %d, %d)" % (to_snake_name(u"iso-8859-8" if name == u"ISO-8859-8-I" else name), run_bmp_offset, run_byte_offset, run_length)
    else:
        variant = to_camel_name(name)
    docfile = open("doc/%s.txt" % name, "r")
    doctext = docfile.read()
    docfile.close()
    label_file.write('''/// The initializer for the [%s](static.%s.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static %s_INIT: Encoding = Encoding {
name: "%s",
variant: VariantEncoding::%s,
};
/// The %s encoding.
///
%s///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static %s: &'static Encoding = &%s_INIT;
''' % (to_dom_name(name), to_constant_name(name), to_constant_name(name), to_dom_name(name), variant, to_dom_name(name), doctext, to_constant_name(name), to_constant_name(name)))

label_file.write("""static LABELS_SORTED: [&'static str; %d] = [
""" % len(labels))
for label in labels:
    label_file.write('''"%s",\n''' % label.label)
label_file.write("""];
static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; %d] = [
""" % len(labels))
for label in labels:
    label_file.write('''&%s_INIT,\n''' % to_constant_name(label.preferred))
label_file.write('''];
''')
label_file.write(lib_rs_end)
label_file.close()

label_test_file = open("src/test_labels_names.rs", "w")
label_test_file.write('''// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py
use super::*;
#[test]
fn test_all_labels() {
''')
for label in labels:
    label_test_file.write('''assert_eq!(Encoding::for_label(b"%s"), Some(%s));\n''' % (label.label, to_constant_name(label.preferred)))
label_test_file.write('''}
''')
label_test_file.close()

def null_to_zero(code_point):
    if not code_point:
        code_point = 0
    return code_point

(data_rs_begin, data_rs_end) = read_non_generated("src/data.rs")
data_file = open("src/data.rs", "w")
data_file.write(data_rs_begin)
data_file.write('''
// Instead, please regenerate using generate-encoding-data.py
#[repr(align(64))] // Align to cache lines
pub struct SingleByteData {
''')

# Single-byte
for encoding in single_byte:
    name = encoding["name"]
    if name == u"ISO-8859-8-I":
        continue
    data_file.write(''' pub %s: [u16; 128],
''' % to_snake_name(name))
data_file.write('''}
pub static SINGLE_BYTE_DATA: SingleByteData = SingleByteData {
''')
for encoding in single_byte:
    name = encoding["name"]
    if name == u"ISO-8859-8-I":
        continue
    data_file.write(''' %s: [
''' % to_snake_name(name))
    for code_point in indexes[name.lower()]:
        data_file.write('0x%04X,\n' % null_to_zero(code_point))
    data_file.write('''],
''')
data_file.write('''};
''')

# Big5
index = indexes["big5"]
astralness = []
low_bits = []
for code_point in index[942:19782]:
    if code_point:
        astralness.append(1 if code_point > 0xFFFF else 0)
        low_bits.append(code_point & 0xFFFF)
    else:
        astralness.append(0)
        low_bits.append(0)
# pad length to multiple of 32
for j in xrange(32 - (len(astralness) % 32)):
    astralness.append(0)
data_file.write('''#[cfg_attr(feature = "cargo-clippy", allow(unreadable_literal))]
static BIG5_ASTRALNESS: [u32; %d] = [
''' % (len(astralness) / 32))
i = 0
while i < len(astralness):
    accu = 0
    for j in xrange(32):
        accu |= astralness[i + j] << j
    data_file.write('0x%08X,\n' % accu)
    i += 32
data_file.write('''];
''')
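
# Bit-packing note (added comment): each u32 above packs 32 astralness
# flags, least significant bit first, so e.g. the flag list [1, 0, 1]
# followed by 29 zeros packs to 0x00000005. The flag for a pointer says
# whether the u16 stored in BIG5_LOW_BITS (written just below) is a BMP
# code point as-is or, presumably, the low bits of a Plane 2 code point
# to be reconstructed on the Rust side.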
static_u16_table("BIG5_LOW_BITS", low_bits)

# Encoder table for Level 1 Hanzi
# Note: If we were OK with doubling this table, we
# could use a directly-indexable table instead...
level1_hanzi_index = index[5495:10896]
level1_hanzi_pairs = []
for i in xrange(len(level1_hanzi_index)):
    hanzi_lead = (i / 157) + 0xA4
    hanzi_trail = (i % 157)
    hanzi_trail += 0x40 if hanzi_trail < 0x3F else 0x62
    level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail)))
level1_hanzi_pairs.append((0x4E5A, (0xC8, 0x7B)))
level1_hanzi_pairs.append((0x5202, (0xC8, 0x7D)))
level1_hanzi_pairs.append((0x9FB0, (0xC8, 0xA1)))
level1_hanzi_pairs.append((0x5188, (0xC8, 0xA2)))
level1_hanzi_pairs.append((0x9FB1, (0xC8, 0xA3)))
level1_hanzi_pairs.sort(key=lambda x: x[0])
static_u16_table_from_indexable("BIG5_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "big5-hanzi-encode")
static_u8_pair_table_from_indexable("BIG5_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "big5-hanzi-encode")

# Fast Unified Ideograph encode
big5_unified_ideograph_bytes = [None] * (0x9FCC - 0x4E00)
for row in xrange(0x7E - 0x20):
    for column in xrange(157):
        pointer = 5024 + column + (row * 157)
        code_point = index[pointer]
        if code_point and code_point >= 0x4E00 and code_point <= 0x9FCB:
            unified_offset = code_point - 0x4E00
            unified_lead = 0xA1 + row
            unified_trail = (0x40 if column < 0x3F else 0x62) + column
            if code_point == 0x5341 or code_point == 0x5345 or not big5_unified_ideograph_bytes[unified_offset]:
                big5_unified_ideograph_bytes[unified_offset] = (unified_lead, unified_trail)
static_u8_pair_table("BIG5_UNIFIED_IDEOGRAPH_BYTES", big5_unified_ideograph_bytes, "fast-big5-hanzi-encode")

# JIS0208
index = indexes["jis0208"]

# JIS 0208 Level 1 Kanji
static_u16_table("JIS0208_LEVEL1_KANJI", index[1410:4375])

# JIS 0208 Level 2 Kanji and Additional Kanji
static_u16_table("JIS0208_LEVEL2_AND_ADDITIONAL_KANJI", index[4418:7808])

# IBM Kanji
static_u16_table("IBM_KANJI", index[8272:8632])

# Check that the other instance is the same
if index[8272:8632] != index[10744:11104]:
    raise Error()

# JIS 0208 symbols (all non-Kanji, non-range items)
symbol_index = []
symbol_triples = []
pointers_to_scan = [
    (0, 188),
    (658, 691),
    (1159, 1221),
]
in_run = False
run_start_pointer = 0
run_start_array_index = 0
for (start, end) in pointers_to_scan:
    for i in range(start, end):
        code_point = index[i]
        if in_run:
            if code_point:
                symbol_index.append(code_point)
            else:
                symbol_triples.append(run_start_pointer)
                symbol_triples.append(i - run_start_pointer)
                symbol_triples.append(run_start_array_index)
                in_run = False
        else:
            if code_point:
                in_run = True
                run_start_pointer = i
                run_start_array_index = len(symbol_index)
                symbol_index.append(code_point)
    if in_run:
        symbol_triples.append(run_start_pointer)
        symbol_triples.append(end - run_start_pointer)
        symbol_triples.append(run_start_array_index)
        in_run = False
if in_run:
    raise Error()

# Now add manually the two overlapping slices of
# index from the NEC/IBM extensions.
run_start_array_index = len(symbol_index)
symbol_index.extend(index[10736:10744])
# Later
symbol_triples.append(10736)
symbol_triples.append(8)
symbol_triples.append(run_start_array_index)
# Earlier
symbol_triples.append(8644)
symbol_triples.append(4)
symbol_triples.append(run_start_array_index)

static_u16_table("JIS0208_SYMBOLS", symbol_index)
static_u16_table("JIS0208_SYMBOL_TRIPLES", symbol_triples)

# Write down the magic numbers needed when preferring the earlier case
data_file.write('''const IBM_SYMBOL_START: usize = %d;''' % (run_start_array_index + 1))
data_file.write('''const IBM_SYMBOL_END: usize = %d;''' % (run_start_array_index + 4))
data_file.write('''const IBM_SYMBOL_POINTER_START: usize = %d;''' % 8645)

# JIS 0208 ranges (excluding kana)
range_triples = []
pointers_to_scan = [
    (188, 281),
    (470, 657),
    (1128, 1159),
    (8634, 8644),
    (10716, 10736),
]
in_run = False
run_start_pointer = 0
run_start_code_point = 0
previous_code_point = 0
for (start, end) in pointers_to_scan:
    for i in range(start, end):
        code_point = index[i]
        if in_run:
            if code_point:
                if previous_code_point + 1 != code_point:
                    range_triples.append(run_start_pointer)
                    range_triples.append(i - run_start_pointer)
                    range_triples.append(run_start_code_point)
                    run_start_pointer = i
                    run_start_code_point = code_point
                previous_code_point = code_point
            else:
                range_triples.append(run_start_pointer)
                range_triples.append(i - run_start_pointer)
                range_triples.append(run_start_code_point)
                run_start_pointer = 0
                run_start_code_point = 0
                previous_code_point = 0
                in_run = False
        else:
            if code_point:
                in_run = True
                run_start_pointer = i
                run_start_code_point = code_point
                previous_code_point = code_point
    if in_run:
        range_triples.append(run_start_pointer)
        range_triples.append(end - run_start_pointer)
        range_triples.append(run_start_code_point)
        run_start_pointer = 0
        run_start_code_point = 0
        previous_code_point = 0
        in_run = False
if in_run:
    raise Error()
static_u16_table("JIS0208_RANGE_TRIPLES", range_triples)

# Encoder table for Level 1 Kanji
# Note: If we were OK with 30 KB more footprint, we
# could use a directly-indexable table instead...
level1_kanji_index = index[1410:4375]
level1_kanji_pairs = []
for i in xrange(len(level1_kanji_index)):
    pointer = 1410 + i
    (lead, trail) = divmod(pointer, 188)
    lead += 0x81 if lead < 0x1F else 0xC1
    trail += 0x40 if trail < 0x3F else 0x41
    level1_kanji_pairs.append((level1_kanji_index[i], (lead, trail)))
level1_kanji_pairs.sort(key=lambda x: x[0])
static_u16_table_from_indexable("JIS0208_LEVEL1_KANJI_CODE_POINTS", level1_kanji_pairs, 0, "kanji-encode")
static_u8_pair_table_from_indexable("JIS0208_LEVEL1_KANJI_SHIFT_JIS_BYTES", level1_kanji_pairs, 1, "kanji-encode")
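
# Worked example (added comment): pointer 1410 is the first Level 1 kanji,
# U+4E9C. divmod(1410, 188) == (7, 94); 7 < 0x1F so lead = 7 + 0x81 = 0x88,
# and 94 >= 0x3F so trail = 94 + 0x41 = 0x9F, giving the familiar
# Shift_JIS byte pair 0x88 0x9F.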

# Fast encoder table for Kanji
kanji_bytes = [None] * (0x9FA1 - 0x4E00)
for pointer in xrange(len(index)):
    code_point = index[pointer]
    if code_point and code_point >= 0x4E00 and code_point <= 0x9FA0:
        (lead, trail) = divmod(pointer, 188)
        lead += 0x81 if lead < 0x1F else 0xC1
        trail += 0x40 if trail < 0x3F else 0x41
        # unset the high bit of lead if IBM Kanji
        if pointer >= 8272:
            lead = lead & 0x7F
        kanji_bytes[code_point - 0x4E00] = (lead, trail)
static_u8_pair_table("JIS0208_KANJI_BYTES", kanji_bytes, "fast-kanji-encode")

# ISO-2022-JP half-width katakana
# index is still jis0208
half_width_index = indexes["iso-2022-jp-katakana"]
data_file.write('''pub static ISO_2022_JP_HALF_WIDTH_TRAIL: [u8; %d] = [
''' % len(half_width_index))
for i in xrange(len(half_width_index)):
    code_point = half_width_index[i]
    pointer = index.index(code_point)
    trail = pointer % 94 + 0x21
    data_file.write('0x%02X,\n' % trail)
data_file.write('''];
''')

# EUC-KR
index = indexes["euc-kr"]

# Unicode 1.1 Hangul above the old KS X 1001 block
# Compressed form takes 35% of uncompressed form
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(0x20):
    for column in xrange(190):
        i = column + (row * 190)
        # Skip the gaps
        if (column >= 0x1A and column < 0x20) or (column >= 0x3A and column < 0x40):
            continue
        code_point = index[i]
        if previous_code_point > code_point:
            raise Error()
        if code_point - previous_code_point != 1:
            adjustment = 0
            if column >= 0x40:
                adjustment = 12
            elif column >= 0x20:
                adjustment = 6
            pointers.append(column - adjustment + (row * (190 - 12)))
            offsets.append(code_point)
        previous_code_point = code_point
static_u16_table("CP949_TOP_HANGUL_POINTERS", pointers)
static_u16_table("CP949_TOP_HANGUL_OFFSETS", offsets)
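
# Decode-side sketch (added comment; the actual lookup lives in the Rust
# code): the parallel POINTERS/OFFSETS arrays record where each run of
# consecutive code points starts in the gap-compensated pointer space, so
# a pointer p in this region decodes as
#   OFFSETS[k] + (p - POINTERS[k])
# where k is the largest index with POINTERS[k] <= p. Storing only the run
# starts is what makes the compressed form 35% of the uncompressed size.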

# Unicode 1.1 Hangul to the left of the old KS X 1001 block
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(0x46 - 0x20):
    for column in xrange(190 - 94):
        i = 6080 + column + (row * 190)
        # Skip the gaps
        if (column >= 0x1A and column < 0x20) or (column >= 0x3A and column < 0x40):
            continue
        if i > 13127:
            # Exclude unassigned on partial last row
            break
        code_point = index[i]
        if previous_code_point > code_point:
            raise Error()
        if code_point - previous_code_point != 1:
            adjustment = 0
            if column >= 0x40:
                adjustment = 12
            elif column >= 0x20:
                adjustment = 6
            pointers.append(column - adjustment + (row * (190 - 94 - 12)))
            offsets.append(code_point)
        previous_code_point = code_point
static_u16_table("CP949_LEFT_HANGUL_POINTERS", pointers)
static_u16_table("CP949_LEFT_HANGUL_OFFSETS", offsets)

# KS X 1001 Hangul
hangul_index = []
previous_code_point = 0
for row in xrange(0x48 - 0x2F):
    for column in xrange(94):
        code_point = index[9026 + column + (row * 190)]
        if previous_code_point >= code_point:
            raise Error()
        hangul_index.append(code_point)
        previous_code_point = code_point
static_u16_table("KSX1001_HANGUL", hangul_index)

# KS X 1001 Hanja
hanja_index = []
for row in xrange(0x7D - 0x49):
    for column in xrange(94):
        hanja_index.append(index[13966 + column + (row * 190)])
static_u16_table("KSX1001_HANJA", hanja_index)

# KS X 1001 symbols
symbol_index = []
for i in range(6176, 6270):
    symbol_index.append(index[i])
for i in range(6366, 6437):
    symbol_index.append(index[i])
static_u16_table("KSX1001_SYMBOLS", symbol_index)

# KS X 1001 Uppercase Latin
subindex = []
for i in range(7506, 7521):
    subindex.append(null_to_zero(index[i]))
static_u16_table("KSX1001_UPPERCASE", subindex)

# KS X 1001 Lowercase Latin
subindex = []
for i in range(7696, 7712):
    subindex.append(index[i])
static_u16_table("KSX1001_LOWERCASE", subindex)

# KS X 1001 Box drawing
subindex = []
for i in range(7126, 7194):
    subindex.append(index[i])
static_u16_table("KSX1001_BOX", subindex)

# KS X 1001 other
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(10):
    for column in xrange(94):
        i = 6556 + column + (row * 190)
        code_point = index[i]
        # Exclude ranges that were processed as lookup tables
        # or that contain unmapped cells by filling them with
        # ASCII. Upon encode, ASCII code points will
        # never appear as the search key.
        if (i >= 6946 and i <= 6950):
            code_point = i - 6946
        elif (i >= 6961 and i <= 6967):
            code_point = i - 6961
        elif (i >= 6992 and i <= 6999):
            code_point = i - 6992
        elif (i >= 7024 and i <= 7029):
            code_point = i - 7024
        elif (i >= 7126 and i <= 7219):
            code_point = i - 7126
        elif (i >= 7395 and i <= 7409):
            code_point = i - 7395
        elif (i >= 7506 and i <= 7521):
            code_point = i - 7506
        elif (i >= 7696 and i <= 7711):
            code_point = i - 7696
        elif (i >= 7969 and i <= 7979):
            code_point = i - 7969
        elif (i >= 8162 and i <= 8169):
            code_point = i - 8162
        elif (i >= 8299 and i <= 8313):
            code_point = i - 8299
        elif (i >= 8347 and i <= 8359):
            code_point = i - 8347
        if code_point - previous_code_point != 1:
            pointers.append(column + (row * 94))
            offsets.append(code_point)
        previous_code_point = code_point
static_u16_table("KSX1001_OTHER_POINTERS", pointers)
# Omit the last offset, because the end of the last line
# is unmapped, so we don't want to look at it.
static_u16_table("KSX1001_OTHER_UNSORTED_OFFSETS", offsets[:-1])

# Fast Hangul and Hanja encode
hangul_bytes = [None] * (0xD7A4 - 0xAC00)
hanja_unified_bytes = [None] * (0x9F9D - 0x4E00)
hanja_compatibility_bytes = [None] * (0xFA0C - 0xF900)
for row in xrange(0x7D):
    for column in xrange(190):
        pointer = column + (row * 190)
        code_point = index[pointer]
        if code_point:
            lead = 0x81 + row
            trail = 0x41 + column
            if code_point >= 0xAC00 and code_point < 0xD7A4:
                hangul_bytes[code_point - 0xAC00] = (lead, trail)
            elif code_point >= 0x4E00 and code_point < 0x9F9D:
                hanja_unified_bytes[code_point - 0x4E00] = (lead, trail)
            elif code_point >= 0xF900 and code_point < 0xFA0C:
                hanja_compatibility_bytes[code_point - 0xF900] = (lead, trail)
static_u8_pair_table("CP949_HANGUL_BYTES", hangul_bytes, "fast-hangul-encode")
static_u8_pair_table("KSX1001_UNIFIED_HANJA_BYTES", hanja_unified_bytes, "fast-hanja-encode")
static_u8_pair_table("KSX1001_COMPATIBILITY_HANJA_BYTES", hanja_compatibility_bytes, "fast-hanja-encode")
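
# Design note (added comment): unlike the pointer/offset pairs above, the
# "fast-*" tables are directly indexable by code_point - base (e.g.
# code_point - 0xAC00 for Hangul), so the encoder trades a larger static
# footprint for skipping the binary search entirely.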

# JIS 0212
index = indexes["jis0212"]

# JIS 0212 Kanji
static_u16_table("JIS0212_KANJI", index[1410:7211])

# JIS 0212 accented (all non-Kanji, non-range items)
symbol_index = []
symbol_triples = []
pointers_to_scan = [
    (0, 596),
    (608, 644),
    (656, 1409),
]
in_run = False
run_start_pointer = 0
run_start_array_index = 0
for (start, end) in pointers_to_scan:
    for i in range(start, end):
        code_point = index[i]
        if in_run:
            if code_point:
                symbol_index.append(code_point)
            elif index[i + 1]:
                symbol_index.append(0)
            else:
                symbol_triples.append(run_start_pointer)
                symbol_triples.append(i - run_start_pointer)
                symbol_triples.append(run_start_array_index)
                in_run = False
        else:
            if code_point:
                in_run = True
                run_start_pointer = i
                run_start_array_index = len(symbol_index)
                symbol_index.append(code_point)
    if in_run:
        symbol_triples.append(run_start_pointer)
        symbol_triples.append(end - run_start_pointer)
        symbol_triples.append(run_start_array_index)
        in_run = False
if in_run:
    raise Error()
static_u16_table("JIS0212_ACCENTED", symbol_index)
static_u16_table("JIS0212_ACCENTED_TRIPLES", symbol_triples)

# gb18030
index = indexes["gb18030"]

# Unicode 1.1 ideographs above the old GB2312 block
# Compressed form takes 63% of uncompressed form
pointers = []
offsets = []
previous_code_point = 0
for i in xrange(6080):
    code_point = index[i]
    if previous_code_point > code_point:
        raise Error()
    if code_point - previous_code_point != 1:
        pointers.append(i)
        offsets.append(code_point)
    previous_code_point = code_point
static_u16_table("GBK_TOP_IDEOGRAPH_POINTERS", pointers)
static_u16_table("GBK_TOP_IDEOGRAPH_OFFSETS", offsets)

# Unicode 1.1 ideographs to the left of the old GB2312 block
# Compressed form takes 40% of uncompressed form
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(0x7D - 0x29):
    for column in xrange(190 - 94):
        i = 7790 + column + (row * 190)
        if i > 23650:
            # Exclude compatibility ideographs at the end
            break
        code_point = index[i]
        if previous_code_point > code_point:
            raise Error()
        if code_point - previous_code_point != 1:
            pointers.append(column + (row * (190 - 94)))
            offsets.append(code_point)
        previous_code_point = code_point
static_u16_table("GBK_LEFT_IDEOGRAPH_POINTERS", pointers)
static_u16_table("GBK_LEFT_IDEOGRAPH_OFFSETS", offsets)

# GBK other (excl. Ext A, Compat & PUA at the bottom)
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(0x29 - 0x20):
    for column in xrange(190 - 94):
        i = 6080 + column + (row * 190)
        code_point = index[i]
        if code_point - previous_code_point != 1:
            pointers.append(column + (row * (190 - 94)))
            offsets.append(code_point)
        previous_code_point = code_point
pointers.append((190 - 94) * (0x29 - 0x20))
static_u16_table("GBK_OTHER_POINTERS", pointers)
static_u16_table("GBK_OTHER_UNSORTED_OFFSETS", offsets)

# GBK bottom: Compatibility ideographs, Ext A and PUA
bottom_index = []
# 5 compat following Unified Ideographs
for i in range(23651, 23656):
    bottom_index.append(index[i])
# Last row
for i in range(23750, 23846):
    bottom_index.append(index[i])
static_u16_table("GBK_BOTTOM", bottom_index)

# GB2312 Hanzi
# (and the 5 PUA code points in between Level 1 and Level 2)
hanzi_index = []
for row in xrange(0x77 - 0x2F):
    for column in xrange(94):
        hanzi_index.append(index[9026 + column + (row * 190)])
static_u16_table("GB2312_HANZI", hanzi_index)

# GB2312 symbols
symbol_index = []
for i in xrange(94):
    symbol_index.append(index[6176 + i])
static_u16_table("GB2312_SYMBOLS", symbol_index)

# GB2312 symbols on Greek row (incl. PUA)
symbol_index = []
for i in xrange(22):
    symbol_index.append(index[7189 + i])
static_u16_table("GB2312_SYMBOLS_AFTER_GREEK", symbol_index)

# GB2312 Pinyin
pinyin_index = []
for i in xrange(32):
    pinyin_index.append(index[7506 + i])
static_u16_table("GB2312_PINYIN", pinyin_index)

# GB2312 other (excl. bottom PUA)
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(14):
    for column in xrange(94):
        i = 6366 + column + (row * 190)
        code_point = index[i]
        # Exclude the two ranges that were processed as
        # lookup tables above by filling them with
        # ASCII. Upon encode, ASCII code points will
        # never appear as the search key.
        if (i >= 7189 and i < 7189 + 22):
            code_point = i - 7189
        elif (i >= 7506 and i < 7506 + 32):
            code_point = i - 7506
        if code_point - previous_code_point != 1:
            pointers.append(column + (row * 94))
            offsets.append(code_point)
        previous_code_point = code_point
pointers.append(14 * 94)
static_u16_table("GB2312_OTHER_POINTERS", pointers)
static_u16_table("GB2312_OTHER_UNSORTED_OFFSETS", offsets)

# Non-gbk code points
pointers = []
offsets = []
for pair in indexes["gb18030-ranges"]:
    if pair[1] == 0x10000:
        break # the last entry doesn't fit in u16
    pointers.append(pair[0])
    offsets.append(pair[1])
static_u16_table("GB18030_RANGE_POINTERS", pointers)
static_u16_table("GB18030_RANGE_OFFSETS", offsets)

# Encoder table for Level 1 Hanzi
# The units here really fit into 12 bits, but since we're
# looking for speed here, let's use 16 bits per unit.
# Once we use 16 bits per unit, we might as well precompute
# the output bytes.
level1_hanzi_index = hanzi_index[:(94 * (0xD8 - 0xB0) - 5)]
level1_hanzi_pairs = []
for i in xrange(len(level1_hanzi_index)):
    hanzi_lead = (i / 94) + 0xB0
    hanzi_trail = (i % 94) + 0xA1
    level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail)))
level1_hanzi_pairs.sort(key=lambda x: x[0])
static_u16_table_from_indexable("GB2312_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "gb-hanzi-encode")
static_u8_pair_table_from_indexable("GB2312_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "gb-hanzi-encode")

# Fast Hanzi encoder table
hanzi_bytes = [None] * (0x9FA7 - 0x4E00)
for row in xrange(126):
    for column in xrange(190):
        pointer = column + (row * 190)
        code_point = index[pointer]
        if code_point and code_point >= 0x4E00 and code_point <= 0x9FA6:
            hanzi_lead = 0x81 + row
            hanzi_trail = column + (0x40 if column < 0x3F else 0x41)
            hanzi_bytes[code_point - 0x4E00] = (hanzi_lead, hanzi_trail)
static_u8_pair_table("GBK_HANZI_BYTES", hanzi_bytes, "fast-gb-hanzi-encode")

data_file.write(data_rs_end)
data_file.close()

# Variant
variant_file = open("src/variant.rs", "w")
variant_file.write('''// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py
//! This module provides enums that wrap the various decoders and encoders.
//! The purpose is to make `Decoder` and `Encoder` `Sized` by writing the
//! dispatch explicitly for a finite set of specialized decoders and encoders.
//! Unfortunately, this means the compiler doesn't generate the dispatch code
//! and it has to be written here instead.
//!
//! The purpose of making `Decoder` and `Encoder` `Sized` is to allow stack
//! allocation in Rust code, including the convenience methods on `Encoding`.
''')

encoding_variants = [u"single-byte",]
for encoding in multi_byte:
    if encoding["name"] in [u"UTF-16LE", u"UTF-16BE"]:
        continue
    else:
        encoding_variants.append(encoding["name"])
encoding_variants.append(u"UTF-16")

decoder_variants = []
for variant in encoding_variants:
    if variant == u"GBK":
        continue
    decoder_variants.append(variant)

encoder_variants = []
for variant in encoding_variants:
    if variant in [u"replacement", u"GBK", u"UTF-16"]:
        continue
    encoder_variants.append(variant)

for variant in decoder_variants:
    variant_file.write("use %s::*;\n" % to_snake_name(variant))

variant_file.write('''use super::*;
pub enum VariantDecoder {
''')
for variant in decoder_variants:
    variant_file.write(" %s(%sDecoder),\n" % (to_camel_name(variant), to_camel_name(variant)))
variant_file.write('''}
impl VariantDecoder {
''')

def write_variant_method(name, mut, arg_list, ret, variants, excludes, kind):
    variant_file.write('''pub fn %s(&''' % name)
    if mut:
        variant_file.write('''mut ''')
    variant_file.write('''self''')
    for arg in arg_list:
        variant_file.write(''', %s: %s''' % (arg[0], arg[1]))
    variant_file.write(''')''')
    if ret:
        variant_file.write(''' -> %s''' % ret)
    variant_file.write(''' {\nmatch *self {\n''')
    for variant in variants:
        variant_file.write('''Variant%s::%s(ref ''' % (kind, to_camel_name(variant)))
        if mut:
            variant_file.write('''mut ''')
        if variant in excludes:
            variant_file.write('''v) => (),''')
            continue
        variant_file.write('''v) => v.%s(''' % name)
        first = True
        for arg in arg_list:
            if not first:
                variant_file.write(''', ''')
            first = False
            variant_file.write(arg[0])
        variant_file.write('''),\n''')
    variant_file.write('''}\n}\n\n''')
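
# For illustration (added comment): a call such as
#   write_variant_method("max_utf16_buffer_length", False,
#                        [("byte_length", "usize")], "Option<usize>",
#                        decoder_variants, [], "Decoder")
# emits roughly this Rust dispatch method:
#   pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
#   match *self {
#   VariantDecoder::SingleByte(ref v) => v.max_utf16_buffer_length(byte_length),
#   ...one arm per variant...
#   }
#   }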

write_variant_method("max_utf16_buffer_length", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder")
write_variant_method("max_utf8_buffer_length_without_replacement", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder")
write_variant_method("max_utf8_buffer_length", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder")
write_variant_method("decode_to_utf16_raw", True, [("src", "&[u8]"),
                                                   ("dst", "&mut [u16]"),
                                                   ("last", "bool")], "(DecoderResult, usize, usize)", decoder_variants, [], "Decoder")
write_variant_method("decode_to_utf8_raw", True, [("src", "&[u8]"),
                                                  ("dst", "&mut [u8]"),
                                                  ("last", "bool")], "(DecoderResult, usize, usize)", decoder_variants, [], "Decoder")

variant_file.write('''
pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> Option<usize> {
match *self {
VariantDecoder::SingleByte(ref v) => {
return Some(v.latin1_byte_compatible_up_to(buffer));
}
VariantDecoder::Utf8(ref v) => {
if !v.in_neutral_state() {
return None;
}
}
VariantDecoder::Gb18030(ref v) => {
if !v.in_neutral_state() {
return None;
}
}
VariantDecoder::Big5(ref v) => {
if !v.in_neutral_state() {
return None;
}
}
VariantDecoder::EucJp(ref v) => {
if !v.in_neutral_state() {
return None;
}
}
VariantDecoder::Iso2022Jp(ref v) => {
if v.in_neutral_state() {
return Some(Encoding::iso_2022_jp_ascii_valid_up_to(buffer));
}
return None;
}
VariantDecoder::ShiftJis(ref v) => {
if !v.in_neutral_state() {
return None;
}
}
VariantDecoder::EucKr(ref v) => {
if !v.in_neutral_state() {
return None;
}
}
VariantDecoder::UserDefined(_) => {}
VariantDecoder::Replacement(_) | VariantDecoder::Utf16(_) => {
return None;
}
};
Some(Encoding::ascii_valid_up_to(buffer))
}
}
pub enum VariantEncoder {
''')
for variant in encoder_variants:
    variant_file.write(" %s(%sEncoder),\n" % (to_camel_name(variant), to_camel_name(variant)))
variant_file.write('''}
impl VariantEncoder {
pub fn has_pending_state(&self) -> bool {
match *self {
VariantEncoder::Iso2022Jp(ref v) => {
v.has_pending_state()
}
_ => false,
}
}
''')

write_variant_method("max_buffer_length_from_utf16_without_replacement", False, [("u16_length", "usize")], "Option<usize>", encoder_variants, [], "Encoder")
write_variant_method("max_buffer_length_from_utf8_without_replacement", False, [("byte_length", "usize")], "Option<usize>", encoder_variants, [], "Encoder")
write_variant_method("encode_from_utf16_raw", True, [("src", "&[u16]"),
                                                     ("dst", "&mut [u8]"),
                                                     ("last", "bool")], "(EncoderResult, usize, usize)", encoder_variants, [], "Encoder")
write_variant_method("encode_from_utf8_raw", True, [("src", "&str"),
                                                    ("dst", "&mut [u8]"),
                                                    ("last", "bool")], "(EncoderResult, usize, usize)", encoder_variants, [], "Encoder")

variant_file.write('''}
pub enum VariantEncoding {
SingleByte(&'static [u16; 128], u16, u8, u8),''')
for encoding in multi_byte:
    variant_file.write("%s,\n" % to_camel_name(encoding["name"]))
variant_file.write('''}
impl VariantEncoding {
pub fn new_variant_decoder(&self) -> VariantDecoder {
match *self {
VariantEncoding::SingleByte(table, _, _, _) => SingleByteDecoder::new(table),
VariantEncoding::Utf8 => Utf8Decoder::new(),
VariantEncoding::Gbk | VariantEncoding::Gb18030 => Gb18030Decoder::new(),
VariantEncoding::Big5 => Big5Decoder::new(),
VariantEncoding::EucJp => EucJpDecoder::new(),
VariantEncoding::Iso2022Jp => Iso2022JpDecoder::new(),
VariantEncoding::ShiftJis => ShiftJisDecoder::new(),
VariantEncoding::EucKr => EucKrDecoder::new(),
VariantEncoding::Replacement => ReplacementDecoder::new(),
VariantEncoding::UserDefined => UserDefinedDecoder::new(),
VariantEncoding::Utf16Be => Utf16Decoder::new(true),
VariantEncoding::Utf16Le => Utf16Decoder::new(false),
}
}
pub fn new_encoder(&self, encoding: &'static Encoding) -> Encoder {
match *self {
VariantEncoding::SingleByte(table, run_bmp_offset, run_byte_offset, run_length) => SingleByteEncoder::new(encoding, table, run_bmp_offset, run_byte_offset, run_length),
VariantEncoding::Utf8 => Utf8Encoder::new(encoding),
VariantEncoding::Gbk => Gb18030Encoder::new(encoding, false),
VariantEncoding::Gb18030 => Gb18030Encoder::new(encoding, true),
VariantEncoding::Big5 => Big5Encoder::new(encoding),
VariantEncoding::EucJp => EucJpEncoder::new(encoding),
VariantEncoding::Iso2022Jp => Iso2022JpEncoder::new(encoding),
VariantEncoding::ShiftJis => ShiftJisEncoder::new(encoding),
VariantEncoding::EucKr => EucKrEncoder::new(encoding),
VariantEncoding::UserDefined => UserDefinedEncoder::new(encoding),
VariantEncoding::Utf16Be | VariantEncoding::Replacement |
VariantEncoding::Utf16Le => unreachable!(),
}
}
pub fn is_single_byte(&self) -> bool {
match *self {
VariantEncoding::SingleByte(_, _, _, _) | VariantEncoding::UserDefined => true,
_ => false,
}
}
}
''')
variant_file.close()

(ffi_rs_begin, ffi_rs_end) = read_non_generated("../encoding_c/src/lib.rs")
ffi_file = open("../encoding_c/src/lib.rs", "w")
ffi_file.write(ffi_rs_begin)
ffi_file.write("""
// Instead, please regenerate using generate-encoding-data.py
/// The minimum length of buffers that may be passed to `encoding_name()`.
pub const ENCODING_NAME_MAX_LENGTH: usize = %d; // %s
""" % (longest_name_length, longest_name))
for name in preferred:
    ffi_file.write('''/// The %s encoding.
#[no_mangle]
pub static %s_ENCODING: ConstEncoding = ConstEncoding(&%s_INIT);
''' % (to_dom_name(name), to_constant_name(name), to_constant_name(name)))
ffi_file.write(ffi_rs_end)
ffi_file.close()

(single_byte_rs_begin, single_byte_rs_end) = read_non_generated("src/single_byte.rs")
single_byte_file = open("src/single_byte.rs", "w")
single_byte_file.write(single_byte_rs_begin)
single_byte_file.write("""
// Instead, please regenerate using generate-encoding-data.py
#[test]
fn test_single_byte_decode() {""")
idx = 0 # for Miri, return after 2nd test
for name in preferred:
    if name == u"ISO-8859-8-I":
        continue
    if is_single_byte(name):
        single_byte_file.write("""
decode_single_byte(%s, &data::SINGLE_BYTE_DATA.%s);""" % (to_constant_name(name), to_snake_name(name)))
        idx += 1
        if idx == 2:
            single_byte_file.write("""
if cfg!(miri) {
// Miri is too slow
return;
}""")
single_byte_file.write("""
}
#[test]
fn test_single_byte_encode() {""")
idx = 0 # for Miri, return after 2nd test
for name in preferred:
    if name == u"ISO-8859-8-I":
        continue
    if is_single_byte(name):
        single_byte_file.write("""
encode_single_byte(%s, &data::SINGLE_BYTE_DATA.%s);""" % (to_constant_name(name), to_snake_name(name)))
        idx += 1
        if idx == 2:
            single_byte_file.write("""
if cfg!(miri) {
// Miri is too slow
return;
}""")
single_byte_file.write("""
}
""")
single_byte_file.write(single_byte_rs_end)
single_byte_file.close()
static_file = open("../encoding_c/include/encoding_rs_statics.h", "w")
static_file.write("""// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py
// This file is not meant to be included directly. Instead, encoding_rs.h
// includes this file.
#ifndef encoding_rs_statics_h_
#define encoding_rs_statics_h_
#ifndef ENCODING_RS_ENCODING
#define ENCODING_RS_ENCODING Encoding
#ifndef __cplusplus
typedef struct Encoding_ Encoding;
#endif
#endif
#ifndef ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR
#define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ENCODING_RS_ENCODING*
#endif
#ifndef ENCODING_RS_ENCODER
#define ENCODING_RS_ENCODER Encoder
#ifndef __cplusplus
typedef struct Encoder_ Encoder;
#endif
#endif
#ifndef ENCODING_RS_DECODER
#define ENCODING_RS_DECODER Decoder
#ifndef __cplusplus
typedef struct Decoder_ Decoder;
#endif
#endif
#define INPUT_EMPTY 0
#define OUTPUT_FULL 0xFFFFFFFF
// %s
#define ENCODING_NAME_MAX_LENGTH %d
""" % (longest_name, longest_name_length))
for name in preferred:
    static_file.write('''/// The %s encoding.
extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const %s_ENCODING;
''' % (to_dom_name(name), to_constant_name(name)))
static_file.write("""#endif // encoding_rs_statics_h_
""")
static_file.close()
(utf_8_rs_begin, utf_8_rs_end) = read_non_generated("src/utf_8.rs")

utf_8_file = open("src/utf_8.rs", "w")
utf_8_file.write(utf_8_rs_begin)
utf_8_file.write("""
// Instead, please regenerate using generate-encoding-data.py
pub static UTF8_DATA: Utf8Data = Utf8Data {
    table: [
""")
for i in range(256):
    combined = (1 << 2)  # invalid lead
    if i < 0x80 or i > 0xBF:
        combined |= (1 << 3)  # normal trail
    if i < 0xA0 or i > 0xBF:
        combined |= (1 << 4)  # three-byte special lower bound
    if i < 0x80 or i > 0x9F:
        combined |= (1 << 5)  # three-byte special upper bound
    if i < 0x90 or i > 0xBF:
        combined |= (1 << 6)  # four-byte special lower bound
    if i < 0x80 or i > 0x8F:
        combined |= (1 << 7)  # four-byte special upper bound
    utf_8_file.write("%d," % combined)
for i in range(128, 256):
    lane = (1 << 2)  # invalid lead
    if i >= 0xC2 and i <= 0xDF:
        lane = (1 << 3)  # normal trail
    elif i == 0xE0:
        lane = (1 << 4)  # three-byte special lower bound
    elif i >= 0xE1 and i <= 0xEC:
        lane = (1 << 3)  # normal trail
    elif i == 0xED:
        lane = (1 << 5)  # three-byte special upper bound
    elif i >= 0xEE and i <= 0xEF:
        lane = (1 << 3)  # normal trail
    elif i == 0xF0:
        lane = (1 << 6)  # four-byte special lower bound
    elif i >= 0xF1 and i <= 0xF3:
        lane = (1 << 3)  # normal trail
    elif i == 0xF4:
        lane = (1 << 7)  # four-byte special upper bound
    utf_8_file.write("%d," % lane)
utf_8_file.write("""
    ],
};
""")
utf_8_file.write(utf_8_rs_end)
utf_8_file.close()
# Unit tests

TEST_HEADER = '''Any copyright to the test code below this comment is dedicated to the
Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
This is a generated file. Please do not edit.
Instead, please regenerate using generate-encoding-data.py
'''
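# JIS X 0208 (EUC-JP) test data: both bytes of a two-byte sequence are in
# 0xA1-0xFE, so pointer p corresponds to bytes (0xA1 + p // 94, 0xA1 + p % 94);
# e.g. pointer 0 is A1 A1 and pointer 94 is A2 A1.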
  1346. index = indexes["jis0208"]
  1347. jis0208_in_file = open("src/test_data/jis0208_in.txt", "w")
  1348. jis0208_in_file.write(TEST_HEADER)
  1349. for pointer in range(0, 94 * 94):
  1350. (lead, trail) = divmod(pointer, 94)
  1351. lead += 0xA1
  1352. trail += 0xA1
  1353. jis0208_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
  1354. jis0208_in_file.close()
  1355. jis0208_in_ref_file = open("src/test_data/jis0208_in_ref.txt", "w")
  1356. jis0208_in_ref_file.write(TEST_HEADER)
  1357. for pointer in range(0, 94 * 94):
  1358. code_point = index[pointer]
  1359. if code_point:
  1360. jis0208_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
  1361. else:
  1362. jis0208_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
  1363. jis0208_in_ref_file.close()
jis0208_out_file = open("src/test_data/jis0208_out.txt", "w")
jis0208_out_ref_file = open("src/test_data/jis0208_out_ref.txt", "w")
jis0208_out_file.write(TEST_HEADER)
jis0208_out_ref_file.write(TEST_HEADER)
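# Pointer 8644 and pointers 1207-1219 carry code points that also occur at
# lower pointers in the index; the encoder emits the lowest pointer for a
# code point, which is what index.index() finds, so the expected bytes are
# computed from the revised pointer.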
for pointer in range(0, 94 * 94):
    code_point = index[pointer]
    if code_point:
        revised_pointer = pointer
        if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220):
            revised_pointer = index.index(code_point)
        (lead, trail) = divmod(revised_pointer, 94)
        lead += 0xA1
        trail += 0xA1
        jis0208_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        jis0208_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
jis0208_out_file.close()
jis0208_out_ref_file.close()
shift_jis_in_file = open("src/test_data/shift_jis_in.txt", "w")
shift_jis_in_file.write(TEST_HEADER)
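# Shift_JIS packs 188 trail positions per lead: leads run 0x81-0x9F and then
# from 0xE0 upward, while trails run 0x40-0x7E and 0x80-0xFC, skipping 0x7F
# (hence the +0x41 adjustment for trail positions at or above 0x3F).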
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 188)
    lead += 0x81 if lead < 0x1F else 0xC1
    trail += 0x40 if trail < 0x3F else 0x41
    shift_jis_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
shift_jis_in_file.close()

shift_jis_in_ref_file = open("src/test_data/shift_jis_in_ref.txt", "w")
shift_jis_in_ref_file.write(TEST_HEADER)
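# Per the Encoding Standard, pointers 8836-10715 are not in the index but
# decode to the end-user-defined range U+E000-U+E757 (0xE000 - 8836 +
# pointer). For an unmapped sequence whose trail byte is ASCII, the decoder
# emits U+FFFD and then reprocesses the trail byte as ASCII, hence the
# two-character expectations below.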
for pointer in range(0, len(index)):
    code_point = 0xE000 - 8836 + pointer if pointer >= 8836 and pointer <= 10715 else index[pointer]
    if code_point:
        shift_jis_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        trail = pointer % 188
        trail += 0x40 if trail < 0x3F else 0x41
        if trail < 0x80:
            shift_jis_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
        else:
            shift_jis_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
shift_jis_in_ref_file.close()
shift_jis_out_file = open("src/test_data/shift_jis_out.txt", "w")
shift_jis_out_ref_file = open("src/test_data/shift_jis_out_ref.txt", "w")
shift_jis_out_file.write(TEST_HEADER)
shift_jis_out_ref_file.write(TEST_HEADER)
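# The Shift_JIS encoder never emits pointers 8272-8835, so the round-trip
# data comes from the ranges on either side. In the second loop, when the
# lowest pointer for a code point falls inside the excluded range, the
# original (higher) pointer is the one the encoder actually produces.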
for pointer in range(0, 8272):
    code_point = index[pointer]
    if code_point:
        revised_pointer = pointer
        if revised_pointer >= 1207 and revised_pointer < 1220:
            revised_pointer = index.index(code_point)
        (lead, trail) = divmod(revised_pointer, 188)
        lead += 0x81 if lead < 0x1F else 0xC1
        trail += 0x40 if trail < 0x3F else 0x41
        shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
for pointer in range(8836, len(index)):
    code_point = index[pointer]
    if code_point:
        revised_pointer = index.index(code_point)
        if revised_pointer >= 8272 and revised_pointer < 8836:
            revised_pointer = pointer
        (lead, trail) = divmod(revised_pointer, 188)
        lead += 0x81 if lead < 0x1F else 0xC1
        trail += 0x40 if trail < 0x3F else 0x41
        shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
shift_jis_out_file.close()
shift_jis_out_ref_file.close()
iso_2022_jp_in_file = open("src/test_data/iso_2022_jp_in.txt", "w")
iso_2022_jp_in_file.write(TEST_HEADER)
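# "\x1B$B" switches the ISO-2022-JP decoder into the JIS X 0208 state and
# "\x1B(B" switches it back to ASCII, so each test line is a self-contained,
# escape-delimited two-byte sequence.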
for pointer in range(0, 94 * 94):
    (lead, trail) = divmod(pointer, 94)
    lead += 0x21
    trail += 0x21
    iso_2022_jp_in_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
iso_2022_jp_in_file.close()

iso_2022_jp_in_ref_file = open("src/test_data/iso_2022_jp_in_ref.txt", "w")
iso_2022_jp_in_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    code_point = index[pointer]
    if code_point:
        iso_2022_jp_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        iso_2022_jp_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
iso_2022_jp_in_ref_file.close()

iso_2022_jp_out_file = open("src/test_data/iso_2022_jp_out.txt", "w")
iso_2022_jp_out_ref_file = open("src/test_data/iso_2022_jp_out_ref.txt", "w")
iso_2022_jp_out_file.write(TEST_HEADER)
iso_2022_jp_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    code_point = index[pointer]
    if code_point:
        revised_pointer = pointer
        if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220):
            revised_pointer = index.index(code_point)
        (lead, trail) = divmod(revised_pointer, 94)
        lead += 0x21
        trail += 0x21
        iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
        iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
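# The ISO-2022-JP encoder normalizes half-width katakana (U+FF61-U+FF9F) to
# the corresponding full-width code points before the index lookup, so the
# expected bytes use the jis0208 pointer of the normalized code point.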
for i in xrange(len(half_width_index)):
    code_point = i + 0xFF61
    normalized_code_point = half_width_index[i]
    pointer = index.index(normalized_code_point)
    (lead, trail) = divmod(pointer, 94)
    lead += 0x21
    trail += 0x21
    iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
    iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
iso_2022_jp_out_file.close()
iso_2022_jp_out_ref_file.close()
  1474. index = indexes["euc-kr"]
  1475. euc_kr_in_file = open("src/test_data/euc_kr_in.txt", "w")
  1476. euc_kr_in_file.write(TEST_HEADER)
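# EUC-KR uses 190 trail positions per lead: leads start at 0x81 and trails
# run contiguously from 0x41 to 0xFE, so no gap adjustment is needed.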
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 190)
    lead += 0x81
    trail += 0x41
    euc_kr_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
euc_kr_in_file.close()

euc_kr_in_ref_file = open("src/test_data/euc_kr_in_ref.txt", "w")
euc_kr_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = index[pointer]
    if code_point:
        euc_kr_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        trail = pointer % 190
        trail += 0x41
        if trail < 0x80:
            euc_kr_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
        else:
            euc_kr_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
euc_kr_in_ref_file.close()

euc_kr_out_file = open("src/test_data/euc_kr_out.txt", "w")
euc_kr_out_ref_file = open("src/test_data/euc_kr_out_ref.txt", "w")
euc_kr_out_file.write(TEST_HEADER)
euc_kr_out_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = index[pointer]
    if code_point:
        (lead, trail) = divmod(pointer, 190)
        lead += 0x81
        trail += 0x41
        euc_kr_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        euc_kr_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
euc_kr_out_file.close()
euc_kr_out_ref_file.close()
  1511. index = indexes["gb18030"]
  1512. gb18030_in_file = open("src/test_data/gb18030_in.txt", "w")
  1513. gb18030_in_file.write(TEST_HEADER)
  1514. for pointer in range(0, len(index)):
  1515. (lead, trail) = divmod(pointer, 190)
  1516. lead += 0x81
  1517. trail += 0x40 if trail < 0x3F else 0x41
  1518. gb18030_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
  1519. gb18030_in_file.close()
  1520. gb18030_in_ref_file = open("src/test_data/gb18030_in_ref.txt", "w")
  1521. gb18030_in_ref_file.write(TEST_HEADER)
  1522. for pointer in range(0, len(index)):
  1523. code_point = index[pointer]
  1524. if code_point:
  1525. gb18030_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
  1526. else:
  1527. trail = pointer % 190
  1528. trail += 0x40 if trail < 0x3F else 0x41
  1529. if trail < 0x80:
  1530. gb18030_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
  1531. else:
  1532. gb18030_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
  1533. gb18030_in_ref_file.close()
gb18030_out_file = open("src/test_data/gb18030_out.txt", "w")
gb18030_out_ref_file = open("src/test_data/gb18030_out_ref.txt", "w")
gb18030_out_file.write(TEST_HEADER)
gb18030_out_ref_file.write(TEST_HEADER)
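# Pointer 6555 appears to be a duplicate mapping in the index: the code
# point it decodes to is encoded via a different pointer, so including it
# would break the encode round-trip and it is skipped below.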
for pointer in range(0, len(index)):
    if pointer == 6555:
        continue
    code_point = index[pointer]
    if code_point:
        (lead, trail) = divmod(pointer, 190)
        lead += 0x81
        trail += 0x40 if trail < 0x3F else 0x41
        gb18030_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        gb18030_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
gb18030_out_file.close()
gb18030_out_ref_file.close()
  1550. index = indexes["big5"]
  1551. big5_in_file = open("src/test_data/big5_in.txt", "w")
  1552. big5_in_file.write(TEST_HEADER)
  1553. for pointer in range(0, len(index)):
  1554. (lead, trail) = divmod(pointer, 157)
  1555. lead += 0x81
  1556. trail += 0x40 if trail < 0x3F else 0x62
  1557. big5_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
  1558. big5_in_file.close()
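# Big5 uses 157 trail positions per lead: trails run 0x40-0x7E and then
# 0xA1-0xFE (hence the +0x62 adjustment at or above position 0x3F). Four
# pointers decode to two-code-point sequences (a base letter plus a
# combining diacritic) instead of a single code point: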
big5_two_characters = {
    1133: u"\u00CA\u0304",
    1135: u"\u00CA\u030C",
    1164: u"\u00EA\u0304",
    1166: u"\u00EA\u030C",
}

big5_in_ref_file = open("src/test_data/big5_in_ref.txt", "w")
big5_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    if pointer in big5_two_characters:
        big5_in_ref_file.write((u"%s\n" % big5_two_characters[pointer]).encode("utf-8"))
        continue
    code_point = index[pointer]
    if code_point:
        big5_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        trail = pointer % 157
        trail += 0x40 if trail < 0x3F else 0x62
        if trail < 0x80:
            big5_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
        else:
            big5_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
big5_in_ref_file.close()
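# Per the Encoding Standard, the Big5 encoder uses the *last* matching
# pointer in the index for these code points rather than the first.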
prefer_last = [
    0x2550,
    0x255E,
    0x2561,
    0x256A,
    0x5341,
    0x5345,
]

pointer_for_prefer_last = []
for code_point in prefer_last:
    # Python lists don't have .rindex() :-(
    for i in xrange(len(index) - 1, -1, -1):
        candidate = index[i]
        if candidate == code_point:
            pointer_for_prefer_last.append(i)
            break
big5_out_file = open("src/test_data/big5_out.txt", "w")
big5_out_ref_file = open("src/test_data/big5_out_ref.txt", "w")
big5_out_file.write(TEST_HEADER)
big5_out_ref_file.write(TEST_HEADER)
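# The Big5 encoder never emits pointers below (0xA1 - 0x81) * 157 (the lead
# range reserved for the Hong Kong Supplementary Character Set extensions),
# so the encode test data starts above that boundary.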
for pointer in range(((0xA1 - 0x81) * 157), len(index)):
    code_point = index[pointer]
    if code_point:
        if code_point in prefer_last:
            if pointer != pointer_for_prefer_last[prefer_last.index(code_point)]:
                continue
        else:
            if pointer != index.index(code_point):
                continue
        (lead, trail) = divmod(pointer, 157)
        lead += 0x81
        trail += 0x40 if trail < 0x3F else 0x62
        big5_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        big5_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
big5_out_file.close()
big5_out_ref_file.close()
  1618. index = indexes["jis0212"]
  1619. jis0212_in_file = open("src/test_data/jis0212_in.txt", "w")
  1620. jis0212_in_file.write(TEST_HEADER)
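# JIS X 0212 is decode-only: EUC-JP reaches it through the 0x8F lead byte,
# but the encoder never produces it, so only decoder test data is needed.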
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 94)
    lead += 0xA1
    trail += 0xA1
    jis0212_in_file.write("\x8F%s%s\n" % (chr(lead), chr(trail)))
jis0212_in_file.close()

jis0212_in_ref_file = open("src/test_data/jis0212_in_ref.txt", "w")
jis0212_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = index[pointer]
    if code_point:
        jis0212_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        jis0212_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
jis0212_in_ref_file.close()
(codepage_begin, codepage_end) = read_non_generated("../codepage/src/lib.rs")

codepage_file = open("../codepage/src/lib.rs", "w")
codepage_file.write(codepage_begin)
codepage_file.write("""
// Instead, please regenerate using generate-encoding-data.py
/// Supported code page numbers in estimated order of usage frequency
static CODE_PAGES: [u16; %d] = [
""" % len(code_pages))
for code_page in code_pages:
    codepage_file.write("    %d,\n" % code_page)
codepage_file.write("""];
/// Encodings corresponding to the code page numbers in the same order
static ENCODINGS: [&'static Encoding; %d] = [
""" % len(code_pages))
for code_page in code_pages:
    name = encodings_by_code_page[code_page]
    codepage_file.write("    &%s_INIT,\n" % to_constant_name(name))
codepage_file.write("""];
""")
codepage_file.write(codepage_end)
codepage_file.close()
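# The generated tests cover both directions of the mapping: to_encoding()
# for code page 0 (no mapping) and every supported code page, and
# from_encoding() for every preferred encoding, expecting None where no
# Windows code page exists.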
(codepage_test_begin, codepage_test_end) = read_non_generated("../codepage/src/tests.rs")

codepage_test_file = open("../codepage/src/tests.rs", "w")
codepage_test_file.write(codepage_test_begin)
codepage_test_file.write("""
// Instead, please regenerate using generate-encoding-data.py
#[test]
fn test_to_encoding() {
    assert_eq!(to_encoding(0), None);
""")
for code_page in code_pages:
    codepage_test_file.write("    assert_eq!(to_encoding(%d), Some(%s));\n" % (code_page, to_constant_name(encodings_by_code_page[code_page])))
codepage_test_file.write("""}
#[test]
fn test_from_encoding() {
""")
for name in preferred:
    if name in code_pages_by_encoding:
        codepage_test_file.write("    assert_eq!(from_encoding(%s), Some(%d));\n" % (to_constant_name(name), code_pages_by_encoding[name]))
    else:
        codepage_test_file.write("    assert_eq!(from_encoding(%s), None);\n" % to_constant_name(name))
codepage_test_file.write("""}
""")
codepage_test_file.write(codepage_test_end)
codepage_test_file.close()
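# Normalize the formatting of the generated Rust code.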
subprocess.call(["cargo", "fmt"])