12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008 |
- #!/usr/bin/python
- # Copyright Mozilla Foundation. See the COPYRIGHT
- # file at the top-level directory of this distribution.
- #
- # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
- # https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
- # <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
- # option. This file may not be copied, modified, or distributed
- # except according to those terms.
- import json
- import subprocess
- import sys
- import os.path
# Preflight: this generator reads the WHATWG encoding data and also writes
# into sibling checkouts of encoding_c and codepage. Bail out early with an
# actionable message if any required sibling checkout is missing.
for required_paths, message in [
    (("../encoding/encodings.json", "../encoding/indexes.json"),
     "This script needs a clone of https://github.com/whatwg/encoding/ (preferably at revision f381389) next to the encoding_rs directory.\n"),
    (("../encoding_c/src/lib.rs",),
     "This script also writes the generated parts of the encoding_c crate and needs a clone of https://github.com/hsivonen/encoding_c next to the encoding_rs directory.\n"),
    (("../codepage/src/lib.rs",),
     "This script also writes the generated parts of the codepage crate and needs a clone of https://github.com/hsivonen/codepage next to the encoding_rs directory.\n"),
]:
    if not all(os.path.isfile(path) for path in required_paths):
        sys.stderr.write(message)
        sys.exit(-1)
def cmp_from_end(one, other):
    # Three-way comparison (Python 2 `cmp` convention): shorter sequences sort
    # first; equal-length sequences are compared element by element starting
    # from the LAST index, so that e.g. labels sort by their endings.
    length_order = cmp(len(one), len(other))
    if length_order != 0:
        return length_order
    for i in reversed(range(len(one))):
        element_order = cmp(one[i], other[i])
        if element_order != 0:
            return element_order
    return 0
class Label:
    # Pairs one encoding label string with the preferred (canonical) name of
    # the encoding it maps to. Instances order by cmp_from_end over the label
    # text, i.e. from the end of the string.
    def __init__(self, label, preferred):
        self.preferred = preferred
        self.label = label

    def __cmp__(self, other):
        return cmp_from_end(self.label, other.label)
class CodePage:
    # Pairs a Windows code page number with the preferred (canonical) name of
    # the encoding it maps to. Instances order numerically by code page.
    def __init__(self, code_page, preferred):
        self.code_page = code_page
        self.preferred = preferred

    def __cmp__(self, other):
        # BUG FIX: the original returned the tuple
        # `self.code_page, other.code_page`, but Python 2's __cmp__ protocol
        # requires a negative/zero/positive int; returning a tuple makes
        # sorting CodePage instances raise TypeError. Use cmp() like
        # Label.__cmp__ above does.
        return cmp(self.code_page, other.code_page)
def static_u16_table(name, data):
    # Emits `pub static NAME: [u16; N] = [...]` into data.rs, one
    # zero-padded 0xXXXX literal per line.
    data_file.write('''pub static %s: [u16; %d] = [
''' % (name, len(data)))
    for value in data:
        data_file.write('0x%04X,\n' % value)
    data_file.write('''];
''')
def static_u16_table_from_indexable(name, data, item, feature):
    # Emits a u16 table of `entry[item]` for each entry in `data`, gated so it
    # is compiled only when the "less-slow" feature is on and the "fast"
    # feature (which supersedes it) is off.
    data_file.write('''#[cfg(all(
feature = "less-slow-%s",
not(feature = "fast-%s")
))]
static %s: [u16; %d] = [
''' % (feature, feature, name, len(data)))
    for entry in data:
        data_file.write('0x%04X,\n' % entry[item])
    data_file.write('''];
''')
def static_u8_pair_table_from_indexable(name, data, item, feature):
    # Emits a [[u8; 2]; N] table of the byte pair `entry[item]` for each entry
    # in `data`, under the same less-slow/fast feature gate as the u16 variant.
    data_file.write('''#[cfg(all(
feature = "less-slow-%s",
not(feature = "fast-%s")
))]
static %s: [[u8; 2]; %d] = [
''' % (feature, feature, name, len(data)))
    for entry in data:
        data_file.write('[0x%02X, 0x%02X],\n' % entry[item])
    data_file.write('''];
''')
def static_u8_pair_table(name, data, feature):
    # Emits a [[u8; 2]; N] table gated on `feature`. Entries that are None
    # (unmapped cells) are written as [0x00, 0x00].
    data_file.write('''#[cfg(feature = "%s")]
static %s: [[u8; 2]; %d] = [
''' % (feature, name, len(data)))
    for pair in data:
        data_file.write('[0x%02X, 0x%02X],\n' % (pair if pair else (0, 0)))
    data_file.write('''];
''')
# Accumulators filled while walking encodings.json below:
# canonical (preferred) encoding names
preferred = []
# DOM-visible names (identical to preferred names for this data set)
dom = []
# every label paired with its preferred name (Label instances)
labels = []
# Parsed WHATWG encoding data; the sibling checkout was verified to exist at
# the top of the script.
data = json.load(open("../encoding/encodings.json", "r"))
indexes = json.load(open("../encoding/indexes.json", "r"))
# Encoding records, split on the "Legacy single-byte encodings" heading.
single_byte = []
multi_byte = []
def to_camel_name(name):
    # Turns a WHATWG encoding name into the CamelCase identifier used for the
    # VariantEncoding variant in the generated Rust.
    # ISO-8859-8-I needs an identifier distinct from ISO-8859-8's.
    if name == u"iso-8859-8-i":
        return u"Iso8I"
    if name.startswith(u"iso-8859-"):
        return name.replace(u"iso-8859-", u"Iso")
    camel = name.title()
    # Strip the "X-" vendor prefix and all separators after title-casing.
    for junk in (u"X-", u"-", u"_"):
        camel = camel.replace(junk, u"")
    return camel
def to_constant_name(name):
    # SCREAMING_SNAKE_CASE identifier for the generated Rust statics.
    return name.upper().replace(u"-", u"_")
def to_snake_name(name):
    # snake_case identifier for struct fields in the generated Rust.
    return name.lower().replace(u"-", u"_")
def to_dom_name(name):
    # The WHATWG name already is the DOM-visible name; this exists as a named
    # hook so call sites read as an explicit conversion.
    return name
# Guestimate based on
# https://w3techs.com/technologies/overview/character_encoding/all
# whose methodology is known to be bogus, but the results are credible for
# this purpose. UTF-16LE lifted up due to prevalence on Windows and
# "ANSI codepages" prioritized.
# The order here (most frequent first) determines the search order of the
# code-page tables generated for the codepage crate.
encodings_by_code_page_frequency = [
    "UTF-8",
    "UTF-16LE",
    "windows-1252",
    "windows-1251",
    "GBK",
    "Shift_JIS",
    "EUC-KR",
    "windows-1250",
    "windows-1256",
    "windows-1254",
    "Big5",
    "windows-874",
    "windows-1255",
    "windows-1253",
    "windows-1257",
    "windows-1258",
    "EUC-JP",
    "ISO-8859-2",
    "ISO-8859-15",
    "ISO-8859-7",
    "KOI8-R",
    "gb18030",
    "ISO-8859-5",
    "ISO-8859-8-I",
    "ISO-8859-4",
    "ISO-8859-6",
    "ISO-2022-JP",
    "KOI8-U",
    "ISO-8859-13",
    "ISO-8859-3",
    "UTF-16BE",
    "IBM866",
    "ISO-8859-10",
    "ISO-8859-8",
    "macintosh",
    "x-mac-cyrillic",
    "ISO-8859-14",
    "ISO-8859-16",
]
# Windows code page number -> canonical encoding name, for the code pages
# that correspond one-to-one to an encoding supported here.
encodings_by_code_page = {
    932: "Shift_JIS",
    936: "GBK",
    949: "EUC-KR",
    950: "Big5",
    866: "IBM866",
    874: "windows-874",
    1200: "UTF-16LE",
    1201: "UTF-16BE",
    1250: "windows-1250",
    1251: "windows-1251",
    1252: "windows-1252",
    1253: "windows-1253",
    1254: "windows-1254",
    1255: "windows-1255",
    1256: "windows-1256",
    1257: "windows-1257",
    1258: "windows-1258",
    10000: "macintosh",
    10017: "x-mac-cyrillic",
    20866: "KOI8-R",
    20932: "EUC-JP",
    21866: "KOI8-U",
    28592: "ISO-8859-2",
    28593: "ISO-8859-3",
    28594: "ISO-8859-4",
    28595: "ISO-8859-5",
    28596: "ISO-8859-6",
    28597: "ISO-8859-7",
    28598: "ISO-8859-8",
    28600: "ISO-8859-10",
    28603: "ISO-8859-13",
    28604: "ISO-8859-14",
    28605: "ISO-8859-15",
    28606: "ISO-8859-16",
    38598: "ISO-8859-8-I",
    50221: "ISO-2022-JP",
    54936: "gb18030",
    65001: "UTF-8",
}
# Inverted mapping: canonical encoding name -> primary code page number.
# (iteritems() — this script runs under Python 2.)
code_pages_by_encoding = {}
for code_page, encoding in encodings_by_code_page.iteritems():
    code_pages_by_encoding[encoding] = code_page
# Code pages that decode like one of the canonical code pages above, or that
# map to the replacement encoding because the original encoding is
# intentionally unsupported.
encoding_by_alias_code_page = {
    951: "Big5",
    10007: "x-mac-cyrillic",
    20936: "GBK",
    20949: "EUC-KR",
    21010: "UTF-16LE", # Undocumented; needed by calamine for Excel compat
    28591: "windows-1252",
    28599: "windows-1254",
    28601: "windows-874",
    50220: "ISO-2022-JP",
    50222: "ISO-2022-JP",
    50225: "replacement", # ISO-2022-KR
    50227: "replacement", # ISO-2022-CN
    # BUG FIX: this entry was keyed 51949, duplicating the EUC-KR key below,
    # so the EUC-JP alias was silently dropped from the dict literal.
    # Windows code page 51932 is EUC-JP.
    51932: "EUC-JP",
    51936: "GBK",
    51949: "EUC-KR",
    52936: "replacement", # HZ
}
# All supported code pages, ordered by encoding frequency first so that the
# generated codepage crate tables are searched in frequency order.
code_pages = []
for name in encodings_by_code_page_frequency:
    code_pages.append(code_pages_by_encoding[name])
# Fold the alias code pages into the main mapping, then append any code page
# not already present, in numeric order.
encodings_by_code_page.update(encoding_by_alias_code_page)
# Python 2: dict.keys() returns a list, which can be sorted in place.
temp_keys = encodings_by_code_page.keys()
temp_keys.sort()
for code_page in temp_keys:
    if not code_page in code_pages:
        code_pages.append(code_page)
# The position in the index (0 is the first index entry,
# i.e. byte value 0x80) that starts the longest run of
# consecutive code points. Must not be in the first
# quadrant. If the character to be encoded is not in this
# run, the part of the index after the run is searched
# forward. Then the part of the index from 32 to the start
# of the run. The first quadrant is searched last.
#
# If there is no obviously most useful longest run,
# the index here is just used to affect the search order.
start_of_longest_run_in_single_byte = {
    "IBM866": 96, # 0 would be longest, but we don't want to start in the first quadrant
    "windows-874": 33,
    "windows-1250": 92,
    "windows-1251": 64,
    "windows-1252": 32,
    "windows-1253": 83,
    "windows-1254": 95,
    "windows-1255": 96,
    "windows-1256": 65,
    "windows-1257": 95, # not actually longest
    "windows-1258": 95, # not actually longest
    "macintosh": 106, # useless
    "x-mac-cyrillic": 96,
    "KOI8-R": 64, # not actually longest
    "KOI8-U": 64, # not actually longest
    "ISO-8859-2": 95, # not actually longest
    "ISO-8859-3": 95, # not actually longest
    "ISO-8859-4": 95, # not actually longest
    "ISO-8859-5": 46,
    "ISO-8859-6": 65,
    "ISO-8859-7": 83,
    "ISO-8859-8": 96,
    "ISO-8859-10": 90, # not actually longest
    "ISO-8859-13": 95, # not actually longest
    "ISO-8859-14": 95,
    "ISO-8859-15": 63,
    "ISO-8859-16": 95, # not actually longest
}
# Walk encodings.json: split records into single-byte vs. multi-byte groups
# and collect the canonical names and all their labels.
for group in data:
    if group["heading"] == "Legacy single-byte encodings":
        single_byte = group["encodings"]
    else:
        multi_byte.extend(group["encodings"])
    for encoding in group["encodings"]:
        preferred.append(encoding["name"])
        for label in encoding["labels"]:
            labels.append(Label(label, encoding["name"]))
for name in preferred:
    dom.append(to_dom_name(name))
preferred.sort()
# Label.__cmp__ sorts labels by cmp_from_end; dom is sorted the same way.
labels.sort()
dom.sort(cmp=cmp_from_end)
# Track the longest label and the longest name for the generated
# LONGEST_LABEL_LENGTH constant and related bookkeeping.
longest_label_length = 0
longest_name_length = 0
longest_label = None
longest_name = None
for name in preferred:
    if len(name) > longest_name_length:
        longest_name_length = len(name)
        longest_name = name
for label in labels:
    if len(label.label) > longest_label_length:
        longest_label_length = len(label.label)
        longest_label = label.label
def longest_run_for_single_byte(name):
    # Finds the run of consecutive code points in a single-byte index that
    # starts at the hand-picked offset in start_of_longest_run_in_single_byte.
    # Returns (first code point of the run, index position of the run start,
    # run length in index entries).
    # ISO-8859-8-I shares ISO-8859-8's index.
    if name == u"ISO-8859-8-I":
        name = u"ISO-8859-8"
    index = indexes[name.lower()]
    run_byte_offset = start_of_longest_run_in_single_byte[name]
    run_bmp_offset = index[run_byte_offset]
    run_length = 1
    expected = run_bmp_offset + 1
    # Extend the run while entries keep incrementing by exactly one; an
    # unmapped (null) entry or a jump terminates it. The slice naturally
    # stops at the end of the index.
    for code_point in index[run_byte_offset + 1:]:
        if code_point != expected:
            break
        expected += 1
        run_length += 1
    return (run_bmp_offset, run_byte_offset, run_length)
def is_single_byte(name):
    # True iff `name` is one of the legacy single-byte encodings collected
    # from encodings.json.
    return any(encoding["name"] == name for encoding in single_byte)
def read_non_generated(path):
    # Reads a partially generated file and returns a pair:
    # (everything up to and including the BEGIN marker,
    #  everything from the END marker to EOF).
    # The generated middle is discarded so it can be rewritten.
    # Exits the process if either marker is missing.
    with open(path, "r") as partially_generated_file:
        full = partially_generated_file.read()

    def find_marker(marker, which):
        # Locates `marker` or aborts with a message naming the missing marker.
        marker_index = full.find(marker)
        if marker_index < 0:
            sys.stderr.write("Can't find generated code %s marker in %s. Exiting.\n" % (which, path))
            sys.exit(-1)
        return marker_index

    generated_begin = "// BEGIN GENERATED CODE. PLEASE DO NOT EDIT."
    generated_begin_index = find_marker(generated_begin, "start")
    generated_end_index = find_marker("// END GENERATED CODE", "end")
    return (full[0:generated_begin_index + len(generated_begin)],
            full[generated_end_index:])
# --- Generate the encoding statics and label tables in src/lib.rs ---
# Keep the hand-written prefix/suffix and regenerate only the middle.
(lib_rs_begin, lib_rs_end) = read_non_generated("src/lib.rs")
label_file = open("src/lib.rs", "w")
label_file.write(lib_rs_begin)
label_file.write("""
// Instead, please regenerate using generate-encoding-data.py
const LONGEST_LABEL_LENGTH: usize = %d; // %s
""" % (longest_label_length, longest_label))
# One FOO_INIT static plus one FOO reference static per encoding, in sorted
# preferred-name order.
for name in preferred:
    variant = None
    if is_single_byte(name):
        # Single-byte encodings share a common VariantEncoding constructor
        # parameterized by their index data and longest consecutive run.
        (run_bmp_offset, run_byte_offset, run_length) = longest_run_for_single_byte(name)
        # ISO-8859-8-I reuses ISO-8859-8's data field.
        variant = "SingleByte(&data::SINGLE_BYTE_DATA.%s, 0x%04X, %d, %d)" % (to_snake_name(u"iso-8859-8" if name == u"ISO-8859-8-I" else name), run_bmp_offset, run_byte_offset, run_length)
    else:
        variant = to_camel_name(name)
    # Per-encoding rustdoc text lives in doc/<name>.txt and is spliced in.
    docfile = open("doc/%s.txt" % name, "r")
    doctext = docfile.read()
    docfile.close()
    label_file.write('''/// The initializer for the [%s](static.%s.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static %s_INIT: Encoding = Encoding {
name: "%s",
variant: VariantEncoding::%s,
};
/// The %s encoding.
///
%s///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static %s: &'static Encoding = &%s_INIT;
''' % (to_dom_name(name), to_constant_name(name), to_constant_name(name), to_dom_name(name), variant, to_dom_name(name), doctext, to_constant_name(name), to_constant_name(name)))
# Parallel arrays: sorted labels and, at the same positions, the encodings
# they resolve to (used by Encoding::for_label's binary search).
label_file.write("""static LABELS_SORTED: [&'static str; %d] = [
""" % len(labels))
for label in labels:
    label_file.write('''"%s",\n''' % label.label)
label_file.write("""];
static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; %d] = [
""" % len(labels))
for label in labels:
    label_file.write('''&%s_INIT,\n''' % to_constant_name(label.preferred))
label_file.write('''];
''')
label_file.write(lib_rs_end)
label_file.close()
# --- Generate the exhaustive label-resolution test ---
label_test_file = open("src/test_labels_names.rs", "w")
label_test_file.write('''// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py
use super::*;
#[test]
fn test_all_labels() {
''')
for label in labels:
    label_test_file.write('''assert_eq!(Encoding::for_label(b"%s"), Some(%s));\n''' % (label.label, to_constant_name(label.preferred)))
label_test_file.write('''}
''')
label_test_file.close()
def null_to_zero(code_point):
    # The JSON index files use null for unmapped cells; the generated Rust
    # tables represent those as 0.
    return code_point if code_point else 0
# --- Generate src/data.rs: lookup tables for all legacy encodings ---
(data_rs_begin, data_rs_end) = read_non_generated("src/data.rs")
data_file = open("src/data.rs", "w")
data_file.write(data_rs_begin)
data_file.write('''
// Instead, please regenerate using generate-encoding-data.py
#[repr(align(64))] // Align to cache lines
pub struct SingleByteData {
''')
# Single-byte: one 128-entry field per encoding (upper half of the byte
# range; the lower half is ASCII). ISO-8859-8-I shares ISO-8859-8's field.
for encoding in single_byte:
    name = encoding["name"]
    if name == u"ISO-8859-8-I":
        continue
    data_file.write(''' pub %s: [u16; 128],
''' % to_snake_name(name))
data_file.write('''}
pub static SINGLE_BYTE_DATA: SingleByteData = SingleByteData {
''')
# Second pass: emit the actual table contents, nulls flattened to 0.
for encoding in single_byte:
    name = encoding["name"]
    if name == u"ISO-8859-8-I":
        continue
    data_file.write(''' %s: [
''' % to_snake_name(name))
    for code_point in indexes[name.lower()]:
        data_file.write('0x%04X,\n' % null_to_zero(code_point))
    data_file.write('''],
''')
data_file.write('''};
''')
# --- Big5 ---
index = indexes["big5"]
# Split each mapped code point into a one-bit "is astral" flag and its low
# 16 bits, so the main table stays u16-sized.
astralness = []
low_bits = []
for code_point in index[942:19782]:
    if code_point:
        astralness.append(1 if code_point > 0xFFFF else 0)
        low_bits.append(code_point & 0xFFFF)
    else:
        astralness.append(0)
        low_bits.append(0)
# pad length to multiple of 32
# NOTE(review): if the length were already a multiple of 32, this would pad a
# full extra 32 bits; with the fixed 942:19782 slice (18840 entries) only 8
# padding bits are added, so it does not matter here.
for j in xrange(32 - (len(astralness) % 32)):
    astralness.append(0)
# Pack the astralness bits 32 per u32, LSB first.
# (len(astralness) / 32 relies on Python 2 integer division.)
data_file.write('''#[cfg_attr(feature = "cargo-clippy", allow(unreadable_literal))]
static BIG5_ASTRALNESS: [u32; %d] = [
''' % (len(astralness) / 32))
i = 0
while i < len(astralness):
    accu = 0
    for j in xrange(32):
        accu |= astralness[i + j] << j
    data_file.write('0x%08X,\n' % accu)
    i += 32
data_file.write('''];
''')
static_u16_table("BIG5_LOW_BITS", low_bits)
# Encoder table for Level 1 Hanzi
# Note: If we were OK with doubling this table, we
# could use a directly-indexable table instead...
level1_hanzi_index = index[5495:10896]
level1_hanzi_pairs = []
for i in xrange(len(level1_hanzi_index)):
    # Pointer -> (lead, trail) byte pair; i / 157 is Python 2 int division.
    hanzi_lead = (i / 157) + 0xA4
    hanzi_trail = (i % 157)
    # Trail bytes skip the 0x7F..0xA0 gap.
    hanzi_trail += 0x40 if hanzi_trail < 0x3F else 0x62
    level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail)))
# Hand-picked extension mappings appended before sorting by code point.
level1_hanzi_pairs.append((0x4E5A, (0xC8, 0x7B)))
level1_hanzi_pairs.append((0x5202, (0xC8, 0x7D)))
level1_hanzi_pairs.append((0x9FB0, (0xC8, 0xA1)))
level1_hanzi_pairs.append((0x5188, (0xC8, 0xA2)))
level1_hanzi_pairs.append((0x9FB1, (0xC8, 0xA3)))
level1_hanzi_pairs.sort(key=lambda x: x[0])
static_u16_table_from_indexable("BIG5_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "big5-hanzi-encode")
static_u8_pair_table_from_indexable("BIG5_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "big5-hanzi-encode")
# Fast Unified Ideograph encode
# Directly-indexable (by code point - 0x4E00) table of byte pairs.
big5_unified_ideograph_bytes = [None] * (0x9FCC - 0x4E00)
for row in xrange(0x7E - 0x20):
    for column in xrange(157):
        pointer = 5024 + column + (row * 157)
        code_point = index[pointer]
        if code_point and code_point >= 0x4E00 and code_point <= 0x9FCB:
            unified_offset = code_point - 0x4E00
            unified_lead = 0xA1 + row
            unified_trail = (0x40 if column < 0x3F else 0x62) + column
            # 0x5341/0x5345 prefer the later mapping; everything else keeps
            # the first mapping seen.
            if code_point == 0x5341 or code_point == 0x5345 or not big5_unified_ideograph_bytes[unified_offset]:
                big5_unified_ideograph_bytes[unified_offset] = (unified_lead, unified_trail)
static_u8_pair_table("BIG5_UNIFIED_IDEOGRAPH_BYTES", big5_unified_ideograph_bytes, "fast-big5-hanzi-encode")
# --- JIS X 0208 ---
index = indexes["jis0208"]
# JIS 0208 Level 1 Kanji
static_u16_table("JIS0208_LEVEL1_KANJI", index[1410:4375])
# JIS 0208 Level 2 Kanji and Additional Kanji
static_u16_table("JIS0208_LEVEL2_AND_ADDITIONAL_KANJI", index[4418:7808])
# IBM Kanji
static_u16_table("IBM_KANJI", index[8272:8632])
# Check that the other instance is the same
# NOTE(review): `Error` is not defined anywhere in this script, so this
# actually raises NameError if it fires — which still aborts, but the name
# should probably be Exception.
if index[8272:8632] != index[10744:11104]:
    raise Error()
# JIS 0208 symbols (all non-Kanji, non-range items)
# Scan the pointer ranges below and compress runs of mapped entries into
# triples of (start pointer, length, start offset into symbol_index).
symbol_index = []
symbol_triples = []
pointers_to_scan = [
    (0, 188),
    (658, 691),
    (1159, 1221),
]
in_run = False
run_start_pointer = 0
run_start_array_index = 0
for (start, end) in pointers_to_scan:
    for i in range(start, end):
        code_point = index[i]
        if in_run:
            if code_point:
                symbol_index.append(code_point)
            else:
                # Run ended at an unmapped cell; record the triple.
                symbol_triples.append(run_start_pointer)
                symbol_triples.append(i - run_start_pointer)
                symbol_triples.append(run_start_array_index)
                in_run = False
        else:
            if code_point:
                in_run = True
                run_start_pointer = i
                run_start_array_index = len(symbol_index)
                symbol_index.append(code_point)
    # Flush a run that reaches the end of the scanned range.
    if in_run:
        symbol_triples.append(run_start_pointer)
        symbol_triples.append(end - run_start_pointer)
        symbol_triples.append(run_start_array_index)
        in_run = False
# Sanity check: no run may be left open here.
if in_run:
    raise Error()
# Now add manually the two overlapping slices of
# index from the NEC/IBM extensions.
run_start_array_index = len(symbol_index)
symbol_index.extend(index[10736:10744])
# Later
symbol_triples.append(10736)
symbol_triples.append(8)
symbol_triples.append(run_start_array_index)
# Earlier
symbol_triples.append(8644)
symbol_triples.append(4)
symbol_triples.append(run_start_array_index)
static_u16_table("JIS0208_SYMBOLS", symbol_index)
static_u16_table("JIS0208_SYMBOL_TRIPLES", symbol_triples)
# Write down the magic numbers needed when preferring the earlier case
data_file.write('''const IBM_SYMBOL_START: usize = %d;''' % (run_start_array_index + 1))
data_file.write('''const IBM_SYMBOL_END: usize = %d;''' % (run_start_array_index + 4))
data_file.write('''const IBM_SYMBOL_POINTER_START: usize = %d;''' % 8645)
# JIS 0208 ranges (excluding kana)
# Compress runs of CONSECUTIVE code points into triples of
# (start pointer, length, start code point); a gap in code point values
# starts a new run even without an unmapped cell.
range_triples = []
pointers_to_scan = [
    (188, 281),
    (470, 657),
    (1128, 1159),
    (8634, 8644),
    (10716, 10736),
]
in_run = False
run_start_pointer = 0
run_start_code_point = 0
previous_code_point = 0
for (start, end) in pointers_to_scan:
    for i in range(start, end):
        code_point = index[i]
        if in_run:
            if code_point:
                if previous_code_point + 1 != code_point:
                    # Non-consecutive value: close this run, start a new one.
                    range_triples.append(run_start_pointer)
                    range_triples.append(i - run_start_pointer)
                    range_triples.append(run_start_code_point)
                    run_start_pointer = i
                    run_start_code_point = code_point
                previous_code_point = code_point
            else:
                # Unmapped cell ends the run.
                range_triples.append(run_start_pointer)
                range_triples.append(i - run_start_pointer)
                range_triples.append(run_start_code_point)
                run_start_pointer = 0
                run_start_code_point = 0
                previous_code_point = 0
                in_run = False
        else:
            if code_point:
                in_run = True
                run_start_pointer = i
                run_start_code_point = code_point
                previous_code_point = code_point
    # Flush a run that reaches the end of the scanned range.
    if in_run:
        range_triples.append(run_start_pointer)
        range_triples.append(end - run_start_pointer)
        range_triples.append(run_start_code_point)
        run_start_pointer = 0
        run_start_code_point = 0
        previous_code_point = 0
        in_run = False
# Sanity check; see the NOTE(review) about `Error` earlier in this file.
if in_run:
    raise Error()
static_u16_table("JIS0208_RANGE_TRIPLES", range_triples)
# Encoder table for Level 1 Kanji
# Note: If we were OK with 30 KB more footprint, we
# could use a directly-indexable table instead...
level1_kanji_index = index[1410:4375]
level1_kanji_pairs = []
for i in xrange(len(level1_kanji_index)):
    pointer = 1410 + i
    # Shift_JIS lead/trail derivation from the pointer.
    (lead, trail) = divmod(pointer, 188)
    lead += 0x81 if lead < 0x1F else 0xC1
    trail += 0x40 if trail < 0x3F else 0x41
    level1_kanji_pairs.append((level1_kanji_index[i], (lead, trail)))
level1_kanji_pairs.sort(key=lambda x: x[0])
static_u16_table_from_indexable("JIS0208_LEVEL1_KANJI_CODE_POINTS", level1_kanji_pairs, 0, "kanji-encode")
static_u8_pair_table_from_indexable("JIS0208_LEVEL1_KANJI_SHIFT_JIS_BYTES", level1_kanji_pairs, 1, "kanji-encode")
# Fast encoder table for Kanji
# Directly indexable by code point - 0x4E00.
kanji_bytes = [None] * (0x9FA1 - 0x4E00)
for pointer in xrange(len(index)):
    code_point = index[pointer]
    if code_point and code_point >= 0x4E00 and code_point <= 0x9FA0:
        (lead, trail) = divmod(pointer, 188)
        lead += 0x81 if lead < 0x1F else 0xC1
        trail += 0x40 if trail < 0x3F else 0x41
        # unset the high bit of lead if IBM Kanji
        if pointer >= 8272:
            lead = lead & 0x7F
        kanji_bytes[code_point - 0x4E00] = (lead, trail)
static_u8_pair_table("JIS0208_KANJI_BYTES", kanji_bytes, "fast-kanji-encode")
# ISO-2022-JP half-width katakana
# index is still jis0208
half_width_index = indexes["iso-2022-jp-katakana"]
data_file.write('''pub static ISO_2022_JP_HALF_WIDTH_TRAIL: [u8; %d] = [
''' % len(half_width_index))
for i in xrange(len(half_width_index)):
    code_point = half_width_index[i]
    # Find the full-width counterpart's pointer and derive the trail byte.
    pointer = index.index(code_point)
    trail = pointer % 94 + 0x21
    data_file.write('0x%02X,\n' % trail)
data_file.write('''];
''')
# --- EUC-KR ---
index = indexes["euc-kr"]
# Unicode 1.1 Hangul above the old KS X 1001 block
# Compressed form takes 35% of uncompressed form
# Record (pointer, code point) pairs only where the code point sequence
# jumps, i.e. delta-compress the monotonically increasing mapping.
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(0x20):
    for column in xrange(190):
        i = column + (row * 190)
        # Skip the gaps
        if (column >= 0x1A and column < 0x20) or (column >= 0x3A and column < 0x40):
            continue
        code_point = index[i]
        # The mapping must be monotonically increasing; see the earlier
        # NOTE(review): `Error` is undefined, so this would raise NameError.
        if previous_code_point > code_point:
            raise Error()
        if code_point - previous_code_point != 1:
            # Adjust the pointer so gap columns don't count.
            adjustment = 0
            if column >= 0x40:
                adjustment = 12
            elif column >= 0x20:
                adjustment = 6
            pointers.append(column - adjustment + (row * (190 - 12)))
            offsets.append(code_point)
        previous_code_point = code_point
static_u16_table("CP949_TOP_HANGUL_POINTERS", pointers)
static_u16_table("CP949_TOP_HANGUL_OFFSETS", offsets)
# Unicode 1.1 Hangul to the left of the old KS X 1001 block
# Same delta-compression as above, for the narrower left-hand region.
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(0x46 - 0x20):
    for column in xrange(190 - 94):
        i = 6080 + column + (row * 190)
        # Skip the gaps
        if (column >= 0x1A and column < 0x20) or (column >= 0x3A and column < 0x40):
            continue
        if i > 13127:
            # Exclude unassigned on partial last row
            break
        code_point = index[i]
        if previous_code_point > code_point:
            raise Error()
        if code_point - previous_code_point != 1:
            adjustment = 0
            if column >= 0x40:
                adjustment = 12
            elif column >= 0x20:
                adjustment = 6
            pointers.append(column - adjustment + (row * (190 - 94 - 12)))
            offsets.append(code_point)
        previous_code_point = code_point
static_u16_table("CP949_LEFT_HANGUL_POINTERS", pointers)
static_u16_table("CP949_LEFT_HANGUL_OFFSETS", offsets)
# KS X 1001 Hangul
# Strictly increasing; stored as a plain lookup table.
hangul_index = []
previous_code_point = 0
for row in xrange(0x48 - 0x2F):
    for column in xrange(94):
        code_point = index[9026 + column + (row * 190)]
        if previous_code_point >= code_point:
            raise Error()
        hangul_index.append(code_point)
        previous_code_point = code_point
static_u16_table("KSX1001_HANGUL", hangul_index)
# KS X 1001 Hanja
hanja_index = []
for row in xrange(0x7D - 0x49):
    for column in xrange(94):
        hanja_index.append(index[13966 + column + (row * 190)])
static_u16_table("KSX1001_HANJA", hanja_index)
- # KS X 1001 symbols
- symbol_index = []
- for i in range(6176, 6270):
- symbol_index.append(index[i])
- for i in range(6366, 6437):
- symbol_index.append(index[i])
- static_u16_table("KSX1001_SYMBOLS", symbol_index)
- # KS X 1001 Uppercase Latin
- subindex = []
- for i in range(7506, 7521):
- subindex.append(null_to_zero(index[i]))
- static_u16_table("KSX1001_UPPERCASE", subindex)
- # KS X 1001 Lowercase Latin
- subindex = []
- for i in range(7696, 7712):
- subindex.append(index[i])
- static_u16_table("KSX1001_LOWERCASE", subindex)
- # KS X 1001 Box drawing
- subindex = []
- for i in range(7126, 7194):
- subindex.append(index[i])
- static_u16_table("KSX1001_BOX", subindex)
# KS X 1001 other
# Run-compressed representation: a pointer is recorded whenever the code
# point sequence is not contiguous with the previous cell, together with
# the code point that starts the new run.
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(10):
  for column in xrange(94):
    i = 6556 + column + (row * 190)
    code_point = index[i]
    # Exclude ranges that were processed as lookup tables
    # or that contain unmapped cells by filling them with
    # ASCII. Upon encode, ASCII code points will
    # never appear as the search key.
    if (i >= 6946 and i <= 6950):
      code_point = i - 6946
    elif (i >= 6961 and i <= 6967):
      code_point = i - 6961
    elif (i >= 6992 and i <= 6999):
      code_point = i - 6992
    elif (i >= 7024 and i <= 7029):
      code_point = i - 7024
    elif (i >= 7126 and i <= 7219):
      code_point = i - 7126
    elif (i >= 7395 and i <= 7409):
      code_point = i - 7395
    elif (i >= 7506 and i <= 7521):
      code_point = i - 7506
    elif (i >= 7696 and i <= 7711):
      code_point = i - 7696
    elif (i >= 7969 and i <= 7979):
      code_point = i - 7969
    elif (i >= 8162 and i <= 8169):
      code_point = i - 8162
    elif (i >= 8299 and i <= 8313):
      code_point = i - 8299
    elif (i >= 8347 and i <= 8359):
      code_point = i - 8347
    if code_point - previous_code_point != 1:
      pointers.append(column + (row * 94))
      offsets.append(code_point)
    previous_code_point = code_point
static_u16_table("KSX1001_OTHER_POINTERS", pointers)
# Omit the last offset, because the end of the last line
# is unmapped, so we don't want to look at it.
static_u16_table("KSX1001_OTHER_UNSORTED_OFFSETS", offsets[:-1])
# Fast Hangul and Hanja encode
# Direct code-point-indexed tables mapping each encodable code point to its
# (lead, trail) byte pair; cells stay None for code points outside the index.
hangul_bytes = [None] * (0xD7A4 - 0xAC00)
hanja_unified_bytes = [None] * (0x9F9D - 0x4E00)
hanja_compatibility_bytes = [None] * (0xFA0C - 0xF900)
for row in xrange(0x7D):
  for column in xrange(190):
    pointer = column + (row * 190)
    code_point = index[pointer]
    if code_point:
      lead = 0x81 + row
      trail = 0x41 + column
      if code_point >= 0xAC00 and code_point < 0xD7A4:
        hangul_bytes[code_point - 0xAC00] = (lead, trail)
      elif code_point >= 0x4E00 and code_point < 0x9F9D:
        hanja_unified_bytes[code_point - 0x4E00] = (lead, trail)
      elif code_point >= 0xF900 and code_point < 0xFA0C:
        hanja_compatibility_bytes[code_point - 0xF900] = (lead, trail)
static_u8_pair_table("CP949_HANGUL_BYTES", hangul_bytes, "fast-hangul-encode")
static_u8_pair_table("KSX1001_UNIFIED_HANJA_BYTES", hanja_unified_bytes, "fast-hanja-encode")
static_u8_pair_table("KSX1001_COMPATIBILITY_HANJA_BYTES", hanja_compatibility_bytes, "fast-hanja-encode")
# JIS 0212
index = indexes["jis0212"]
# JIS 0212 Kanji
static_u16_table("JIS0212_KANJI", index[1410:7211])
# JIS 0212 accented (all non-Kanji, non-range items)
# Runs of mapped pointers are stored as (start pointer, length, start index
# into the flat symbol array) triples; single-cell gaps inside a run are
# kept as 0 entries so the run does not have to be split.
symbol_index = []
symbol_triples = []
pointers_to_scan = [
  (0, 596),
  (608, 644),
  (656, 1409),
]
in_run = False
run_start_pointer = 0
run_start_array_index = 0
for (start, end) in pointers_to_scan:
  for i in range(start, end):
    code_point = index[i]
    if in_run:
      if code_point:
        symbol_index.append(code_point)
      elif index[i + 1]:
        # One-cell hole followed by a mapped cell: keep the run open.
        symbol_index.append(0)
      else:
        # Two consecutive holes: close the run.
        symbol_triples.append(run_start_pointer)
        symbol_triples.append(i - run_start_pointer)
        symbol_triples.append(run_start_array_index)
        in_run = False
    else:
      if code_point:
        in_run = True
        run_start_pointer = i
        run_start_array_index = len(symbol_index)
        symbol_index.append(code_point)
  # Flush a run that extends to the end of the scanned range.
  if in_run:
    symbol_triples.append(run_start_pointer)
    symbol_triples.append(end - run_start_pointer)
    symbol_triples.append(run_start_array_index)
    in_run = False
if in_run:
  raise Error()
static_u16_table("JIS0212_ACCENTED", symbol_index)
static_u16_table("JIS0212_ACCENTED_TRIPLES", symbol_triples)
# gb18030
index = indexes["gb18030"]
# Unicode 1.1 ideographs above the old GB2312 block
# Compressed form takes 63% of uncompressed form
pointers = []
offsets = []
previous_code_point = 0
for i in xrange(6080):
  code_point = index[i]
  # The region must be monotonically non-decreasing for the
  # run compression to be valid.
  if previous_code_point > code_point:
    raise Error()
  if code_point - previous_code_point != 1:
    pointers.append(i)
    offsets.append(code_point)
  previous_code_point = code_point
static_u16_table("GBK_TOP_IDEOGRAPH_POINTERS", pointers)
static_u16_table("GBK_TOP_IDEOGRAPH_OFFSETS", offsets)
# Unicode 1.1 ideographs to the left of the old GB2312 block
# Compressed form takes 40% of uncompressed form
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(0x7D - 0x29):
  for column in xrange(190 - 94):
    i = 7790 + column + (row * 190)
    if i > 23650:
      # Exclude compatibility ideographs at the end
      break
    code_point = index[i]
    if previous_code_point > code_point:
      raise Error()
    if code_point - previous_code_point != 1:
      pointers.append(column + (row * (190 - 94)))
      offsets.append(code_point)
    previous_code_point = code_point
static_u16_table("GBK_LEFT_IDEOGRAPH_POINTERS", pointers)
static_u16_table("GBK_LEFT_IDEOGRAPH_OFFSETS", offsets)
# GBK other (excl. Ext A, Compat & PUA at the bottom)
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(0x29 - 0x20):
  for column in xrange(190 - 94):
    i = 6080 + column + (row * 190)
    code_point = index[i]
    if code_point - previous_code_point != 1:
      pointers.append(column + (row * (190 - 94)))
      offsets.append(code_point)
    previous_code_point = code_point
# Terminator pointer so a lookup can always compute a run length.
pointers.append((190 - 94) * (0x29 - 0x20))
static_u16_table("GBK_OTHER_POINTERS", pointers)
static_u16_table("GBK_OTHER_UNSORTED_OFFSETS", offsets)
# GBK bottom: Compatibility ideographs, Ext A and PUA
bottom_index = []
# 5 compat following Unified Ideographs
for i in range(23651, 23656):
  bottom_index.append(index[i])
# Last row
for i in range(23750, 23846):
  bottom_index.append(index[i])
static_u16_table("GBK_BOTTOM", bottom_index)
# GB2312 Hanzi
# (and the 5 PUA code points in between Level 1 and Level 2)
hanzi_index = []
for row in xrange(0x77 - 0x2F):
  for column in xrange(94):
    hanzi_index.append(index[9026 + column + (row * 190)])
static_u16_table("GB2312_HANZI", hanzi_index)
# GB2312 symbols
symbol_index = []
for i in xrange(94):
  symbol_index.append(index[6176 + i])
static_u16_table("GB2312_SYMBOLS", symbol_index)
# GB2312 symbols on Greek row (incl. PUA)
symbol_index = []
for i in xrange(22):
  symbol_index.append(index[7189 + i])
static_u16_table("GB2312_SYMBOLS_AFTER_GREEK", symbol_index)
# GB2312 Pinyin
pinyin_index = []
for i in xrange(32):
  pinyin_index.append(index[7506 + i])
static_u16_table("GB2312_PINYIN", pinyin_index)
# GB2312 other (excl. bottom PUA)
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(14):
  for column in xrange(94):
    i = 6366 + column + (row * 190)
    code_point = index[i]
    # Exclude the two ranges that were processed as
    # lookup tables above by filling them with
    # ASCII. Upon encode, ASCII code points will
    # never appear as the search key.
    if (i >= 7189 and i < 7189 + 22):
      code_point = i - 7189
    elif (i >= 7506 and i < 7506 + 32):
      code_point = i - 7506
    if code_point - previous_code_point != 1:
      pointers.append(column + (row * 94))
      offsets.append(code_point)
    previous_code_point = code_point
# Terminator pointer for run-length computation.
pointers.append(14 * 94)
static_u16_table("GB2312_OTHER_POINTERS", pointers)
static_u16_table("GB2312_OTHER_UNSORTED_OFFSETS", offsets)
# Non-gbk code points
pointers = []
offsets = []
for pair in indexes["gb18030-ranges"]:
  if pair[1] == 0x10000:
    break # the last entry doesn't fit in u16
  pointers.append(pair[0])
  offsets.append(pair[1])
static_u16_table("GB18030_RANGE_POINTERS", pointers)
static_u16_table("GB18030_RANGE_OFFSETS", offsets)
# Encoder table for Level 1 Hanzi
# The units here really fit into 12 bits, but since we're
# looking for speed here, let's use 16 bits per unit.
# Once we use 16 bits per unit, we might as well precompute
# the output bytes.
level1_hanzi_index = hanzi_index[:(94 * (0xD8 - 0xB0) - 5)]
level1_hanzi_pairs = []
for i in xrange(len(level1_hanzi_index)):
  # NOTE: relies on Python 2 integer division (`/` on ints truncates).
  hanzi_lead = (i / 94) + 0xB0
  hanzi_trail = (i % 94) + 0xA1
  level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail)))
# Sort by code point so encode can binary-search the code-point column.
level1_hanzi_pairs.sort(key=lambda x: x[0])
static_u16_table_from_indexable("GB2312_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "gb-hanzi-encode")
static_u8_pair_table_from_indexable("GB2312_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "gb-hanzi-encode")
# Fast Hanzi encoder table
hanzi_bytes = [None] * (0x9FA7 - 0x4E00)
for row in xrange(126):
  for column in xrange(190):
    pointer = column + (row * 190)
    code_point = index[pointer]
    if code_point and code_point >= 0x4E00 and code_point <= 0x9FA6:
      hanzi_lead = 0x81 + row
      hanzi_trail = column + (0x40 if column < 0x3F else 0x41)
      hanzi_bytes[code_point - 0x4E00] = (hanzi_lead, hanzi_trail)
static_u8_pair_table("GBK_HANZI_BYTES", hanzi_bytes, "fast-gb-hanzi-encode")
data_file.write(data_rs_end)
data_file.close()
# Variant
# Generate src/variant.rs: the enums that dispatch over the concrete
# decoder/encoder implementations.
variant_file = open("src/variant.rs", "w")
variant_file.write('''// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py
//! This module provides enums that wrap the various decoders and encoders.
//! The purpose is to make `Decoder` and `Encoder` `Sized` by writing the
//! dispatch explicitly for a finite set of specialized decoders and encoders.
//! Unfortunately, this means the compiler doesn't generate the dispatch code
//! and it has to be written here instead.
//!
//! The purpose of making `Decoder` and `Encoder` `Sized` is to allow stack
//! allocation in Rust code, including the convenience methods on `Encoding`.
''')
# Build the variant name lists. UTF-16LE/BE share one UTF-16 implementation;
# GBK shares the gb18030 decoder; replacement/UTF-16 have no encoder.
encoding_variants = [u"single-byte",]
for encoding in multi_byte:
  if encoding["name"] in [u"UTF-16LE", u"UTF-16BE"]:
    continue
  else:
    encoding_variants.append(encoding["name"])
encoding_variants.append(u"UTF-16")
decoder_variants = []
for variant in encoding_variants:
  if variant == u"GBK":
    continue
  decoder_variants.append(variant)
encoder_variants = []
for variant in encoding_variants:
  if variant in [u"replacement", u"GBK", u"UTF-16"]:
    continue
  encoder_variants.append(variant)
for variant in decoder_variants:
  variant_file.write("use %s::*;\n" % to_snake_name(variant))
variant_file.write('''use super::*;
pub enum VariantDecoder {
''')
for variant in decoder_variants:
  variant_file.write("   %s(%sDecoder),\n" % (to_camel_name(variant), to_camel_name(variant)))
variant_file.write('''}
impl VariantDecoder {
''')
def write_variant_method(name, mut, arg_list, ret, variants, excludes, kind):
  """Emit one dispatching method on VariantDecoder/VariantEncoder.

  name: method name to generate and to forward to on the wrapped value.
  mut: whether the method takes `&mut self` (and binds `ref mut`).
  arg_list: list of (arg_name, arg_type) pairs.
  ret: return type string, or a falsy value for no return type.
  variants: variant names to generate match arms for.
  excludes: variants whose arm becomes a no-op `()` instead of forwarding.
  kind: "Decoder" or "Encoder"; selects the enum name `Variant%s`.
  """
  variant_file.write('''pub fn %s(&''' % name)
  if mut:
    variant_file.write('''mut ''')
  variant_file.write('''self''')
  for arg in arg_list:
    variant_file.write(''', %s: %s''' % (arg[0], arg[1]))
  variant_file.write(''')''')
  if ret:
    variant_file.write(''' -> %s''' % ret)
  variant_file.write(''' {\nmatch *self {\n''')
  for variant in variants:
    variant_file.write('''Variant%s::%s(ref ''' % (kind, to_camel_name(variant)))
    if mut:
      variant_file.write('''mut ''')
    if variant in excludes:
      # NOTE(review): no trailing "\n" is written for excluded arms —
      # harmless while every call site passes excludes=[]; verify if
      # excludes is ever used.
      variant_file.write('''v) => (),''')
      continue
    variant_file.write('''v) => v.%s(''' % name)
    first = True
    for arg in arg_list:
      if not first:
        variant_file.write(''', ''')
      first = False
      variant_file.write(arg[0])
    variant_file.write('''),\n''')
  variant_file.write('''}\n}\n\n''')
- write_variant_method("max_utf16_buffer_length", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder")
- write_variant_method("max_utf8_buffer_length_without_replacement", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder")
- write_variant_method("max_utf8_buffer_length", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder")
- write_variant_method("decode_to_utf16_raw", True, [("src", "&[u8]"),
- ("dst", "&mut [u16]"),
- ("last", "bool")], "(DecoderResult, usize, usize)", decoder_variants, [], "Decoder")
- write_variant_method("decode_to_utf8_raw", True, [("src", "&[u8]"),
- ("dst", "&mut [u8]"),
- ("last", "bool")], "(DecoderResult, usize, usize)", decoder_variants, [], "Decoder")
- variant_file.write('''
- pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> Option<usize> {
- match *self {
- VariantDecoder::SingleByte(ref v) => {
- return Some(v.latin1_byte_compatible_up_to(buffer));
- }
- VariantDecoder::Utf8(ref v) => {
- if !v.in_neutral_state() {
- return None;
- }
- }
- VariantDecoder::Gb18030(ref v) => {
- if !v.in_neutral_state() {
- return None;
- }
- }
- VariantDecoder::Big5(ref v) => {
- if !v.in_neutral_state() {
- return None;
- }
- }
- VariantDecoder::EucJp(ref v) => {
- if !v.in_neutral_state() {
- return None;
- }
- }
- VariantDecoder::Iso2022Jp(ref v) => {
- if v.in_neutral_state() {
- return Some(Encoding::iso_2022_jp_ascii_valid_up_to(buffer));
- }
- return None;
- }
- VariantDecoder::ShiftJis(ref v) => {
- if !v.in_neutral_state() {
- return None;
- }
- }
- VariantDecoder::EucKr(ref v) => {
- if !v.in_neutral_state() {
- return None;
- }
- }
- VariantDecoder::UserDefined(_) => {}
- VariantDecoder::Replacement(_) | VariantDecoder::Utf16(_) => {
- return None;
- }
- };
- Some(Encoding::ascii_valid_up_to(buffer))
- }
- }
- pub enum VariantEncoder {
- ''')
# Emit the VariantEncoder variants, its hand-written has_pending_state,
# and the generated dispatching methods.
for variant in encoder_variants:
  variant_file.write(" %s(%sEncoder),\n" % (to_camel_name(variant), to_camel_name(variant)))
variant_file.write('''}
impl VariantEncoder {
pub fn has_pending_state(&self) -> bool {
match *self {
VariantEncoder::Iso2022Jp(ref v) => {
v.has_pending_state()
}
_ => false,
}
}
''')
write_variant_method("max_buffer_length_from_utf16_without_replacement", False, [("u16_length", "usize")], "Option<usize>", encoder_variants, [], "Encoder")
write_variant_method("max_buffer_length_from_utf8_without_replacement", False, [("byte_length", "usize")], "Option<usize>", encoder_variants, [], "Encoder")
write_variant_method("encode_from_utf16_raw", True, [("src", "&[u16]"),
                                                     ("dst", "&mut [u8]"),
                                                     ("last", "bool")], "(EncoderResult, usize, usize)", encoder_variants, [], "Encoder")
write_variant_method("encode_from_utf8_raw", True, [("src", "&str"),
                                                    ("dst", "&mut [u8]"),
                                                    ("last", "bool")], "(EncoderResult, usize, usize)", encoder_variants, [], "Encoder")
# Emit VariantEncoding: the per-encoding constructors for decoders/encoders.
# The string payloads below are written verbatim into variant.rs.
variant_file.write('''}
pub enum VariantEncoding {
SingleByte(&'static [u16; 128], u16, u8, u8),''')
for encoding in multi_byte:
  variant_file.write("%s,\n" % to_camel_name(encoding["name"]))
variant_file.write('''}
impl VariantEncoding {
pub fn new_variant_decoder(&self) -> VariantDecoder {
match *self {
VariantEncoding::SingleByte(table, _, _, _) => SingleByteDecoder::new(table),
VariantEncoding::Utf8 => Utf8Decoder::new(),
VariantEncoding::Gbk | VariantEncoding::Gb18030 => Gb18030Decoder::new(),
VariantEncoding::Big5 => Big5Decoder::new(),
VariantEncoding::EucJp => EucJpDecoder::new(),
VariantEncoding::Iso2022Jp => Iso2022JpDecoder::new(),
VariantEncoding::ShiftJis => ShiftJisDecoder::new(),
VariantEncoding::EucKr => EucKrDecoder::new(),
VariantEncoding::Replacement => ReplacementDecoder::new(),
VariantEncoding::UserDefined => UserDefinedDecoder::new(),
VariantEncoding::Utf16Be => Utf16Decoder::new(true),
VariantEncoding::Utf16Le => Utf16Decoder::new(false),
}
}
pub fn new_encoder(&self, encoding: &'static Encoding) -> Encoder {
match *self {
VariantEncoding::SingleByte(table, run_bmp_offset, run_byte_offset, run_length) => SingleByteEncoder::new(encoding, table, run_bmp_offset, run_byte_offset, run_length),
VariantEncoding::Utf8 => Utf8Encoder::new(encoding),
VariantEncoding::Gbk => Gb18030Encoder::new(encoding, false),
VariantEncoding::Gb18030 => Gb18030Encoder::new(encoding, true),
VariantEncoding::Big5 => Big5Encoder::new(encoding),
VariantEncoding::EucJp => EucJpEncoder::new(encoding),
VariantEncoding::Iso2022Jp => Iso2022JpEncoder::new(encoding),
VariantEncoding::ShiftJis => ShiftJisEncoder::new(encoding),
VariantEncoding::EucKr => EucKrEncoder::new(encoding),
VariantEncoding::UserDefined => UserDefinedEncoder::new(encoding),
VariantEncoding::Utf16Be | VariantEncoding::Replacement |
VariantEncoding::Utf16Le => unreachable!(),
}
}
pub fn is_single_byte(&self) -> bool {
match *self {
VariantEncoding::SingleByte(_, _, _, _) | VariantEncoding::UserDefined => true,
_ => false,
}
}
}
''')
variant_file.close()
# Regenerate the generated section of the encoding_c FFI crate:
# one exported static per preferred encoding name.
(ffi_rs_begin, ffi_rs_end) = read_non_generated("../encoding_c/src/lib.rs")
ffi_file = open("../encoding_c/src/lib.rs", "w")
ffi_file.write(ffi_rs_begin)
ffi_file.write("""
// Instead, please regenerate using generate-encoding-data.py
/// The minimum length of buffers that may be passed to `encoding_name()`.
pub const ENCODING_NAME_MAX_LENGTH: usize = %d; // %s
""" % (longest_name_length, longest_name))
for name in preferred:
  ffi_file.write('''/// The %s encoding.
#[no_mangle]
pub static %s_ENCODING: ConstEncoding = ConstEncoding(&%s_INIT);
''' % (to_dom_name(name), to_constant_name(name), to_constant_name(name)))
ffi_file.write(ffi_rs_end)
ffi_file.close()
# Regenerate the generated tests in src/single_byte.rs. After the second
# single-byte encoding, an early return is emitted so Miri runs stay fast.
(single_byte_rs_begin, single_byte_rs_end) = read_non_generated("src/single_byte.rs")
single_byte_file = open("src/single_byte.rs", "w")
single_byte_file.write(single_byte_rs_begin)
single_byte_file.write("""
// Instead, please regenerate using generate-encoding-data.py
#[test]
fn test_single_byte_decode() {""")
idx = 0 # for Miri, return after 2nd test
for name in preferred:
  if name == u"ISO-8859-8-I":
    continue;
  if is_single_byte(name):
    single_byte_file.write("""
decode_single_byte(%s, &data::SINGLE_BYTE_DATA.%s);""" % (to_constant_name(name), to_snake_name(name)))
    idx += 1
    if idx == 2:
      single_byte_file.write("""
if cfg!(miri) {
// Miri is too slow
return;
}""")
single_byte_file.write("""
}
#[test]
fn test_single_byte_encode() {""")
idx = 0 # for Miri, return after 2nd test
for name in preferred:
  if name == u"ISO-8859-8-I":
    continue;
  if is_single_byte(name):
    single_byte_file.write("""
encode_single_byte(%s, &data::SINGLE_BYTE_DATA.%s);""" % (to_constant_name(name), to_snake_name(name)))
    idx += 1
    if idx == 2:
      single_byte_file.write("""
if cfg!(miri) {
// Miri is too slow
return;
}""")
single_byte_file.write("""
}
""")
single_byte_file.write(single_byte_rs_end)
single_byte_file.close()
# Generate the C header of statics for encoding_c. The payload below is
# written verbatim; only the per-encoding extern declarations are generated.
static_file = open("../encoding_c/include/encoding_rs_statics.h", "w")
static_file.write("""// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py
// This file is not meant to be included directly. Instead, encoding_rs.h
// includes this file.
#ifndef encoding_rs_statics_h_
#define encoding_rs_statics_h_
#ifndef ENCODING_RS_ENCODING
#define ENCODING_RS_ENCODING Encoding
#ifndef __cplusplus
typedef struct Encoding_ Encoding;
#endif
#endif
#ifndef ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR
#define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ENCODING_RS_ENCODING*
#endif
#ifndef ENCODING_RS_ENCODER
#define ENCODING_RS_ENCODER Encoder
#ifndef __cplusplus
typedef struct Encoder_ Encoder;
#endif
#endif
#ifndef ENCODING_RS_DECODER
#define ENCODING_RS_DECODER Decoder
#ifndef __cplusplus
typedef struct Decoder_ Decoder;
#endif
#endif
#define INPUT_EMPTY 0
#define OUTPUT_FULL 0xFFFFFFFF
// %s
#define ENCODING_NAME_MAX_LENGTH %d
""" % (longest_name, longest_name_length))
for name in preferred:
  static_file.write('''/// The %s encoding.
extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const %s_ENCODING;
''' % (to_dom_name(name), to_constant_name(name)))
static_file.write("""#endif // encoding_rs_statics_h_
""")
static_file.close()
# Regenerate the UTF-8 validation bit table in src/utf_8.rs.
# First 256 entries: per-byte "invalid as trail for class X" masks;
# next 128 entries: per-lead-byte lane selector.
(utf_8_rs_begin, utf_8_rs_end) = read_non_generated("src/utf_8.rs")
utf_8_file = open("src/utf_8.rs", "w")
utf_8_file.write(utf_8_rs_begin)
utf_8_file.write("""
// Instead, please regenerate using generate-encoding-data.py
pub static UTF8_DATA: Utf8Data = Utf8Data {
table: [
""")
for i in range(256):
  combined = (1 << 2) # invalid lead
  if i < 0x80 or i > 0xBF:
    combined |= (1 << 3) # normal trail
  if i < 0xA0 or i > 0xBF:
    combined |= (1 << 4) # three-byte special lower bound
  if i < 0x80 or i > 0x9F:
    combined |= (1 << 5) # three-byte special upper bound
  if i < 0x90 or i > 0xBF:
    combined |= (1 << 6) # four-byte special lower bound
  if i < 0x80 or i > 0x8F:
    combined |= (1 << 7) # four-byte special upper bound
  utf_8_file.write("%d," % combined)
for i in range(128, 256):
  lane = (1 << 2) # invalid lead
  if i >= 0xC2 and i <= 0xDF:
    lane = (1 << 3) # normal trail
  elif i == 0xE0:
    lane = (1 << 4) # three-byte special lower bound
  elif i >= 0xE1 and i <= 0xEC:
    lane = (1 << 3) # normal trail
  elif i == 0xED:
    lane = (1 << 5) # three-byte special upper bound
  elif i >= 0xEE and i <= 0xEF:
    lane = (1 << 3) # normal trail
  elif i == 0xF0:
    lane = (1 << 6) # four-byte special lower bound
  elif i >= 0xF1 and i <= 0xF3:
    lane = (1 << 3) # normal trail
  elif i == 0xF4:
    lane = (1 << 7) # four-byte special upper bound
  utf_8_file.write("%d," % lane)
utf_8_file.write("""
],
};
""")
utf_8_file.write(utf_8_rs_end)
utf_8_file.close()
# Unit tests
TEST_HEADER = '''Any copyright to the test code below this comment is dedicated to the
Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
This is a generated file. Please do not edit.
Instead, please regenerate using generate-encoding-data.py
'''
index = indexes["jis0208"]
# EUC-JP decode input: every two-byte JIS X 0208 sequence.
jis0208_in_file = open("src/test_data/jis0208_in.txt", "w")
jis0208_in_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
  (lead, trail) = divmod(pointer, 94)
  lead += 0xA1
  trail += 0xA1
  jis0208_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
jis0208_in_file.close()
# Expected decode output; unmapped pointers decode to U+FFFD.
jis0208_in_ref_file = open("src/test_data/jis0208_in_ref.txt", "w")
jis0208_in_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
  code_point = index[pointer]
  if code_point:
    jis0208_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
  else:
    jis0208_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
jis0208_in_ref_file.close()
# Encode test: for duplicated code points the encoder emits the first
# pointer in the index, so remap the known duplicate ranges.
jis0208_out_file = open("src/test_data/jis0208_out.txt", "w")
jis0208_out_ref_file = open("src/test_data/jis0208_out_ref.txt", "w")
jis0208_out_file.write(TEST_HEADER)
jis0208_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
  code_point = index[pointer]
  if code_point:
    revised_pointer = pointer
    if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220):
      revised_pointer = index.index(code_point)
    (lead, trail) = divmod(revised_pointer, 94)
    lead += 0xA1
    trail += 0xA1
    jis0208_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
    jis0208_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
jis0208_out_file.close()
jis0208_out_ref_file.close()
# Shift_JIS decode input: every two-byte sequence reachable by the
# Shift_JIS pointer formula.
shift_jis_in_file = open("src/test_data/shift_jis_in.txt", "w")
shift_jis_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  (lead, trail) = divmod(pointer, 188)
  lead += 0x81 if lead < 0x1F else 0xC1
  trail += 0x40 if trail < 0x3F else 0x41
  shift_jis_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
shift_jis_in_file.close()
shift_jis_in_ref_file = open("src/test_data/shift_jis_in_ref.txt", "w")
shift_jis_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  # Pointers 8836..10715 decode to the EUDC (PUA) range per the spec.
  code_point = 0xE000 - 8836 + pointer if pointer >= 8836 and pointer <= 10715 else index[pointer]
  if code_point:
    shift_jis_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
  else:
    # On error, an ASCII-range trail byte is unconsumed and decoded
    # after the replacement character.
    trail = pointer % 188
    trail += 0x40 if trail < 0x3F else 0x41
    if trail < 0x80:
      shift_jis_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
    else:
      shift_jis_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
shift_jis_in_ref_file.close()
# Shift_JIS encode test; duplicates remapped to the pointer the
# encoder actually produces.
shift_jis_out_file = open("src/test_data/shift_jis_out.txt", "w")
shift_jis_out_ref_file = open("src/test_data/shift_jis_out_ref.txt", "w")
shift_jis_out_file.write(TEST_HEADER)
shift_jis_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 8272):
  code_point = index[pointer]
  if code_point:
    revised_pointer = pointer
    if revised_pointer >= 1207 and revised_pointer < 1220:
      revised_pointer = index.index(code_point)
    (lead, trail) = divmod(revised_pointer, 188)
    lead += 0x81 if lead < 0x1F else 0xC1
    trail += 0x40 if trail < 0x3F else 0x41
    shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
    shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
for pointer in range(8836, len(index)):
  code_point = index[pointer]
  if code_point:
    revised_pointer = index.index(code_point)
    if revised_pointer >= 8272 and revised_pointer < 8836:
      revised_pointer = pointer
    (lead, trail) = divmod(revised_pointer, 188)
    lead += 0x81 if lead < 0x1F else 0xC1
    trail += 0x40 if trail < 0x3F else 0x41
    shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
    shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
shift_jis_out_file.close()
shift_jis_out_ref_file.close()
# ISO-2022-JP decode input: each JIS X 0208 pointer wrapped in the
# two-byte-mode escape sequences.
iso_2022_jp_in_file = open("src/test_data/iso_2022_jp_in.txt", "w")
iso_2022_jp_in_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
  (lead, trail) = divmod(pointer, 94)
  lead += 0x21
  trail += 0x21
  iso_2022_jp_in_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
iso_2022_jp_in_file.close()
iso_2022_jp_in_ref_file = open("src/test_data/iso_2022_jp_in_ref.txt", "w")
iso_2022_jp_in_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
  code_point = index[pointer]
  if code_point:
    iso_2022_jp_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
  else:
    iso_2022_jp_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
iso_2022_jp_in_ref_file.close()
# ISO-2022-JP encode test, including the half-width katakana that the
# encoder normalizes to their full-width equivalents.
iso_2022_jp_out_file = open("src/test_data/iso_2022_jp_out.txt", "w")
iso_2022_jp_out_ref_file = open("src/test_data/iso_2022_jp_out_ref.txt", "w")
iso_2022_jp_out_file.write(TEST_HEADER)
iso_2022_jp_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
  code_point = index[pointer]
  if code_point:
    revised_pointer = pointer
    if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220):
      revised_pointer = index.index(code_point)
    (lead, trail) = divmod(revised_pointer, 94)
    lead += 0x21
    trail += 0x21
    iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
    iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
for i in xrange(len(half_width_index)):
  code_point = i + 0xFF61
  normalized_code_point = half_width_index[i]
  pointer = index.index(normalized_code_point)
  (lead, trail) = divmod(pointer, 94)
  lead += 0x21
  trail += 0x21
  iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
  iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
iso_2022_jp_out_file.close()
iso_2022_jp_out_ref_file.close()
index = indexes["euc-kr"]
# EUC-KR decode input: every two-byte sequence in the index range.
euc_kr_in_file = open("src/test_data/euc_kr_in.txt", "w")
euc_kr_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  (lead, trail) = divmod(pointer, 190)
  lead += 0x81
  trail += 0x41
  euc_kr_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
euc_kr_in_file.close()
euc_kr_in_ref_file = open("src/test_data/euc_kr_in_ref.txt", "w")
euc_kr_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  code_point = index[pointer]
  if code_point:
    euc_kr_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
  else:
    # ASCII-range trail bytes are unconsumed on error.
    trail = pointer % 190
    trail += 0x41
    if trail < 0x80:
      euc_kr_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
    else:
      euc_kr_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
euc_kr_in_ref_file.close()
# EUC-KR encode test: the index has no duplicates to remap.
euc_kr_out_file = open("src/test_data/euc_kr_out.txt", "w")
euc_kr_out_ref_file = open("src/test_data/euc_kr_out_ref.txt", "w")
euc_kr_out_file.write(TEST_HEADER)
euc_kr_out_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  code_point = index[pointer]
  if code_point:
    (lead, trail) = divmod(pointer, 190)
    lead += 0x81
    trail += 0x41
    euc_kr_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
    euc_kr_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
euc_kr_out_file.close()
euc_kr_out_ref_file.close()
index = indexes["gb18030"]
# gb18030 decode input: every two-byte sequence in the index range.
gb18030_in_file = open("src/test_data/gb18030_in.txt", "w")
gb18030_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  (lead, trail) = divmod(pointer, 190)
  lead += 0x81
  trail += 0x40 if trail < 0x3F else 0x41
  gb18030_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
gb18030_in_file.close()
gb18030_in_ref_file = open("src/test_data/gb18030_in_ref.txt", "w")
gb18030_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  code_point = index[pointer]
  if code_point:
    gb18030_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
  else:
    # ASCII-range trail bytes are unconsumed on error.
    trail = pointer % 190
    trail += 0x40 if trail < 0x3F else 0x41
    if trail < 0x80:
      gb18030_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
    else:
      gb18030_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
gb18030_in_ref_file.close()
# gb18030 encode test. Pointer 6555 is skipped: its code point is not
# encoded via this pointer (NOTE(review): presumably a duplicate mapping
# the encoder resolves elsewhere — verify against the spec).
gb18030_out_file = open("src/test_data/gb18030_out.txt", "w")
gb18030_out_ref_file = open("src/test_data/gb18030_out_ref.txt", "w")
gb18030_out_file.write(TEST_HEADER)
gb18030_out_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  if pointer == 6555:
    continue
  code_point = index[pointer]
  if code_point:
    (lead, trail) = divmod(pointer, 190)
    lead += 0x81
    trail += 0x40 if trail < 0x3F else 0x41
    gb18030_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
    gb18030_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
gb18030_out_file.close()
gb18030_out_ref_file.close()
index = indexes["big5"]
# Big5 decode input: every two-byte sequence in the index range.
big5_in_file = open("src/test_data/big5_in.txt", "w")
big5_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  (lead, trail) = divmod(pointer, 157)
  lead += 0x81
  trail += 0x40 if trail < 0x3F else 0x62
  big5_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
big5_in_file.close()
# Pointers that decode to a two-code-point sequence per the spec.
big5_two_characters = {
  1133: u"\u00CA\u0304",
  1135: u"\u00CA\u030C",
  1164: u"\u00EA\u0304",
  1166: u"\u00EA\u030C",
}
big5_in_ref_file = open("src/test_data/big5_in_ref.txt", "w")
big5_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  if pointer in big5_two_characters.keys():
    big5_in_ref_file.write((u"%s\n" % big5_two_characters[pointer]).encode("utf-8"))
    continue
  code_point = index[pointer]
  if code_point:
    big5_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
  else:
    # ASCII-range trail bytes are unconsumed on error.
    trail = pointer % 157
    trail += 0x40 if trail < 0x3F else 0x62
    if trail < 0x80:
      big5_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
    else:
      big5_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
big5_in_ref_file.close()
# Code points for which the Big5 encoder picks the LAST pointer in the
# index instead of the first.
prefer_last = [
  0x2550,
  0x255E,
  0x2561,
  0x256A,
  0x5341,
  0x5345,
]
pointer_for_prefer_last = []
for code_point in prefer_last:
  # Python lists don't have .rindex() :-(
  for i in xrange(len(index) - 1, -1, -1):
    candidate = index[i]
    if candidate == code_point:
      pointer_for_prefer_last.append(i)
      break
# Big5 encode test: only Big5-level pointers (lead >= 0xA1) are encodable,
# and duplicates are skipped unless they are the preferred pointer.
big5_out_file = open("src/test_data/big5_out.txt", "w")
big5_out_ref_file = open("src/test_data/big5_out_ref.txt", "w")
big5_out_file.write(TEST_HEADER)
big5_out_ref_file.write(TEST_HEADER)
for pointer in range(((0xA1 - 0x81) * 157), len(index)):
  code_point = index[pointer]
  if code_point:
    if code_point in prefer_last:
      if pointer != pointer_for_prefer_last[prefer_last.index(code_point)]:
        continue
    else:
      if pointer != index.index(code_point):
        continue
    (lead, trail) = divmod(pointer, 157)
    lead += 0x81
    trail += 0x40 if trail < 0x3F else 0x62
    big5_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
    big5_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
big5_out_file.close()
big5_out_ref_file.close()
- index = indexes["jis0212"]
- jis0212_in_file = open("src/test_data/jis0212_in.txt", "w")
- jis0212_in_file.write(TEST_HEADER)
- for pointer in range(0, len(index)):
- (lead, trail) = divmod(pointer, 94)
- lead += 0xA1
- trail += 0xA1
- jis0212_in_file.write("\x8F%s%s\n" % (chr(lead), chr(trail)))
- jis0212_in_file.close()
- jis0212_in_ref_file = open("src/test_data/jis0212_in_ref.txt", "w")
- jis0212_in_ref_file.write(TEST_HEADER)
- for pointer in range(0, len(index)):
- code_point = index[pointer]
- if code_point:
- jis0212_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
- else:
- jis0212_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
- jis0212_in_ref_file.close()
- (codepage_begin, codepage_end) = read_non_generated("../codepage/src/lib.rs")
- codepage_file = open("../codepage/src/lib.rs", "w")
- codepage_file.write(codepage_begin)
- codepage_file.write("""
- // Instead, please regenerate using generate-encoding-data.py
- /// Supported code page numbers in estimated order of usage frequency
- static CODE_PAGES: [u16; %d] = [
- """ % len(code_pages))
- for code_page in code_pages:
- codepage_file.write(" %d,\n" % code_page)
- codepage_file.write("""];
- /// Encodings corresponding to the code page numbers in the same order
- static ENCODINGS: [&'static Encoding; %d] = [
- """ % len(code_pages))
- for code_page in code_pages:
- name = encodings_by_code_page[code_page]
- codepage_file.write(" &%s_INIT,\n" % to_constant_name(name))
- codepage_file.write("""];
- """)
- codepage_file.write(codepage_end)
- codepage_file.close()
- (codepage_test_begin, codepage_test_end) = read_non_generated("../codepage/src/tests.rs")
- codepage_test_file = open("../codepage/src/tests.rs", "w")
- codepage_test_file.write(codepage_test_begin)
- codepage_test_file.write("""
- // Instead, please regenerate using generate-encoding-data.py
- #[test]
- fn test_to_encoding() {
- assert_eq!(to_encoding(0), None);
- """)
- for code_page in code_pages:
- codepage_test_file.write(" assert_eq!(to_encoding(%d), Some(%s));\n" % (code_page, to_constant_name(encodings_by_code_page[code_page])))
- codepage_test_file.write("""}
- #[test]
- fn test_from_encoding() {
- """)
- for name in preferred:
- if code_pages_by_encoding.has_key(name):
- codepage_test_file.write(" assert_eq!(from_encoding(%s), Some(%d));\n" % (to_constant_name(name), code_pages_by_encoding[name]))
- else:
- codepage_test_file.write(" assert_eq!(from_encoding(%s), None);\n" % to_constant_name(name))
- codepage_test_file.write("""}
- """)
- codepage_test_file.write(codepage_test_end)
- codepage_test_file.close()
- subprocess.call(["cargo", "fmt"])
|