123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212 |
- #!/usr/bin/env python
- # -*- coding: utf-8
- #
- # Copyright 2015 The Rust Project Developers. See the COPYRIGHT
- # file at the top-level directory of this distribution and at
- # http://rust-lang.org/COPYRIGHT.
- #
- # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
- # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
- # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
- # option. This file may not be copied, modified, or distributed
- # except according to those terms.
- # This script uses the following Unicode tables:
- # - auxiliary/GraphemeBreakTest.txt
- # - auxiliary/WordBreakTest.txt
- # - auxiliary/SentenceBreakTest.txt
- #
- # Since this should not require frequent updates, we just store this
- # out-of-line and check the generated testdata.rs file into git.
- from __future__ import print_function
- import unicode, re, os, fileinput
def load_test_data(f, optsplit=None):
    """Fetch a Unicode break-test file and parse the test cases it contains.

    `f` is the path of the test file (e.g. "auxiliary/WordBreakTest.txt").
    It is handed to `unicode.fetch`, and the file is then read from the
    current directory under its base name.
    NOTE(review): presumably `unicode.fetch` downloads the file into the
    current directory -- confirm against the `unicode` module.

    `optsplit` is an optional collection of rule numbers (strings such as
    '9.1') that are treated as breaks even when the test data marks them
    with '×'; it is passed through to `process_split_info`.  It defaults to
    no such rules.

    Returns a list of `(chars, info)` tuples, one per parsed test line, where
    `chars` is a list of code point lists and `info` holds the rule number
    for each break between consecutive entries of `chars`.

    Lines that start with '÷' but do not match the expected layout are
    reported on stdout and skipped; test cases for which
    `process_split_string` returns an empty list (surrogates) are skipped
    silently.
    """
    # A list literal as the default would be one object shared by every
    # call; create a fresh list per call instead.
    if optsplit is None:
        optsplit = []

    # group 1: the code points and break markers before the '#'
    # group 2: the rule annotations between '÷ [0.2]' and the final '÷ [0.3]'
    # The dots in the rule numbers are escaped so they only match a literal '.'.
    testRe1 = re.compile(r"^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0\.2\].*?([÷×].*)\s+÷\s+\[0\.3\]\s*$")

    unicode.fetch(f)
    data = []
    for line in fileinput.input(os.path.basename(f)):
        # lines that include a test start with the ÷ character
        if len(line) < 2 or not line.startswith('÷'):
            continue

        m = testRe1.match(line)
        if not m:
            print("error: no match on line where test was expected: %s" % line)
            continue

        # process the characters in this test case
        chars = process_split_string(m.group(1))
        # skip test case if it contains invalid characters (viz., surrogates)
        if not chars:
            continue

        # now process test cases
        (chars, info) = process_split_info(m.group(2), chars, optsplit)

        # make sure that we have break info for each break!
        assert len(chars) - 1 == len(info), \
            "expected one rule per break, got %d chunks and %d rules" % (len(chars), len(info))

        data.append((chars, info))

    return data
def process_split_info(s, c, o):
    """Regroup the chunks of a test case according to its rule annotations.

    `s` is the annotated part of a test line: a sequence of items of the
    form '÷ [x.y] ...' or '× [x.y] ...', one per boundary between
    consecutive entries of `c`.
    `c` is a list of code point lists, in the order they appear in the test.
    `o` is a collection of rule numbers (strings); a '×' item whose rule
    number is in `o` is treated as a break anyway.

    Returns `(groups, rules)`.  `groups` is `c` with the chunks on either
    side of every non-break item merged together, and `rules` lists the rule
    number of every item that was treated as a break, so
    `len(groups) - 1 == len(rules)`.

    Raises IndexError if `c` is empty or holds fewer chunks than `s` has
    items, and ValueError if an item contains no '[' or ']'.

    Neither `c` nor any of its sub-lists is modified.
    """
    # Copy every chunk up front: merging below extends chunks in place and
    # must not write through to the caller's lists.
    chunks = [list(chunk) for chunk in c]
    groups = []
    rules = []
    current = chunks[0]
    consumed = 1

    # are we on a × (no break) or a ÷ (break)?
    no_break = s.startswith('×')

    # Walk `s` by offset instead of re-slicing it for every character.
    pos = 0
    end = len(s)
    while pos < end:
        # rule number of the item starting at `pos`: the text between the
        # next '[' and the next ']'
        rule = s[s.index('[', pos) + 1:s.index(']', pos)]

        # a ÷ is always a split; a × only when its rule is listed in `o`
        if not no_break or rule in o:
            rules.append(rule)
            groups.append(current)
            current = chunks[consumed]
        else:
            current.extend(chunks[consumed])
        consumed += 1

        # advance to the next × or ÷ marker, or to the end of the string
        pos += 1
        while pos < end:
            if s.startswith('×', pos):
                no_break = True
                break
            if s.startswith('÷', pos):
                no_break = False
                break
            pos += 1

    groups.append(current)
    return (groups, rules)
def process_split_string(s):
    """Parse the code point part of a test line into chunks of code points.

    `s` is a whitespace-separated sequence of hexadecimal code points and
    '÷' / '×' markers.  Every marker closes the chunk collected so far (even
    an empty one) and starts a new one.

    Returns the list of chunks, each a list of ints.  A trailing chunk is
    only included when it is non-empty.  Returns an empty list as soon as a
    code point for which `unicode.is_surrogate` is true is encountered.
    """
    markers = ('÷', '×')
    chunks = []
    current = []

    for token in s.split():
        if token in markers:
            chunks.append(current)
            current = []
        else:
            codepoint = int(token, 16)
            # a single surrogate invalidates the whole test case
            if unicode.is_surrogate(codepoint):
                return []
            current.append(codepoint)

    if current:
        chunks.append(current)

    return chunks
def showfun(x):
    r"""Format one test-table entry as a tuple literal.

    `x[0]` is a list of code points; every further element of `x` is a list
    of code point lists.  The result has the shape

        ("\u{..}\u{..}",&["\u{..}","\u{..}"],&[...])

    i.e. one string literal built from `x[0]`, followed by one `&[...]`
    slice of string literals per remaining element.  When `x` has no further
    elements a single empty `&[]` is still emitted.
    """
    def escaped(codepoints):
        # one \u{hex} escape per code point, concatenated
        return "".join("\\u{%x}" % cp for cp in codepoints)

    def literal_list(strings):
        # comma-separated, double-quoted string literals
        return ",".join('"' + escaped(sp) + '"' for sp in strings)

    head = escaped(x[0])
    slices = "],&[".join(literal_list(group) for group in x[1:])
    return '("' + head + '",&[' + slices + '])'
def create_grapheme_data(f):
    """Write the grapheme break test tables to the open file `f`.

    Loads auxiliary/GraphemeBreakTest.txt with rules 9.1 and 9.2 treated as
    breaks, then rebuilds the extended grapheme clusters by merging the
    chunks on either side of those two rules.  Test cases where both
    groupings agree go into TEST_SAME as `(all code points, clusters)`; the
    others go into TEST_DIFF as
    `(all code points, extended clusters, clusters)`.  Both tables are
    emitted through `unicode.emit_table` with `showfun` as the formatter.
    """
    # rules 9.1 and 9.2 are for extended graphemes only
    optsplits = ['9.1', '9.2']
    cases = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)

    same = []
    differ = []

    for (clusters, rules) in cases:
        flat = [cp for cluster in clusters for cp in cluster]

        ext_clusters = []
        current = list(clusters[0])
        for n, rule in enumerate(rules):
            following = clusters[n + 1]
            # any rule other than 9.1 / 9.2 ends the current extended cluster
            if rule not in optsplits:
                ext_clusters.append(current)
                current = []
            current.extend(following)

        # these are the extended grapheme clusters
        ext_clusters.append(current)

        if ext_clusters == clusters:
            same.append((flat, clusters))
        else:
            differ.append((flat, ext_clusters, clusters))

    same_type = "&'static [(&'static str, &'static [&'static str])]"
    differ_type = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
    f.write(" // official Unicode test data\n")
    f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
    unicode.emit_table(f, "TEST_SAME", same, same_type, True, showfun, True)
    unicode.emit_table(f, "TEST_DIFF", differ, differ_type, True, showfun, True)
def create_words_data(f):
    """Write the word break test table to the open file `f`.

    Loads auxiliary/WordBreakTest.txt and emits one
    `(all code points, chunks)` entry per test case as the TEST_WORD table,
    through `unicode.emit_table` with `showfun` as the formatter.
    """
    cases = load_test_data("auxiliary/WordBreakTest.txt")

    # pair the flattened code points with the chunks; the rule numbers are
    # not needed for this table
    table = [
        ([cp for chunk in chunks for cp in chunk], chunks)
        for (chunks, _rules) in cases
    ]

    table_type = "&'static [(&'static str, &'static [&'static str])]"
    f.write(" // official Unicode test data\n")
    f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
    unicode.emit_table(f, "TEST_WORD", table, table_type, True, showfun, True)
def create_sentence_data(f):
    """Write the sentence break test table to the open file `f`.

    Loads auxiliary/SentenceBreakTest.txt and emits one
    `(all code points, chunks)` entry per test case as the TEST_SENTENCE
    table, through `unicode.emit_table` with `showfun` as the formatter.
    """
    cases = load_test_data("auxiliary/SentenceBreakTest.txt")

    # pair the flattened code points with the chunks; the rule numbers are
    # not needed for this table
    table = [
        ([cp for chunk in chunks for cp in chunk], chunks)
        for (chunks, _rules) in cases
    ]

    table_type = "&'static [(&'static str, &'static [&'static str])]"
    f.write(" // official Unicode test data\n")
    f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
    unicode.emit_table(f, "TEST_SENTENCE", table, table_type, True, showfun, True)
if __name__ == "__main__":
    # Generate testdata.rs: the shared preamble first, then the grapheme,
    # word and sentence tables in that order.
    with open("testdata.rs", "w") as out:
        out.write(unicode.preamble)
        for emit in (create_grapheme_data, create_words_data, create_sentence_data):
            emit(out)
|