unicode_gen_breaktests.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8
  3. #
  4. # Copyright 2015 The Rust Project Developers. See the COPYRIGHT
  5. # file at the top-level directory of this distribution and at
  6. # http://rust-lang.org/COPYRIGHT.
  7. #
  8. # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
  9. # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
  10. # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
  11. # option. This file may not be copied, modified, or distributed
  12. # except according to those terms.
# This script uses the following Unicode tables:
# - auxiliary/GraphemeBreakTest.txt
# - auxiliary/WordBreakTest.txt
# - auxiliary/SentenceBreakTest.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the generated testdata.rs file into git.
  19. from __future__ import print_function
  20. import unicode, re, os, fileinput
  21. def load_test_data(f, optsplit=[]):
  22. testRe1 = re.compile(r"^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")
  23. unicode.fetch(f)
  24. data = []
  25. for line in fileinput.input(os.path.basename(f)):
  26. # lines that include a test start with the ÷ character
  27. if len(line) < 2 or not line.startswith('÷'):
  28. continue
  29. m = testRe1.match(line)
  30. if not m:
  31. print("error: no match on line where test was expected: %s" % line)
  32. continue
  33. # process the characters in this test case
  34. chars = process_split_string(m.group(1))
  35. # skip test case if it contains invalid characters (viz., surrogates)
  36. if not chars:
  37. continue
  38. # now process test cases
  39. (chars, info) = process_split_info(m.group(2), chars, optsplit)
  40. # make sure that we have break info for each break!
  41. assert len(chars) - 1 == len(info)
  42. data.append((chars, info))
  43. return data
  44. def process_split_info(s, c, o):
  45. outcs = []
  46. outis = []
  47. workcs = c.pop(0)
  48. # are we on a × or a ÷?
  49. isX = False
  50. if s.startswith('×'):
  51. isX = True
  52. # find each instance of '(÷|×) [x.y] '
  53. while s:
  54. # find the currently considered rule number
  55. sInd = s.index('[') + 1
  56. eInd = s.index(']')
  57. # if it's '× [a.b]' where 'a.b' is in o, then
  58. # we consider it a split even though it's not
  59. # marked as one
  60. # if it's ÷ then it's always a split
  61. if not isX or s[sInd:eInd] in o:
  62. outis.append(s[sInd:eInd])
  63. outcs.append(workcs)
  64. workcs = c.pop(0)
  65. else:
  66. workcs.extend(c.pop(0))
  67. idx = 1
  68. while idx < len(s):
  69. if s[idx:].startswith('×'):
  70. isX = True
  71. break
  72. if s[idx:].startswith('÷'):
  73. isX = False
  74. break
  75. idx += 1
  76. s = s[idx:]
  77. outcs.append(workcs)
  78. return (outcs, outis)
  79. def process_split_string(s):
  80. outls = []
  81. workls = []
  82. inls = s.split()
  83. for i in inls:
  84. if i == '÷' or i == '×':
  85. outls.append(workls)
  86. workls = []
  87. continue
  88. ival = int(i,16)
  89. if unicode.is_surrogate(ival):
  90. return []
  91. workls.append(ival)
  92. if workls:
  93. outls.append(workls)
  94. return outls
  95. def showfun(x):
  96. outstr = '("'
  97. for c in x[0]:
  98. outstr += "\\u{%x}" % c
  99. outstr += '",&['
  100. xfirst = True
  101. for xx in x[1:]:
  102. if not xfirst:
  103. outstr += '],&['
  104. xfirst = False
  105. sfirst = True
  106. for sp in xx:
  107. if not sfirst:
  108. outstr += ','
  109. sfirst = False
  110. outstr += '"'
  111. for c in sp:
  112. outstr += "\\u{%x}" % c
  113. outstr += '"'
  114. outstr += '])'
  115. return outstr
  116. def create_grapheme_data(f):
  117. # rules 9.1 and 9.2 are for extended graphemes only
  118. optsplits = ['9.1','9.2']
  119. d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)
  120. test_same = []
  121. test_diff = []
  122. for (c, i) in d:
  123. allchars = [cn for s in c for cn in s]
  124. extgraphs = []
  125. extwork = []
  126. extwork.extend(c[0])
  127. for n in range(0,len(i)):
  128. if i[n] in optsplits:
  129. extwork.extend(c[n+1])
  130. else:
  131. extgraphs.append(extwork)
  132. extwork = []
  133. extwork.extend(c[n+1])
  134. # these are the extended grapheme clusters
  135. extgraphs.append(extwork)
  136. if extgraphs == c:
  137. test_same.append((allchars, c))
  138. else:
  139. test_diff.append((allchars, extgraphs, c))
  140. stype = "&'static [(&'static str, &'static [&'static str])]"
  141. dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
  142. f.write(" // official Unicode test data\n")
  143. f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
  144. unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
  145. unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)
  146. def create_words_data(f):
  147. d = load_test_data("auxiliary/WordBreakTest.txt")
  148. test = []
  149. for (c, i) in d:
  150. allchars = [cn for s in c for cn in s]
  151. test.append((allchars, c))
  152. wtype = "&'static [(&'static str, &'static [&'static str])]"
  153. f.write(" // official Unicode test data\n")
  154. f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
  155. unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)
  156. def create_sentence_data(f):
  157. d = load_test_data("auxiliary/SentenceBreakTest.txt")
  158. test = []
  159. for (c, i) in d:
  160. allchars = [cn for s in c for cn in s]
  161. test.append((allchars, c))
  162. wtype = "&'static [(&'static str, &'static [&'static str])]"
  163. f.write(" // official Unicode test data\n")
  164. f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
  165. unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)
  166. if __name__ == "__main__":
  167. with open("testdata.rs", "w") as rf:
  168. rf.write(unicode.preamble)
  169. create_grapheme_data(rf)
  170. create_words_data(rf)
  171. create_sentence_data(rf)