transform.h 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300
  1. /* Copyright 2013 Google Inc. All Rights Reserved.
  2. Distributed under MIT license.
  3. See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
  4. */
  5. /* Transformations on dictionary words. */
  6. #ifndef BROTLI_DEC_TRANSFORM_H_
  7. #define BROTLI_DEC_TRANSFORM_H_
  8. #include "../common/types.h"
  9. #include "./port.h"
  10. #if defined(__cplusplus) || defined(c_plusplus)
  11. extern "C" {
  12. #endif
  13. enum WordTransformType {
  14. kIdentity = 0,
  15. kOmitLast1 = 1,
  16. kOmitLast2 = 2,
  17. kOmitLast3 = 3,
  18. kOmitLast4 = 4,
  19. kOmitLast5 = 5,
  20. kOmitLast6 = 6,
  21. kOmitLast7 = 7,
  22. kOmitLast8 = 8,
  23. kOmitLast9 = 9,
  24. kUppercaseFirst = 10,
  25. kUppercaseAll = 11,
  26. kOmitFirst1 = 12,
  27. kOmitFirst2 = 13,
  28. kOmitFirst3 = 14,
  29. kOmitFirst4 = 15,
  30. kOmitFirst5 = 16,
  31. kOmitFirst6 = 17,
  32. kOmitFirst7 = 18,
  33. kOmitFirst8 = 19,
  34. kOmitFirst9 = 20
  35. };
  36. typedef struct {
  37. const uint8_t prefix_id;
  38. const uint8_t transform;
  39. const uint8_t suffix_id;
  40. } Transform;
  41. static const char kPrefixSuffix[208] =
  42. "\0 \0, \0 of the \0 of \0s \0.\0 and \0 in \0\"\0 to \0\">\0\n\0. \0]\0"
  43. " for \0 a \0 that \0\'\0 with \0 from \0 by \0(\0. The \0 on \0 as \0"
  44. " is \0ing \0\n\t\0:\0ed \0=\"\0 at \0ly \0,\0=\'\0.com/\0. This \0"
  45. " not \0er \0al \0ful \0ive \0less \0est \0ize \0\xc2\xa0\0ous ";
  46. enum {
  47. /* EMPTY = ""
  48. SP = " "
  49. DQUOT = "\""
  50. SQUOT = "'"
  51. CLOSEBR = "]"
  52. OPEN = "("
  53. SLASH = "/"
  54. NBSP = non-breaking space "\0xc2\xa0"
  55. */
  56. kPFix_EMPTY = 0,
  57. kPFix_SP = 1,
  58. kPFix_COMMASP = 3,
  59. kPFix_SPofSPtheSP = 6,
  60. kPFix_SPtheSP = 9,
  61. kPFix_eSP = 12,
  62. kPFix_SPofSP = 15,
  63. kPFix_sSP = 20,
  64. kPFix_DOT = 23,
  65. kPFix_SPandSP = 25,
  66. kPFix_SPinSP = 31,
  67. kPFix_DQUOT = 36,
  68. kPFix_SPtoSP = 38,
  69. kPFix_DQUOTGT = 43,
  70. kPFix_NEWLINE = 46,
  71. kPFix_DOTSP = 48,
  72. kPFix_CLOSEBR = 51,
  73. kPFix_SPforSP = 53,
  74. kPFix_SPaSP = 59,
  75. kPFix_SPthatSP = 63,
  76. kPFix_SQUOT = 70,
  77. kPFix_SPwithSP = 72,
  78. kPFix_SPfromSP = 79,
  79. kPFix_SPbySP = 86,
  80. kPFix_OPEN = 91,
  81. kPFix_DOTSPTheSP = 93,
  82. kPFix_SPonSP = 100,
  83. kPFix_SPasSP = 105,
  84. kPFix_SPisSP = 110,
  85. kPFix_ingSP = 115,
  86. kPFix_NEWLINETAB = 120,
  87. kPFix_COLON = 123,
  88. kPFix_edSP = 125,
  89. kPFix_EQDQUOT = 129,
  90. kPFix_SPatSP = 132,
  91. kPFix_lySP = 137,
  92. kPFix_COMMA = 141,
  93. kPFix_EQSQUOT = 143,
  94. kPFix_DOTcomSLASH = 146,
  95. kPFix_DOTSPThisSP = 152,
  96. kPFix_SPnotSP = 160,
  97. kPFix_erSP = 166,
  98. kPFix_alSP = 170,
  99. kPFix_fulSP = 174,
  100. kPFix_iveSP = 179,
  101. kPFix_lessSP = 184,
  102. kPFix_estSP = 190,
  103. kPFix_izeSP = 195,
  104. kPFix_NBSP = 200,
  105. kPFix_ousSP = 203
  106. };
  107. static const Transform kTransforms[] = {
  108. { kPFix_EMPTY, kIdentity, kPFix_EMPTY },
  109. { kPFix_EMPTY, kIdentity, kPFix_SP },
  110. { kPFix_SP, kIdentity, kPFix_SP },
  111. { kPFix_EMPTY, kOmitFirst1, kPFix_EMPTY },
  112. { kPFix_EMPTY, kUppercaseFirst, kPFix_SP },
  113. { kPFix_EMPTY, kIdentity, kPFix_SPtheSP },
  114. { kPFix_SP, kIdentity, kPFix_EMPTY },
  115. { kPFix_sSP, kIdentity, kPFix_SP },
  116. { kPFix_EMPTY, kIdentity, kPFix_SPofSP },
  117. { kPFix_EMPTY, kUppercaseFirst, kPFix_EMPTY },
  118. { kPFix_EMPTY, kIdentity, kPFix_SPandSP },
  119. { kPFix_EMPTY, kOmitFirst2, kPFix_EMPTY },
  120. { kPFix_EMPTY, kOmitLast1, kPFix_EMPTY },
  121. { kPFix_COMMASP, kIdentity, kPFix_SP },
  122. { kPFix_EMPTY, kIdentity, kPFix_COMMASP },
  123. { kPFix_SP, kUppercaseFirst, kPFix_SP },
  124. { kPFix_EMPTY, kIdentity, kPFix_SPinSP },
  125. { kPFix_EMPTY, kIdentity, kPFix_SPtoSP },
  126. { kPFix_eSP, kIdentity, kPFix_SP },
  127. { kPFix_EMPTY, kIdentity, kPFix_DQUOT },
  128. { kPFix_EMPTY, kIdentity, kPFix_DOT },
  129. { kPFix_EMPTY, kIdentity, kPFix_DQUOTGT },
  130. { kPFix_EMPTY, kIdentity, kPFix_NEWLINE },
  131. { kPFix_EMPTY, kOmitLast3, kPFix_EMPTY },
  132. { kPFix_EMPTY, kIdentity, kPFix_CLOSEBR },
  133. { kPFix_EMPTY, kIdentity, kPFix_SPforSP },
  134. { kPFix_EMPTY, kOmitFirst3, kPFix_EMPTY },
  135. { kPFix_EMPTY, kOmitLast2, kPFix_EMPTY },
  136. { kPFix_EMPTY, kIdentity, kPFix_SPaSP },
  137. { kPFix_EMPTY, kIdentity, kPFix_SPthatSP },
  138. { kPFix_SP, kUppercaseFirst, kPFix_EMPTY },
  139. { kPFix_EMPTY, kIdentity, kPFix_DOTSP },
  140. { kPFix_DOT, kIdentity, kPFix_EMPTY },
  141. { kPFix_SP, kIdentity, kPFix_COMMASP },
  142. { kPFix_EMPTY, kOmitFirst4, kPFix_EMPTY },
  143. { kPFix_EMPTY, kIdentity, kPFix_SPwithSP },
  144. { kPFix_EMPTY, kIdentity, kPFix_SQUOT },
  145. { kPFix_EMPTY, kIdentity, kPFix_SPfromSP },
  146. { kPFix_EMPTY, kIdentity, kPFix_SPbySP },
  147. { kPFix_EMPTY, kOmitFirst5, kPFix_EMPTY },
  148. { kPFix_EMPTY, kOmitFirst6, kPFix_EMPTY },
  149. { kPFix_SPtheSP, kIdentity, kPFix_EMPTY },
  150. { kPFix_EMPTY, kOmitLast4, kPFix_EMPTY },
  151. { kPFix_EMPTY, kIdentity, kPFix_DOTSPTheSP },
  152. { kPFix_EMPTY, kUppercaseAll, kPFix_EMPTY },
  153. { kPFix_EMPTY, kIdentity, kPFix_SPonSP },
  154. { kPFix_EMPTY, kIdentity, kPFix_SPasSP },
  155. { kPFix_EMPTY, kIdentity, kPFix_SPisSP },
  156. { kPFix_EMPTY, kOmitLast7, kPFix_EMPTY },
  157. { kPFix_EMPTY, kOmitLast1, kPFix_ingSP },
  158. { kPFix_EMPTY, kIdentity, kPFix_NEWLINETAB },
  159. { kPFix_EMPTY, kIdentity, kPFix_COLON },
  160. { kPFix_SP, kIdentity, kPFix_DOTSP },
  161. { kPFix_EMPTY, kIdentity, kPFix_edSP },
  162. { kPFix_EMPTY, kOmitFirst9, kPFix_EMPTY },
  163. { kPFix_EMPTY, kOmitFirst7, kPFix_EMPTY },
  164. { kPFix_EMPTY, kOmitLast6, kPFix_EMPTY },
  165. { kPFix_EMPTY, kIdentity, kPFix_OPEN },
  166. { kPFix_EMPTY, kUppercaseFirst, kPFix_COMMASP },
  167. { kPFix_EMPTY, kOmitLast8, kPFix_EMPTY },
  168. { kPFix_EMPTY, kIdentity, kPFix_SPatSP },
  169. { kPFix_EMPTY, kIdentity, kPFix_lySP },
  170. { kPFix_SPtheSP, kIdentity, kPFix_SPofSP },
  171. { kPFix_EMPTY, kOmitLast5, kPFix_EMPTY },
  172. { kPFix_EMPTY, kOmitLast9, kPFix_EMPTY },
  173. { kPFix_SP, kUppercaseFirst, kPFix_COMMASP },
  174. { kPFix_EMPTY, kUppercaseFirst, kPFix_DQUOT },
  175. { kPFix_DOT, kIdentity, kPFix_OPEN },
  176. { kPFix_EMPTY, kUppercaseAll, kPFix_SP },
  177. { kPFix_EMPTY, kUppercaseFirst, kPFix_DQUOTGT },
  178. { kPFix_EMPTY, kIdentity, kPFix_EQDQUOT },
  179. { kPFix_SP, kIdentity, kPFix_DOT },
  180. { kPFix_DOTcomSLASH, kIdentity, kPFix_EMPTY },
  181. { kPFix_SPtheSP, kIdentity, kPFix_SPofSPtheSP },
  182. { kPFix_EMPTY, kUppercaseFirst, kPFix_SQUOT },
  183. { kPFix_EMPTY, kIdentity, kPFix_DOTSPThisSP },
  184. { kPFix_EMPTY, kIdentity, kPFix_COMMA },
  185. { kPFix_DOT, kIdentity, kPFix_SP },
  186. { kPFix_EMPTY, kUppercaseFirst, kPFix_OPEN },
  187. { kPFix_EMPTY, kUppercaseFirst, kPFix_DOT },
  188. { kPFix_EMPTY, kIdentity, kPFix_SPnotSP },
  189. { kPFix_SP, kIdentity, kPFix_EQDQUOT },
  190. { kPFix_EMPTY, kIdentity, kPFix_erSP },
  191. { kPFix_SP, kUppercaseAll, kPFix_SP },
  192. { kPFix_EMPTY, kIdentity, kPFix_alSP },
  193. { kPFix_SP, kUppercaseAll, kPFix_EMPTY },
  194. { kPFix_EMPTY, kIdentity, kPFix_EQSQUOT },
  195. { kPFix_EMPTY, kUppercaseAll, kPFix_DQUOT },
  196. { kPFix_EMPTY, kUppercaseFirst, kPFix_DOTSP },
  197. { kPFix_SP, kIdentity, kPFix_OPEN },
  198. { kPFix_EMPTY, kIdentity, kPFix_fulSP },
  199. { kPFix_SP, kUppercaseFirst, kPFix_DOTSP },
  200. { kPFix_EMPTY, kIdentity, kPFix_iveSP },
  201. { kPFix_EMPTY, kIdentity, kPFix_lessSP },
  202. { kPFix_EMPTY, kUppercaseAll, kPFix_SQUOT },
  203. { kPFix_EMPTY, kIdentity, kPFix_estSP },
  204. { kPFix_SP, kUppercaseFirst, kPFix_DOT },
  205. { kPFix_EMPTY, kUppercaseAll, kPFix_DQUOTGT },
  206. { kPFix_SP, kIdentity, kPFix_EQSQUOT },
  207. { kPFix_EMPTY, kUppercaseFirst, kPFix_COMMA },
  208. { kPFix_EMPTY, kIdentity, kPFix_izeSP },
  209. { kPFix_EMPTY, kUppercaseAll, kPFix_DOT },
  210. { kPFix_NBSP, kIdentity, kPFix_EMPTY },
  211. { kPFix_SP, kIdentity, kPFix_COMMA },
  212. { kPFix_EMPTY, kUppercaseFirst, kPFix_EQDQUOT },
  213. { kPFix_EMPTY, kUppercaseAll, kPFix_EQDQUOT },
  214. { kPFix_EMPTY, kIdentity, kPFix_ousSP },
  215. { kPFix_EMPTY, kUppercaseAll, kPFix_COMMASP },
  216. { kPFix_EMPTY, kUppercaseFirst, kPFix_EQSQUOT },
  217. { kPFix_SP, kUppercaseFirst, kPFix_COMMA },
  218. { kPFix_SP, kUppercaseAll, kPFix_EQDQUOT },
  219. { kPFix_SP, kUppercaseAll, kPFix_COMMASP },
  220. { kPFix_EMPTY, kUppercaseAll, kPFix_COMMA },
  221. { kPFix_EMPTY, kUppercaseAll, kPFix_OPEN },
  222. { kPFix_EMPTY, kUppercaseAll, kPFix_DOTSP },
  223. { kPFix_SP, kUppercaseAll, kPFix_DOT },
  224. { kPFix_EMPTY, kUppercaseAll, kPFix_EQSQUOT },
  225. { kPFix_SP, kUppercaseAll, kPFix_DOTSP },
  226. { kPFix_SP, kUppercaseFirst, kPFix_EQDQUOT },
  227. { kPFix_SP, kUppercaseAll, kPFix_EQSQUOT },
  228. { kPFix_SP, kUppercaseFirst, kPFix_EQSQUOT },
  229. };
  230. static const int kNumTransforms = sizeof(kTransforms) / sizeof(kTransforms[0]);
  231. static int ToUpperCase(uint8_t* p) {
  232. if (p[0] < 0xc0) {
  233. if (p[0] >= 'a' && p[0] <= 'z') {
  234. p[0] ^= 32;
  235. }
  236. return 1;
  237. }
  238. /* An overly simplified uppercasing model for utf-8. */
  239. if (p[0] < 0xe0) {
  240. p[1] ^= 32;
  241. return 2;
  242. }
  243. /* An arbitrary transform for three byte characters. */
  244. p[2] ^= 5;
  245. return 3;
  246. }
  247. static BROTLI_NOINLINE int TransformDictionaryWord(
  248. uint8_t* dst, const uint8_t* word, int len, int transform) {
  249. int idx = 0;
  250. {
  251. const char* prefix = &kPrefixSuffix[kTransforms[transform].prefix_id];
  252. while (*prefix) { dst[idx++] = (uint8_t)*prefix++; }
  253. }
  254. {
  255. const int t = kTransforms[transform].transform;
  256. int i = 0;
  257. int skip = t - (kOmitFirst1 - 1);
  258. if (skip > 0) {
  259. word += skip;
  260. len -= skip;
  261. } else if (t <= kOmitLast9) {
  262. len -= t;
  263. }
  264. while (i < len) { dst[idx++] = word[i++]; }
  265. if (t == kUppercaseFirst) {
  266. ToUpperCase(&dst[idx - len]);
  267. } else if (t == kUppercaseAll) {
  268. uint8_t* uppercase = &dst[idx - len];
  269. while (len > 0) {
  270. int step = ToUpperCase(uppercase);
  271. uppercase += step;
  272. len -= step;
  273. }
  274. }
  275. }
  276. {
  277. const char* suffix = &kPrefixSuffix[kTransforms[transform].suffix_id];
  278. while (*suffix) { dst[idx++] = (uint8_t)*suffix++; }
  279. return idx;
  280. }
  281. }
  282. #if defined(__cplusplus) || defined(c_plusplus)
  283. } /* extern "C" */
  284. #endif
  285. #endif /* BROTLI_DEC_TRANSFORM_H_ */