static_dict.c 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480
  1. /* Copyright 2013 Google Inc. All Rights Reserved.
  2. Distributed under MIT license.
  3. See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
  4. */
  5. #include "./static_dict.h"
  6. #include "../common/dictionary.h"
  7. #include "./find_match_length.h"
  8. #include "./port.h"
  9. #include "./static_dict_lut.h"
  10. #if defined(__cplusplus) || defined(c_plusplus)
  11. extern "C" {
  12. #endif
  13. static const uint8_t kUppercaseFirst = 10;
  14. static const uint8_t kOmitLastNTransforms[10] = {
  15. 0, 12, 27, 23, 42, 63, 56, 48, 59, 64,
  16. };
  17. static BROTLI_INLINE uint32_t Hash(const uint8_t *data) {
  18. uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kDictHashMul32;
  19. /* The higher bits contain more mixture from the multiplication,
  20. so we take our results from there. */
  21. return h >> (32 - kDictNumBits);
  22. }
  23. static BROTLI_INLINE void AddMatch(size_t distance, size_t len, size_t len_code,
  24. uint32_t* matches) {
  25. uint32_t match = (uint32_t)((distance << 5) + len_code);
  26. matches[len] = BROTLI_MIN(uint32_t, matches[len], match);
  27. }
  28. static BROTLI_INLINE size_t DictMatchLength(const uint8_t* data,
  29. size_t id,
  30. size_t len,
  31. size_t maxlen) {
  32. const size_t offset = kBrotliDictionaryOffsetsByLength[len] + len * id;
  33. return FindMatchLengthWithLimit(&kBrotliDictionary[offset], data,
  34. BROTLI_MIN(size_t, len, maxlen));
  35. }
  36. static BROTLI_INLINE int IsMatch(
  37. DictWord w, const uint8_t* data, size_t max_length) {
  38. if (w.len > max_length) {
  39. return 0;
  40. } else {
  41. const size_t offset = kBrotliDictionaryOffsetsByLength[w.len] +
  42. (size_t)w.len * (size_t)w.idx;
  43. const uint8_t* dict = &kBrotliDictionary[offset];
  44. if (w.transform == 0) {
  45. /* Match against base dictionary word. */
  46. return FindMatchLengthWithLimit(dict, data, w.len) == w.len;
  47. } else if (w.transform == 10) {
  48. /* Match against uppercase first transform.
  49. Note that there are only ASCII uppercase words in the lookup table. */
  50. return (dict[0] >= 'a' && dict[0] <= 'z' &&
  51. (dict[0] ^ 32) == data[0] &&
  52. FindMatchLengthWithLimit(&dict[1], &data[1], w.len - 1u) ==
  53. w.len - 1u);
  54. } else {
  55. /* Match against uppercase all transform.
  56. Note that there are only ASCII uppercase words in the lookup table. */
  57. size_t i;
  58. for (i = 0; i < w.len; ++i) {
  59. if (dict[i] >= 'a' && dict[i] <= 'z') {
  60. if ((dict[i] ^ 32) != data[i]) return 0;
  61. } else {
  62. if (dict[i] != data[i]) return 0;
  63. }
  64. }
  65. return 1;
  66. }
  67. }
  68. }
  69. int BrotliFindAllStaticDictionaryMatches(const uint8_t* data,
  70. size_t min_length,
  71. size_t max_length,
  72. uint32_t* matches) {
  73. int has_found_match = 0;
  74. size_t key0 = Hash(data);
  75. size_t bucket0 = kStaticDictionaryBuckets[key0];
  76. if (bucket0 != 0) {
  77. size_t num = bucket0 & 0xff;
  78. size_t offset = bucket0 >> 8;
  79. size_t i;
  80. for (i = 0; i < num; ++i) {
  81. const DictWord w = kStaticDictionaryWords[offset + i];
  82. const size_t l = w.len;
  83. const size_t n = (size_t)1 << kBrotliDictionarySizeBitsByLength[l];
  84. const size_t id = w.idx;
  85. if (w.transform == 0) {
  86. const size_t matchlen = DictMatchLength(data, id, l, max_length);
  87. const uint8_t* s;
  88. size_t minlen;
  89. size_t maxlen;
  90. size_t len;
  91. /* Transform "" + kIdentity + "" */
  92. if (matchlen == l) {
  93. AddMatch(id, l, l, matches);
  94. has_found_match = 1;
  95. }
  96. /* Transforms "" + kOmitLast1 + "" and "" + kOmitLast1 + "ing " */
  97. if (matchlen >= l - 1) {
  98. AddMatch(id + 12 * n, l - 1, l, matches);
  99. if (l + 2 < max_length &&
  100. data[l - 1] == 'i' && data[l] == 'n' && data[l + 1] == 'g' &&
  101. data[l + 2] == ' ') {
  102. AddMatch(id + 49 * n, l + 3, l, matches);
  103. }
  104. has_found_match = 1;
  105. }
  106. /* Transform "" + kOmitLastN + "" (N = 2 .. 9) */
  107. minlen = min_length;
  108. if (l > 9) minlen = BROTLI_MAX(size_t, minlen, l - 9);
  109. maxlen = BROTLI_MIN(size_t, matchlen, l - 2);
  110. for (len = minlen; len <= maxlen; ++len) {
  111. AddMatch(id + kOmitLastNTransforms[l - len] * n, len, l, matches);
  112. has_found_match = 1;
  113. }
  114. if (matchlen < l || l + 6 >= max_length) {
  115. continue;
  116. }
  117. s = &data[l];
  118. /* Transforms "" + kIdentity + <suffix> */
  119. if (s[0] == ' ') {
  120. AddMatch(id + n, l + 1, l, matches);
  121. if (s[1] == 'a') {
  122. if (s[2] == ' ') {
  123. AddMatch(id + 28 * n, l + 3, l, matches);
  124. } else if (s[2] == 's') {
  125. if (s[3] == ' ') AddMatch(id + 46 * n, l + 4, l, matches);
  126. } else if (s[2] == 't') {
  127. if (s[3] == ' ') AddMatch(id + 60 * n, l + 4, l, matches);
  128. } else if (s[2] == 'n') {
  129. if (s[3] == 'd' && s[4] == ' ') {
  130. AddMatch(id + 10 * n, l + 5, l, matches);
  131. }
  132. }
  133. } else if (s[1] == 'b') {
  134. if (s[2] == 'y' && s[3] == ' ') {
  135. AddMatch(id + 38 * n, l + 4, l, matches);
  136. }
  137. } else if (s[1] == 'i') {
  138. if (s[2] == 'n') {
  139. if (s[3] == ' ') AddMatch(id + 16 * n, l + 4, l, matches);
  140. } else if (s[2] == 's') {
  141. if (s[3] == ' ') AddMatch(id + 47 * n, l + 4, l, matches);
  142. }
  143. } else if (s[1] == 'f') {
  144. if (s[2] == 'o') {
  145. if (s[3] == 'r' && s[4] == ' ') {
  146. AddMatch(id + 25 * n, l + 5, l, matches);
  147. }
  148. } else if (s[2] == 'r') {
  149. if (s[3] == 'o' && s[4] == 'm' && s[5] == ' ') {
  150. AddMatch(id + 37 * n, l + 6, l, matches);
  151. }
  152. }
  153. } else if (s[1] == 'o') {
  154. if (s[2] == 'f') {
  155. if (s[3] == ' ') AddMatch(id + 8 * n, l + 4, l, matches);
  156. } else if (s[2] == 'n') {
  157. if (s[3] == ' ') AddMatch(id + 45 * n, l + 4, l, matches);
  158. }
  159. } else if (s[1] == 'n') {
  160. if (s[2] == 'o' && s[3] == 't' && s[4] == ' ') {
  161. AddMatch(id + 80 * n, l + 5, l, matches);
  162. }
  163. } else if (s[1] == 't') {
  164. if (s[2] == 'h') {
  165. if (s[3] == 'e') {
  166. if (s[4] == ' ') AddMatch(id + 5 * n, l + 5, l, matches);
  167. } else if (s[3] == 'a') {
  168. if (s[4] == 't' && s[5] == ' ') {
  169. AddMatch(id + 29 * n, l + 6, l, matches);
  170. }
  171. }
  172. } else if (s[2] == 'o') {
  173. if (s[3] == ' ') AddMatch(id + 17 * n, l + 4, l, matches);
  174. }
  175. } else if (s[1] == 'w') {
  176. if (s[2] == 'i' && s[3] == 't' && s[4] == 'h' && s[5] == ' ') {
  177. AddMatch(id + 35 * n, l + 6, l, matches);
  178. }
  179. }
  180. } else if (s[0] == '"') {
  181. AddMatch(id + 19 * n, l + 1, l, matches);
  182. if (s[1] == '>') {
  183. AddMatch(id + 21 * n, l + 2, l, matches);
  184. }
  185. } else if (s[0] == '.') {
  186. AddMatch(id + 20 * n, l + 1, l, matches);
  187. if (s[1] == ' ') {
  188. AddMatch(id + 31 * n, l + 2, l, matches);
  189. if (s[2] == 'T' && s[3] == 'h') {
  190. if (s[4] == 'e') {
  191. if (s[5] == ' ') AddMatch(id + 43 * n, l + 6, l, matches);
  192. } else if (s[4] == 'i') {
  193. if (s[5] == 's' && s[6] == ' ') {
  194. AddMatch(id + 75 * n, l + 7, l, matches);
  195. }
  196. }
  197. }
  198. }
  199. } else if (s[0] == ',') {
  200. AddMatch(id + 76 * n, l + 1, l, matches);
  201. if (s[1] == ' ') {
  202. AddMatch(id + 14 * n, l + 2, l, matches);
  203. }
  204. } else if (s[0] == '\n') {
  205. AddMatch(id + 22 * n, l + 1, l, matches);
  206. if (s[1] == '\t') {
  207. AddMatch(id + 50 * n, l + 2, l, matches);
  208. }
  209. } else if (s[0] == ']') {
  210. AddMatch(id + 24 * n, l + 1, l, matches);
  211. } else if (s[0] == '\'') {
  212. AddMatch(id + 36 * n, l + 1, l, matches);
  213. } else if (s[0] == ':') {
  214. AddMatch(id + 51 * n, l + 1, l, matches);
  215. } else if (s[0] == '(') {
  216. AddMatch(id + 57 * n, l + 1, l, matches);
  217. } else if (s[0] == '=') {
  218. if (s[1] == '"') {
  219. AddMatch(id + 70 * n, l + 2, l, matches);
  220. } else if (s[1] == '\'') {
  221. AddMatch(id + 86 * n, l + 2, l, matches);
  222. }
  223. } else if (s[0] == 'a') {
  224. if (s[1] == 'l' && s[2] == ' ') {
  225. AddMatch(id + 84 * n, l + 3, l, matches);
  226. }
  227. } else if (s[0] == 'e') {
  228. if (s[1] == 'd') {
  229. if (s[2] == ' ') AddMatch(id + 53 * n, l + 3, l, matches);
  230. } else if (s[1] == 'r') {
  231. if (s[2] == ' ') AddMatch(id + 82 * n, l + 3, l, matches);
  232. } else if (s[1] == 's') {
  233. if (s[2] == 't' && s[3] == ' ') {
  234. AddMatch(id + 95 * n, l + 4, l, matches);
  235. }
  236. }
  237. } else if (s[0] == 'f') {
  238. if (s[1] == 'u' && s[2] == 'l' && s[3] == ' ') {
  239. AddMatch(id + 90 * n, l + 4, l, matches);
  240. }
  241. } else if (s[0] == 'i') {
  242. if (s[1] == 'v') {
  243. if (s[2] == 'e' && s[3] == ' ') {
  244. AddMatch(id + 92 * n, l + 4, l, matches);
  245. }
  246. } else if (s[1] == 'z') {
  247. if (s[2] == 'e' && s[3] == ' ') {
  248. AddMatch(id + 100 * n, l + 4, l, matches);
  249. }
  250. }
  251. } else if (s[0] == 'l') {
  252. if (s[1] == 'e') {
  253. if (s[2] == 's' && s[3] == 's' && s[4] == ' ') {
  254. AddMatch(id + 93 * n, l + 5, l, matches);
  255. }
  256. } else if (s[1] == 'y') {
  257. if (s[2] == ' ') AddMatch(id + 61 * n, l + 3, l, matches);
  258. }
  259. } else if (s[0] == 'o') {
  260. if (s[1] == 'u' && s[2] == 's' && s[3] == ' ') {
  261. AddMatch(id + 106 * n, l + 4, l, matches);
  262. }
  263. }
  264. } else {
  265. /* Set is_all_caps=0 for kUppercaseFirst and
  266. is_all_caps=1 otherwise (kUppercaseAll) transform. */
  267. const int is_all_caps = (w.transform != kUppercaseFirst) ? 1 : 0;
  268. const uint8_t* s;
  269. if (!IsMatch(w, data, max_length)) {
  270. continue;
  271. }
  272. /* Transform "" + kUppercase{First,All} + "" */
  273. AddMatch(id + (is_all_caps ? 44 : 9) * n, l, l, matches);
  274. has_found_match = 1;
  275. if (l + 1 >= max_length) {
  276. continue;
  277. }
  278. /* Transforms "" + kUppercase{First,All} + <suffix> */
  279. s = &data[l];
  280. if (s[0] == ' ') {
  281. AddMatch(id + (is_all_caps ? 68 : 4) * n, l + 1, l, matches);
  282. } else if (s[0] == '"') {
  283. AddMatch(id + (is_all_caps ? 87 : 66) * n, l + 1, l, matches);
  284. if (s[1] == '>') {
  285. AddMatch(id + (is_all_caps ? 97 : 69) * n, l + 2, l, matches);
  286. }
  287. } else if (s[0] == '.') {
  288. AddMatch(id + (is_all_caps ? 101 : 79) * n, l + 1, l, matches);
  289. if (s[1] == ' ') {
  290. AddMatch(id + (is_all_caps ? 114 : 88) * n, l + 2, l, matches);
  291. }
  292. } else if (s[0] == ',') {
  293. AddMatch(id + (is_all_caps ? 112 : 99) * n, l + 1, l, matches);
  294. if (s[1] == ' ') {
  295. AddMatch(id + (is_all_caps ? 107 : 58) * n, l + 2, l, matches);
  296. }
  297. } else if (s[0] == '\'') {
  298. AddMatch(id + (is_all_caps ? 94 : 74) * n, l + 1, l, matches);
  299. } else if (s[0] == '(') {
  300. AddMatch(id + (is_all_caps ? 113 : 78) * n, l + 1, l, matches);
  301. } else if (s[0] == '=') {
  302. if (s[1] == '"') {
  303. AddMatch(id + (is_all_caps ? 105 : 104) * n, l + 2, l, matches);
  304. } else if (s[1] == '\'') {
  305. AddMatch(id + (is_all_caps ? 116 : 108) * n, l + 2, l, matches);
  306. }
  307. }
  308. }
  309. }
  310. }
  311. /* Transforms with prefixes " " and "." */
  312. if (max_length >= 5 && (data[0] == ' ' || data[0] == '.')) {
  313. int is_space = (data[0] == ' ') ? 1 : 0;
  314. size_t key1 = Hash(&data[1]);
  315. size_t bucket1 = kStaticDictionaryBuckets[key1];
  316. size_t num = bucket1 & 0xff;
  317. size_t offset = bucket1 >> 8;
  318. size_t i;
  319. for (i = 0; i < num; ++i) {
  320. const DictWord w = kStaticDictionaryWords[offset + i];
  321. const size_t l = w.len;
  322. const size_t n = (size_t)1 << kBrotliDictionarySizeBitsByLength[l];
  323. const size_t id = w.idx;
  324. if (w.transform == 0) {
  325. const uint8_t* s;
  326. if (!IsMatch(w, &data[1], max_length - 1)) {
  327. continue;
  328. }
  329. /* Transforms " " + kIdentity + "" and "." + kIdentity + "" */
  330. AddMatch(id + (is_space ? 6 : 32) * n, l + 1, l, matches);
  331. has_found_match = 1;
  332. if (l + 2 >= max_length) {
  333. continue;
  334. }
  335. /* Transforms " " + kIdentity + <suffix> and "." + kIdentity + <suffix>
  336. */
  337. s = &data[l + 1];
  338. if (s[0] == ' ') {
  339. AddMatch(id + (is_space ? 2 : 77) * n, l + 2, l, matches);
  340. } else if (s[0] == '(') {
  341. AddMatch(id + (is_space ? 89 : 67) * n, l + 2, l, matches);
  342. } else if (is_space) {
  343. if (s[0] == ',') {
  344. AddMatch(id + 103 * n, l + 2, l, matches);
  345. if (s[1] == ' ') {
  346. AddMatch(id + 33 * n, l + 3, l, matches);
  347. }
  348. } else if (s[0] == '.') {
  349. AddMatch(id + 71 * n, l + 2, l, matches);
  350. if (s[1] == ' ') {
  351. AddMatch(id + 52 * n, l + 3, l, matches);
  352. }
  353. } else if (s[0] == '=') {
  354. if (s[1] == '"') {
  355. AddMatch(id + 81 * n, l + 3, l, matches);
  356. } else if (s[1] == '\'') {
  357. AddMatch(id + 98 * n, l + 3, l, matches);
  358. }
  359. }
  360. }
  361. } else if (is_space) {
  362. /* Set is_all_caps=0 for kUppercaseFirst and
  363. is_all_caps=1 otherwise (kUppercaseAll) transform. */
  364. const int is_all_caps = (w.transform != kUppercaseFirst) ? 1 : 0;
  365. const uint8_t* s;
  366. if (!IsMatch(w, &data[1], max_length - 1)) {
  367. continue;
  368. }
  369. /* Transforms " " + kUppercase{First,All} + "" */
  370. AddMatch(id + (is_all_caps ? 85 : 30) * n, l + 1, l, matches);
  371. has_found_match = 1;
  372. if (l + 2 >= max_length) {
  373. continue;
  374. }
  375. /* Transforms " " + kUppercase{First,All} + <suffix> */
  376. s = &data[l + 1];
  377. if (s[0] == ' ') {
  378. AddMatch(id + (is_all_caps ? 83 : 15) * n, l + 2, l, matches);
  379. } else if (s[0] == ',') {
  380. if (!is_all_caps) {
  381. AddMatch(id + 109 * n, l + 2, l, matches);
  382. }
  383. if (s[1] == ' ') {
  384. AddMatch(id + (is_all_caps ? 111 : 65) * n, l + 3, l, matches);
  385. }
  386. } else if (s[0] == '.') {
  387. AddMatch(id + (is_all_caps ? 115 : 96) * n, l + 2, l, matches);
  388. if (s[1] == ' ') {
  389. AddMatch(id + (is_all_caps ? 117 : 91) * n, l + 3, l, matches);
  390. }
  391. } else if (s[0] == '=') {
  392. if (s[1] == '"') {
  393. AddMatch(id + (is_all_caps ? 110 : 118) * n, l + 3, l, matches);
  394. } else if (s[1] == '\'') {
  395. AddMatch(id + (is_all_caps ? 119 : 120) * n, l + 3, l, matches);
  396. }
  397. }
  398. }
  399. }
  400. }
  401. if (max_length >= 6) {
  402. /* Transforms with prefixes "e ", "s ", ", " and "\xc2\xa0" */
  403. if ((data[1] == ' ' &&
  404. (data[0] == 'e' || data[0] == 's' || data[0] == ',')) ||
  405. (data[0] == 0xc2 && data[1] == 0xa0)) {
  406. size_t key2 = Hash(&data[2]);
  407. size_t bucket2 = kStaticDictionaryBuckets[key2];
  408. size_t num = bucket2 & 0xff;
  409. size_t offset = bucket2 >> 8;
  410. size_t i;
  411. for (i = 0; i < num; ++i) {
  412. const DictWord w = kStaticDictionaryWords[offset + i];
  413. const size_t l = w.len;
  414. const size_t n = (size_t)1 << kBrotliDictionarySizeBitsByLength[l];
  415. const size_t id = w.idx;
  416. if (w.transform == 0 && IsMatch(w, &data[2], max_length - 2)) {
  417. if (data[0] == 0xc2) {
  418. AddMatch(id + 102 * n, l + 2, l, matches);
  419. has_found_match = 1;
  420. } else if (l + 2 < max_length && data[l + 2] == ' ') {
  421. size_t t = data[0] == 'e' ? 18 : (data[0] == 's' ? 7 : 13);
  422. AddMatch(id + t * n, l + 3, l, matches);
  423. has_found_match = 1;
  424. }
  425. }
  426. }
  427. }
  428. }
  429. if (max_length >= 9) {
  430. /* Transforms with prefixes " the " and ".com/" */
  431. if ((data[0] == ' ' && data[1] == 't' && data[2] == 'h' &&
  432. data[3] == 'e' && data[4] == ' ') ||
  433. (data[0] == '.' && data[1] == 'c' && data[2] == 'o' &&
  434. data[3] == 'm' && data[4] == '/')) {
  435. size_t key5 = Hash(&data[5]);
  436. size_t bucket5 = kStaticDictionaryBuckets[key5];
  437. size_t num = bucket5 & 0xff;
  438. size_t offset = bucket5 >> 8;
  439. size_t i;
  440. for (i = 0; i < num; ++i) {
  441. const DictWord w = kStaticDictionaryWords[offset + i];
  442. const size_t l = w.len;
  443. const size_t n = (size_t)1 << kBrotliDictionarySizeBitsByLength[l];
  444. const size_t id = w.idx;
  445. if (w.transform == 0 && IsMatch(w, &data[5], max_length - 5)) {
  446. AddMatch(id + (data[0] == ' ' ? 41 : 72) * n, l + 5, l, matches);
  447. has_found_match = 1;
  448. if (l + 5 < max_length) {
  449. const uint8_t* s = &data[l + 5];
  450. if (data[0] == ' ') {
  451. if (l + 8 < max_length &&
  452. s[0] == ' ' && s[1] == 'o' && s[2] == 'f' && s[3] == ' ') {
  453. AddMatch(id + 62 * n, l + 9, l, matches);
  454. if (l + 12 < max_length &&
  455. s[4] == 't' && s[5] == 'h' && s[6] == 'e' && s[7] == ' ') {
  456. AddMatch(id + 73 * n, l + 13, l, matches);
  457. }
  458. }
  459. }
  460. }
  461. }
  462. }
  463. }
  464. }
  465. return has_found_match;
  466. }
  467. #if defined(__cplusplus) || defined(c_plusplus)
  468. } /* extern "C" */
  469. #endif