trie.c 31 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372
  1. /*
  2. * libwebsockets - small server side websockets and web server implementation
  3. *
  4. * Copyright (C) 2010 - 2019 Andy Green <[email protected]>
  5. *
  6. * Permission is hereby granted, free of charge, to any person obtaining a copy
  7. * of this software and associated documentation files (the "Software"), to
  8. * deal in the Software without restriction, including without limitation the
  9. * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  10. * sell copies of the Software, and to permit persons to whom the Software is
  11. * furnished to do so, subject to the following conditions:
  12. *
  13. * The above copyright notice and this permission notice shall be included in
  14. * all copies or substantial portions of the Software.
  15. *
  16. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21. * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22. * IN THE SOFTWARE.
  23. *
  24. * The functions allow
  25. *
  26. * - collecting a concordance of strings from one or more files (eg, a
  27. * directory of files) into a single in-memory, lac-backed trie;
  28. *
  29. * - to optimize and serialize the in-memory trie to an fd;
  30. *
  31. * - to very quickly report any instances of a string in any of the files
  32. * indexed by the trie, by a seeking around a serialized trie fd, without
  33. * having to load it all in memory
  34. */
  35. #include "private-lib-core.h"
  36. #include "private-lib-misc-fts.h"
  37. #include <stdio.h>
  38. #include <string.h>
  39. #include <assert.h>
  40. #include <fcntl.h>
  41. #include <errno.h>
  42. #include <sys/types.h>
  43. struct lws_fts_entry;
  44. /* notice these are stored in t->lwsac_input_head which has input file scope */
  45. struct lws_fts_filepath {
  46. struct lws_fts_filepath *next;
  47. struct lws_fts_filepath *prev;
  48. char filepath[256];
  49. jg2_file_offset ofs;
  50. jg2_file_offset line_table_ofs;
  51. int filepath_len;
  52. int file_index;
  53. int total_lines;
  54. int priority;
  55. };
  56. /* notice these are stored in t->lwsac_input_head which has input file scope */
  57. struct lws_fts_lines {
  58. struct lws_fts_lines *lines_next;
  59. /*
  60. * amount of line numbers needs to meet average count for best
  61. * efficiency.
  62. *
  63. * Line numbers are stored in VLI format since if we don't, around half
  64. * the total lac allocation consists of struct lws_fts_lines...
  65. * size chosen to maintain 8-byte struct alignment
  66. */
  67. uint8_t vli[119];
  68. char count;
  69. };
  70. /* this represents the instances of a symbol inside a given filepath */
  71. struct lws_fts_instance_file {
  72. /* linked-list of tifs generated for current file */
  73. struct lws_fts_instance_file *inst_file_next;
  74. struct lws_fts_entry *owner;
  75. struct lws_fts_lines *lines_list, *lines_tail;
  76. uint32_t file_index;
  77. uint32_t total;
  78. /*
  79. * optimization for the common case there's only 1 - ~3 matches, so we
  80. * don't have to allocate any lws_fts_lines struct
  81. *
  82. * Using 8 bytes total for this maintains 8-byte struct alignment...
  83. */
  84. uint8_t vli[7];
  85. char count;
  86. };
  87. /*
  88. * this is the main trie in-memory allocation object
  89. */
  90. struct lws_fts_entry {
  91. struct lws_fts_entry *parent;
  92. struct lws_fts_entry *child_list;
  93. struct lws_fts_entry *sibling;
  94. /*
  95. * care... this points to content in t->lwsac_input_head, it goes
  96. * out of scope when the input file being indexed completes
  97. */
  98. struct lws_fts_instance_file *inst_file_list;
  99. jg2_file_offset ofs_last_inst_file;
  100. char *suffix; /* suffix string or NULL if one char (in .c) */
  101. jg2_file_offset ofs;
  102. uint32_t child_count;
  103. uint32_t instance_count;
  104. uint32_t agg_inst_count;
  105. uint32_t agg_child_count;
  106. uint32_t suffix_len;
  107. unsigned char c;
  108. };
  109. /* there's only one of these per trie file */
  110. struct lws_fts {
  111. struct lwsac *lwsac_head;
  112. struct lwsac *lwsac_input_head;
  113. struct lws_fts_entry *root;
  114. struct lws_fts_filepath *filepath_list;
  115. struct lws_fts_filepath *fp;
  116. struct lws_fts_entry *parser;
  117. struct lws_fts_entry *root_lookup[256];
  118. /*
  119. * head of linked-list of tifs generated for current file
  120. * care... this points to content in t->lwsac_input_head
  121. */
  122. struct lws_fts_instance_file *tif_list;
  123. jg2_file_offset c; /* length of output file so far */
  124. uint64_t agg_trie_creation_us;
  125. uint64_t agg_raw_input;
  126. uint64_t worst_lwsac_input_size;
  127. int last_file_index;
  128. int chars_in_line;
  129. jg2_file_offset last_block_len_ofs;
  130. int line_number;
  131. int lines_in_unsealed_linetable;
  132. int next_file_index;
  133. int count_entries;
  134. int fd;
  135. unsigned int agg_pos;
  136. unsigned int str_match_pos;
  137. unsigned char aggregate;
  138. unsigned char agg[128];
  139. };
  140. /* since the kernel case allocates >300MB, no point keeping this too low */
  141. #define TRIE_LWSAC_BLOCK_SIZE (1024 * 1024)
  142. #define spill(margin, force) \
  143. if (bp && ((uint32_t)bp >= (sizeof(buf) - (margin)) || (force))) { \
  144. if (write(t->fd, buf, bp) != bp) { \
  145. lwsl_err("%s: write %d failed (%d)\n", __func__, \
  146. bp, errno); \
  147. return 1; \
  148. } \
  149. t->c += bp; \
  150. bp = 0; \
  151. }
  152. static int
  153. g32(unsigned char *b, uint32_t d)
  154. {
  155. *b++ = (d >> 24) & 0xff;
  156. *b++ = (d >> 16) & 0xff;
  157. *b++ = (d >> 8) & 0xff;
  158. *b = d & 0xff;
  159. return 4;
  160. }
  161. static int
  162. g16(unsigned char *b, int d)
  163. {
  164. *b++ = (d >> 8) & 0xff;
  165. *b = d & 0xff;
  166. return 2;
  167. }
  168. static int
  169. wq32(unsigned char *b, uint32_t d)
  170. {
  171. unsigned char *ob = b;
  172. if (d > (1 << 28) - 1)
  173. *b++ = ((d >> 28) | 0x80) & 0xff;
  174. if (d > (1 << 21) - 1)
  175. *b++ = ((d >> 21) | 0x80) & 0xff;
  176. if (d > (1 << 14) - 1)
  177. *b++ = ((d >> 14) | 0x80) & 0xff;
  178. if (d > (1 << 7) - 1)
  179. *b++ = ((d >> 7) | 0x80) & 0xff;
  180. *b++ = d & 0x7f;
  181. return (int)(b - ob);
  182. }
  183. /* read a VLI, return the number of bytes used */
  184. int
  185. rq32(unsigned char *b, uint32_t *d)
  186. {
  187. unsigned char *ob = b;
  188. uint32_t t = 0;
  189. t = *b & 0x7f;
  190. if (*(b++) & 0x80) {
  191. t = (t << 7) | (*b & 0x7f);
  192. if (*(b++) & 0x80) {
  193. t = (t << 7) | (*b & 0x7f);
  194. if (*(b++) & 0x80) {
  195. t = (t << 7) | (*b & 0x7f);
  196. if (*(b++) & 0x80) {
  197. t = (t << 7) | (*b & 0x7f);
  198. b++;
  199. }
  200. }
  201. }
  202. }
  203. *d = t;
  204. return (int)(b - ob);
  205. }
  206. struct lws_fts *
  207. lws_fts_create(int fd)
  208. {
  209. struct lws_fts *t;
  210. struct lwsac *lwsac_head = NULL;
  211. unsigned char buf[TRIE_FILE_HDR_SIZE];
  212. t = lwsac_use(&lwsac_head, sizeof(*t), TRIE_LWSAC_BLOCK_SIZE);
  213. if (!t)
  214. return NULL;
  215. memset(t, 0, sizeof(*t));
  216. t->fd = fd;
  217. t->lwsac_head = lwsac_head;
  218. t->root = lwsac_use(&lwsac_head, sizeof(*t->root),
  219. TRIE_LWSAC_BLOCK_SIZE);
  220. if (!t->root)
  221. goto unwind;
  222. memset(t->root, 0, sizeof(*t->root));
  223. t->parser = t->root;
  224. t->last_file_index = -1;
  225. t->line_number = 1;
  226. t->filepath_list = NULL;
  227. memset(t->root_lookup, 0, sizeof(*t->root_lookup));
  228. /* write the header */
  229. buf[0] = 0xca;
  230. buf[1] = 0x7a;
  231. buf[2] = 0x5f;
  232. buf[3] = 0x75;
  233. /* (these are filled in with correct data at the end) */
  234. /* file offset to root trie entry */
  235. g32(&buf[4], 0);
  236. /* file length when it was created */
  237. g32(&buf[8], 0);
  238. /* fileoffset to the filepath table */
  239. g32(&buf[0xc], 0);
  240. /* count of filepaths */
  241. g32(&buf[0x10], 0);
  242. if (write(t->fd, buf, TRIE_FILE_HDR_SIZE) != TRIE_FILE_HDR_SIZE) {
  243. lwsl_err("%s: trie header write failed\n", __func__);
  244. goto unwind;
  245. }
  246. t->c = TRIE_FILE_HDR_SIZE;
  247. return t;
  248. unwind:
  249. lwsac_free(&lwsac_head);
  250. return NULL;
  251. }
  252. void
  253. lws_fts_destroy(struct lws_fts **trie)
  254. {
  255. struct lwsac *lwsac_head = (*trie)->lwsac_head;
  256. lwsac_free(&(*trie)->lwsac_input_head);
  257. lwsac_free(&lwsac_head);
  258. *trie = NULL;
  259. }
  260. int
  261. lws_fts_file_index(struct lws_fts *t, const char *filepath, int filepath_len,
  262. int priority)
  263. {
  264. struct lws_fts_filepath *fp = t->filepath_list;
  265. #if 0
  266. while (fp) {
  267. if (fp->filepath_len == filepath_len &&
  268. !strcmp(fp->filepath, filepath))
  269. return fp->file_index;
  270. fp = fp->next;
  271. }
  272. #endif
  273. fp = lwsac_use(&t->lwsac_head, sizeof(*fp), TRIE_LWSAC_BLOCK_SIZE);
  274. if (!fp)
  275. return -1;
  276. fp->next = t->filepath_list;
  277. t->filepath_list = fp;
  278. strncpy(fp->filepath, filepath, sizeof(fp->filepath) - 1);
  279. fp->filepath[sizeof(fp->filepath) - 1] = '\0';
  280. fp->filepath_len = filepath_len;
  281. fp->file_index = t->next_file_index++;
  282. fp->line_table_ofs = t->c;
  283. fp->priority = priority;
  284. fp->total_lines = 0;
  285. t->fp = fp;
  286. return fp->file_index;
  287. }
  288. static struct lws_fts_entry *
  289. lws_fts_entry_child_add(struct lws_fts *t, unsigned char c,
  290. struct lws_fts_entry *parent)
  291. {
  292. struct lws_fts_entry *e, **pe;
  293. e = lwsac_use(&t->lwsac_head, sizeof(*e), TRIE_LWSAC_BLOCK_SIZE);
  294. if (!e)
  295. return NULL;
  296. memset(e, 0, sizeof(*e));
  297. e->c = c;
  298. parent->child_count++;
  299. e->parent = parent;
  300. t->count_entries++;
  301. /* keep the parent child list in ascending sort order for c */
  302. pe = &parent->child_list;
  303. while (*pe) {
  304. assert((*pe)->parent == parent);
  305. if ((*pe)->c > c) {
  306. /* add it before */
  307. e->sibling = *pe;
  308. *pe = e;
  309. break;
  310. }
  311. pe = &(*pe)->sibling;
  312. }
  313. if (!*pe) {
  314. /* add it at the end */
  315. e->sibling = NULL;
  316. *pe = e;
  317. }
  318. return e;
  319. }
  320. static int
  321. finalize_per_input(struct lws_fts *t)
  322. {
  323. struct lws_fts_instance_file *tif;
  324. unsigned char buf[8192];
  325. uint64_t lwsac_input_size;
  326. jg2_file_offset temp;
  327. int bp = 0;
  328. bp += g16(&buf[bp], 0);
  329. bp += g16(&buf[bp], 0);
  330. bp += g32(&buf[bp], 0);
  331. if (write(t->fd, buf, bp) != bp)
  332. return 1;
  333. t->c += bp;
  334. bp = 0;
  335. /*
  336. * Write the generated file index + instances (if any)
  337. *
  338. * Notice the next same-parent file instance fileoffset list is
  339. * backwards, so it does not require seeks to fill in. The first
  340. * entry has 0 but the second entry points to the first entry (whose
  341. * fileoffset is known).
  342. *
  343. * After all the file instance structs are finalized,
  344. * .ofs_last_inst_file contains the fileoffset of that child's tif
  345. * list head in the file.
  346. *
  347. * The file instances are written to disk in the order that the files
  348. * were indexed, along with their prev pointers inline.
  349. */
  350. tif = t->tif_list;
  351. while (tif) {
  352. struct lws_fts_lines *i;
  353. spill((3 * MAX_VLI) + tif->count, 0);
  354. temp = tif->owner->ofs_last_inst_file;
  355. if (tif->total)
  356. tif->owner->ofs_last_inst_file = t->c + bp;
  357. assert(!temp || (temp > TRIE_FILE_HDR_SIZE && temp < t->c));
  358. /* fileoffset of prev instance file for this entry, or 0 */
  359. bp += wq32(&buf[bp], temp);
  360. bp += wq32(&buf[bp], tif->file_index);
  361. bp += wq32(&buf[bp], tif->total);
  362. /* remove any pointers into this disposable lac footprint */
  363. tif->owner->inst_file_list = NULL;
  364. memcpy(&buf[bp], &tif->vli, tif->count);
  365. bp += tif->count;
  366. i = tif->lines_list;
  367. while (i) {
  368. spill(i->count, 0);
  369. memcpy(&buf[bp], &i->vli, i->count);
  370. bp += i->count;
  371. i = i->lines_next;
  372. }
  373. tif = tif->inst_file_next;
  374. }
  375. spill(0, 1);
  376. assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
  377. if (t->lwsac_input_head) {
  378. lwsac_input_size = lwsac_total_alloc(t->lwsac_input_head);
  379. if (lwsac_input_size > t->worst_lwsac_input_size)
  380. t->worst_lwsac_input_size = lwsac_input_size;
  381. }
  382. /*
  383. * those per-file allocations are all on a separate lac so we can
  384. * free it cleanly afterwards
  385. */
  386. lwsac_free(&t->lwsac_input_head);
  387. /* and lose the pointer into the deallocated lac */
  388. t->tif_list = NULL;
  389. return 0;
  390. }
  391. /*
  392. * 0 = punctuation, whitespace, brackets etc
  393. * 1 = character inside symbol set
  394. * 2 = upper-case character inside symbol set
  395. */
  396. static char classify[] = {
  397. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  398. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  399. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  400. 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
  401. 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  402. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, //1,
  403. 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  404. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
  405. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  406. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  407. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  408. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  409. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  410. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  411. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  412. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  413. };
  414. #if 0
  415. static const char *
  416. name_entry(struct lws_fts_entry *e1, char *s, int len)
  417. {
  418. struct lws_fts_entry *e2;
  419. int n = len;
  420. s[--n] = '\0';
  421. e2 = e1;
  422. while (e2) {
  423. if (e2->suffix) {
  424. if ((int)e2->suffix_len < n) {
  425. n -= e2->suffix_len;
  426. memcpy(&s[n], e2->suffix, e2->suffix_len);
  427. }
  428. } else {
  429. n--;
  430. s[n] = e2->c;
  431. }
  432. e2 = e2->parent;
  433. }
  434. return &s[n + 1];
  435. }
  436. #endif
  437. /*
  438. * as we parse the input, we create a line length table for the file index.
  439. * Only the file header has been written before we start doing this.
  440. */
  441. int
  442. lws_fts_fill(struct lws_fts *t, uint32_t file_index, const char *buf,
  443. size_t len)
  444. {
  445. unsigned long long tf = lws_now_usecs();
  446. unsigned char c, linetable[256], vlibuf[8];
  447. struct lws_fts_entry *e, *e1, *dcl;
  448. struct lws_fts_instance_file *tif;
  449. int bp = 0, sline, chars, m;
  450. char *osuff, skipline = 0;
  451. struct lws_fts_lines *tl;
  452. unsigned int olen, n;
  453. off_t lbh;
  454. if ((int)file_index != t->last_file_index) {
  455. if (t->last_file_index >= 0)
  456. finalize_per_input(t);
  457. t->last_file_index = file_index;
  458. t->line_number = 1;
  459. t->chars_in_line = 0;
  460. t->lines_in_unsealed_linetable = 0;
  461. }
  462. t->agg_raw_input += len;
  463. resume:
  464. chars = 0;
  465. lbh = t->c;
  466. sline = t->line_number;
  467. bp += g16(&linetable[bp], 0);
  468. bp += g16(&linetable[bp], 0);
  469. bp += g32(&linetable[bp], 0);
  470. while (len) {
  471. char go_around = 0;
  472. if (t->lines_in_unsealed_linetable >= LWS_FTS_LINES_PER_CHUNK)
  473. break;
  474. len--;
  475. c = (unsigned char)*buf++;
  476. t->chars_in_line++;
  477. if (c == '\n') {
  478. skipline = 0;
  479. t->filepath_list->total_lines++;
  480. t->lines_in_unsealed_linetable++;
  481. t->line_number++;
  482. bp += wq32(&linetable[bp], t->chars_in_line);
  483. if ((unsigned int)bp > sizeof(linetable) - 6) {
  484. if (write(t->fd, linetable, bp) != bp) {
  485. lwsl_err("%s: linetable write failed\n",
  486. __func__);
  487. return 1;
  488. }
  489. t->c += bp;
  490. bp = 0;
  491. // assert(lseek(t->fd, 0, SEEK_END) == t->c);
  492. }
  493. chars += t->chars_in_line;
  494. t->chars_in_line = 0;
  495. /*
  496. * Detect overlength lines and skip them (eg, BASE64
  497. * in css etc)
  498. */
  499. if (len > 200) {
  500. n = 0;
  501. m = 0;
  502. while (n < 200 && m < 80 && buf[n] != '\n') {
  503. if (buf[n] == ' ' || buf[n] == '\t')
  504. m = 0;
  505. n++;
  506. m++;
  507. }
  508. /* 80 lines no whitespace, or >=200-char line */
  509. if (m == 80 || n == 200)
  510. skipline = 1;
  511. }
  512. goto seal;
  513. }
  514. if (skipline)
  515. continue;
  516. m = classify[(int)c];
  517. if (!m)
  518. goto seal;
  519. if (m == 2)
  520. c += 'a' - 'A';
  521. if (t->aggregate) {
  522. /*
  523. * We created a trie entry for an earlier char in this
  524. * symbol already. So we know at the moment, any
  525. * further chars in the symbol are the only children.
  526. *
  527. * Aggregate them and add them as a string suffix to
  528. * the trie symbol at the end (when we know how much to
  529. * allocate).
  530. */
  531. if (t->agg_pos < sizeof(t->agg) - 1)
  532. /* symbol is not too long to stash */
  533. t->agg[t->agg_pos++] = c;
  534. continue;
  535. }
  536. if (t->str_match_pos) {
  537. go_around = 1;
  538. goto seal;
  539. }
  540. /* zeroth-iteration child matching */
  541. if (t->parser == t->root) {
  542. e = t->root_lookup[(int)c];
  543. if (e) {
  544. t->parser = e;
  545. continue;
  546. }
  547. } else {
  548. /* look for the char amongst the children */
  549. e = t->parser->child_list;
  550. while (e) {
  551. /* since they're alpha ordered... */
  552. if (e->c > c) {
  553. e = NULL;
  554. break;
  555. }
  556. if (e->c == c) {
  557. t->parser = e;
  558. if (e->suffix)
  559. t->str_match_pos = 1;
  560. break;
  561. }
  562. e = e->sibling;
  563. }
  564. if (e)
  565. continue;
  566. }
  567. /*
  568. * we are blazing a new trail, add a new child representing
  569. * the whole suffix that couldn't be matched until now.
  570. */
  571. e = lws_fts_entry_child_add(t, c, t->parser);
  572. if (!e) {
  573. lwsl_err("%s: lws_fts_entry_child_add failed\n",
  574. __func__);
  575. return 1;
  576. }
  577. /* if it's the root node, keep the root_lookup table in sync */
  578. if (t->parser == t->root)
  579. t->root_lookup[(int)c] = e;
  580. /* follow the new path */
  581. t->parser = e;
  582. {
  583. struct lws_fts_entry **pe = &e->child_list;
  584. while (*pe) {
  585. assert((*pe)->parent == e);
  586. pe = &(*pe)->sibling;
  587. }
  588. }
  589. /*
  590. * If there are any more symbol characters coming, just
  591. * create a suffix string on t->parser instead of what must
  592. * currently be single-child nodes, since we just created e
  593. * as a child with a single character due to no existing match
  594. * on that single character... so if no match on 'h' with this
  595. * guy's parent, we created e that matches on the single char
  596. * 'h'. If the symbol continues ... 'a' 'p' 'p' 'y', then
  597. * instead of creating singleton child nodes under e,
  598. * modify e to match on the whole string suffix "happy".
  599. *
  600. * If later "hoppy" appears, we will remove the suffix on e,
  601. * so it reverts to a char match for 'h', add singleton children
  602. * for 'a' and 'o', and attach a "ppy" suffix child to each of
  603. * those.
  604. *
  605. * We want to do this so we don't have to allocate trie entries
  606. * for every char in the string to save memory and consequently
  607. * time.
  608. *
  609. * Don't try this optimization if the parent is the root node...
  610. * it's not compatible with it's root_lookup table and it's
  611. * highly likely children off the root entry are going to have
  612. * to be fragmented.
  613. */
  614. if (e->parent != t->root) {
  615. t->aggregate = 1;
  616. t->agg_pos = 0;
  617. }
  618. continue;
  619. seal:
  620. if (t->str_match_pos) {
  621. /*
  622. * We're partway through matching an elaborated string
  623. * on a child, not just a character. String matches
  624. * only exist when we met a child entry that only had
  625. * one path until now... so we had an 'h', and the
  626. * only child had a string "hello".
  627. *
  628. * We are following the right path and will not need
  629. * to back up, but we may find as we go we have the
  630. * first instance of a second child path, eg, "help".
  631. *
  632. * When we get to the 'p', we have to split what was
  633. * the only string option "hello" into "hel" and then
  634. * two child entries, for "lo" and 'p'.
  635. */
  636. if (c == t->parser->suffix[t->str_match_pos++]) {
  637. if (t->str_match_pos < t->parser->suffix_len)
  638. continue;
  639. /*
  640. * We simply matched everything, continue
  641. * parsing normally from this trie entry.
  642. */
  643. t->str_match_pos = 0;
  644. continue;
  645. }
  646. /*
  647. * So... we hit a mismatch somewhere... it means we
  648. * have to split this string entry.
  649. *
  650. * We know the first char actually matched in order to
  651. * start down this road. So for the current trie entry,
  652. * we need to truncate his suffix at the char before
  653. * this mismatched one, where we diverged (if the
  654. * second char, simply remove the suffix string from the
  655. * current trie entry to turn it back to a 1-char match)
  656. *
  657. * The original entry, which becomes the lhs post-split,
  658. * is t->parser.
  659. */
  660. olen = t->parser->suffix_len;
  661. osuff = t->parser->suffix;
  662. if (t->str_match_pos == 2)
  663. t->parser->suffix = NULL;
  664. else
  665. t->parser->suffix_len = t->str_match_pos - 1;
  666. /*
  667. * Then we need to create a new child trie entry that
  668. * represents the remainder of the original string
  669. * path that we didn't match. For the "hello" /
  670. * "help" case, this guy will have "lo".
  671. *
  672. * Any instances or children (not siblings...) that were
  673. * attached to the original trie entry must be detached
  674. * first and then migrate to this new guy that completes
  675. * the original string.
  676. */
  677. dcl = t->parser->child_list;
  678. m = t->parser->child_count;
  679. t->parser->child_list = NULL;
  680. t->parser->child_count = 0;
  681. e = lws_fts_entry_child_add(t,
  682. osuff[t->str_match_pos - 1], t->parser);
  683. if (!e) {
  684. lwsl_err("%s: lws_fts_entry_child_add fail1\n",
  685. __func__);
  686. return 1;
  687. }
  688. e->child_list = dcl;
  689. e->child_count = m;
  690. /*
  691. * any children we took over must point to us as the
  692. * parent now they appear on our child list
  693. */
  694. e1 = e->child_list;
  695. while (e1) {
  696. e1->parent = e;
  697. e1 = e1->sibling;
  698. }
  699. /*
  700. * We detached any children, gave them to the new guy
  701. * and replaced them with just our new guy
  702. */
  703. t->parser->child_count = 1;
  704. t->parser->child_list = e;
  705. /*
  706. * any instances that belonged to the original entry we
  707. * are splitting now must be reassigned to the end
  708. * part
  709. */
  710. e->inst_file_list = t->parser->inst_file_list;
  711. if (e->inst_file_list)
  712. e->inst_file_list->owner = e;
  713. t->parser->inst_file_list = NULL;
  714. e->instance_count = t->parser->instance_count;
  715. t->parser->instance_count = 0;
  716. e->ofs_last_inst_file = t->parser->ofs_last_inst_file;
  717. t->parser->ofs_last_inst_file = 0;
  718. if (t->str_match_pos != olen) {
  719. /* we diverged partway */
  720. e->suffix = &osuff[t->str_match_pos - 1];
  721. e->suffix_len = olen - (t->str_match_pos - 1);
  722. }
  723. /*
  724. * if the current char is a terminal, skip creating a
  725. * new way forward.
  726. */
  727. if (classify[(int)c]) {
  728. /*
  729. * Lastly we need to create a new child trie
  730. * entry that represents the new way forward
  731. * from the point that we diverged. For the
  732. * "hello" / "help" case, this guy will start
  733. * as a child of "hel" with the single
  734. * character match 'p'.
  735. *
  736. * Since he becomes the current parser context,
  737. * more symbol characters may be coming to make
  738. * him into, eg, "helping", in which case he
  739. * will acquire a suffix eventually of "ping"
  740. * via the aggregation stuff
  741. */
  742. e = lws_fts_entry_child_add(t, c, t->parser);
  743. if (!e) {
  744. lwsl_err("%s: child_add fail2\n",
  745. __func__);
  746. return 1;
  747. }
  748. }
  749. /* go on following this path */
  750. t->parser = e;
  751. t->aggregate = 1;
  752. t->agg_pos = 0;
  753. t->str_match_pos = 0;
  754. if (go_around)
  755. continue;
  756. /* this is intended to be a seal */
  757. }
  758. /* end of token */
  759. if (t->aggregate && t->agg_pos) {
  760. /* if nothing in agg[]: leave as single char match */
  761. /* otherwise copy out the symbol aggregation */
  762. t->parser->suffix = lwsac_use(&t->lwsac_head,
  763. t->agg_pos + 1,
  764. TRIE_LWSAC_BLOCK_SIZE);
  765. if (!t->parser->suffix) {
  766. lwsl_err("%s: lac for suffix failed\n",
  767. __func__);
  768. return 1;
  769. }
  770. /* add the first char at the beginning */
  771. *t->parser->suffix = t->parser->c;
  772. /* and then add the agg buffer stuff */
  773. memcpy(t->parser->suffix + 1, t->agg, t->agg_pos);
  774. t->parser->suffix_len = t->agg_pos + 1;
  775. }
  776. t->aggregate = 0;
  777. if (t->parser == t->root) /* multiple terminal chars */
  778. continue;
  779. if (!t->parser->inst_file_list ||
  780. t->parser->inst_file_list->file_index != file_index) {
  781. tif = lwsac_use(&t->lwsac_input_head, sizeof(*tif),
  782. TRIE_LWSAC_BLOCK_SIZE);
  783. if (!tif) {
  784. lwsl_err("%s: lac for tif failed\n",
  785. __func__);
  786. return 1;
  787. }
  788. tif->file_index = file_index;
  789. tif->owner = t->parser;
  790. tif->lines_list = NULL;
  791. tif->lines_tail = NULL;
  792. tif->total = 0;
  793. tif->count = 0;
  794. tif->inst_file_next = t->tif_list;
  795. t->tif_list = tif;
  796. t->parser->inst_file_list = tif;
  797. }
  798. /*
  799. * A naive allocation strategy for this leads to 50% of the
  800. * total inmem lac allocation being for line numbers...
  801. *
  802. * It's mainly solved by only holding the instance and line
  803. * number tables for the duration of a file being input, as soon
  804. * as one input file is finished it is written to disk.
  805. *
  806. * For the common case of 1 - ~3 matches the line number are
  807. * stored in a small VLI array inside the filepath inst. If the
  808. * next one won't fit, it allocates a line number struct with
  809. * more vli space and continues chaining those if needed.
  810. */
  811. n = wq32(vlibuf, t->line_number);
  812. tif = t->parser->inst_file_list;
  813. if (!tif->lines_list) {
  814. /* we are still trying to use the file inst vli */
  815. if (LWS_ARRAY_SIZE(tif->vli) - tif->count >= n) {
  816. tif->count += wq32(tif->vli + tif->count,
  817. t->line_number);
  818. goto after;
  819. }
  820. /* we are going to have to allocate */
  821. }
  822. /* can we add to an existing line numbers struct? */
  823. if (tif->lines_tail &&
  824. LWS_ARRAY_SIZE(tif->lines_tail->vli) -
  825. tif->lines_tail->count >= n) {
  826. tif->lines_tail->count += wq32(tif->lines_tail->vli +
  827. tif->lines_tail->count,
  828. t->line_number);
  829. goto after;
  830. }
  831. /* either no existing line numbers struct at tail, or full */
  832. /* have to create a(nother) line numbers struct */
  833. tl = lwsac_use(&t->lwsac_input_head, sizeof(*tl),
  834. TRIE_LWSAC_BLOCK_SIZE);
  835. if (!tl) {
  836. lwsl_err("%s: lac for tl failed\n", __func__);
  837. return 1;
  838. }
  839. tl->lines_next = NULL;
  840. if (tif->lines_tail)
  841. tif->lines_tail->lines_next = tl;
  842. tif->lines_tail = tl;
  843. if (!tif->lines_list)
  844. tif->lines_list = tl;
  845. tl->count = wq32(tl->vli, t->line_number);
  846. after:
  847. tif->total++;
  848. #if 0
  849. {
  850. char s[128];
  851. const char *ne = name_entry(t->parser, s, sizeof(s));
  852. if (!strcmp(ne, "describ")) {
  853. lwsl_err(" %s %d\n", ne, t->str_match_pos);
  854. write(1, buf - 10, 20);
  855. }
  856. }
  857. #endif
  858. t->parser->instance_count++;
  859. t->parser = t->root;
  860. t->str_match_pos = 0;
  861. }
  862. /* seal off the line length table block */
  863. if (bp) {
  864. if (write(t->fd, linetable, bp) != bp)
  865. return 1;
  866. t->c += bp;
  867. bp = 0;
  868. }
  869. if (lseek(t->fd, lbh, SEEK_SET) < 0) {
  870. lwsl_err("%s: seek to 0x%llx failed\n", __func__,
  871. (unsigned long long)lbh);
  872. return 1;
  873. }
  874. g16(linetable, t->c - lbh);
  875. g16(linetable + 2, t->line_number - sline);
  876. g32(linetable + 4, chars);
  877. if (write(t->fd, linetable, 8) != 8) {
  878. lwsl_err("%s: write linetable header failed\n", __func__);
  879. return 1;
  880. }
  881. assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
  882. if (lseek(t->fd, t->c, SEEK_SET) < 0) {
  883. lwsl_err("%s: end seek failed\n", __func__);
  884. return 1;
  885. }
  886. bp = 0;
  887. if (len) {
  888. t->lines_in_unsealed_linetable = 0;
  889. goto resume;
  890. }
  891. /* dump the collected per-input instance and line data, and free it */
  892. t->agg_trie_creation_us += lws_now_usecs() - tf;
  893. return 0;
  894. }
  895. /* refer to ./README.md */
  896. int
  897. lws_fts_serialize(struct lws_fts *t)
  898. {
  899. struct lws_fts_filepath *fp = t->filepath_list, *ofp;
  900. unsigned long long tf = lws_now_usecs();
  901. struct lws_fts_entry *e, *e1, *s[256];
  902. unsigned char buf[8192], stasis;
  903. int n, bp, sp = 0, do_parent;
  904. (void)tf;
  905. finalize_per_input(t);
  906. /*
  907. * Compute aggregated instance counts (parents should know the total
  908. * number of instances below each child path)
  909. *
  910. *
  911. * If we have
  912. *
  913. * (root) -> (c1) -> (c2)
  914. * -> (c3)
  915. *
  916. * we need to visit the nodes in the order
  917. *
  918. * c2, c1, c3, root
  919. */
  920. sp = 0;
  921. s[0] = t->root;
  922. do_parent = 0;
  923. while (sp >= 0) {
  924. int n;
  925. /* aggregate in every antecedent */
  926. for (n = 0; n <= sp; n++) {
  927. s[n]->agg_inst_count += s[sp]->instance_count;
  928. s[n]->agg_child_count += s[sp]->child_count;
  929. }
  930. /* handle any children before the parent */
  931. if (s[sp]->child_list) {
  932. if (sp + 1 == LWS_ARRAY_SIZE(s)) {
  933. lwsl_err("Stack too deep\n");
  934. goto bail;
  935. }
  936. s[sp + 1] = s[sp]->child_list;
  937. sp++;
  938. continue;
  939. }
  940. do {
  941. if (s[sp]->sibling) {
  942. s[sp] = s[sp]->sibling;
  943. break;
  944. } else
  945. sp--;
  946. } while (sp >= 0);
  947. }
  948. /* dump the filepaths and set prev */
  949. fp = t->filepath_list;
  950. ofp = NULL;
  951. bp = 0;
  952. while (fp) {
  953. fp->ofs = t->c + bp;
  954. n = (int)strlen(fp->filepath);
  955. spill(15 + n, 0);
  956. bp += wq32(&buf[bp], fp->line_table_ofs);
  957. bp += wq32(&buf[bp], fp->total_lines);
  958. bp += wq32(&buf[bp], n);
  959. memcpy(&buf[bp], fp->filepath, n);
  960. bp += n;
  961. fp->prev = ofp;
  962. ofp = fp;
  963. fp = fp->next;
  964. }
  965. spill(0, 1);
  966. /* record the fileoffset of the filepath map and filepath count */
  967. if (lseek(t->fd, 0xc, SEEK_SET) < 0)
  968. goto bail_seek;
  969. g32(buf, t->c + bp);
  970. g32(buf + 4, t->next_file_index);
  971. if (write(t->fd, buf, 8) != 8)
  972. goto bail;
  973. if (lseek(t->fd, t->c + bp, SEEK_SET) < 0)
  974. goto bail_seek;
  975. /* dump the filepath map, starting from index 0, which is at the tail */
  976. fp = ofp;
  977. bp = 0;
  978. while (fp) {
  979. spill(5, 0);
  980. g32(buf + bp, fp->ofs);
  981. bp += 4;
  982. fp = fp->prev;
  983. }
  984. spill(0, 1);
  985. /*
  986. * The trie entries in reverse order... because of the reversal, we have
  987. * always written children first, and marked them with their file offset
  988. * before we come to refer to them.
  989. */
  990. bp = 0;
  991. sp = 0;
  992. s[0] = t->root;
  993. do_parent = 0;
  994. while (s[sp]) {
  995. /* handle any children before the parent */
  996. if (!do_parent && s[sp]->child_list) {
  997. if (sp + 1 == LWS_ARRAY_SIZE(s)) {
  998. lwsl_err("Stack too deep\n");
  999. goto bail;
  1000. }
  1001. s[sp + 1] = s[sp]->child_list;
  1002. sp++;
  1003. continue;
  1004. }
  1005. /* leaf nodes with no children */
  1006. e = s[sp];
  1007. e->ofs = t->c + bp;
  1008. /* write the trie entry header */
  1009. spill((3 * MAX_VLI), 0);
  1010. bp += wq32(&buf[bp], e->ofs_last_inst_file);
  1011. bp += wq32(&buf[bp], e->child_count);
  1012. bp += wq32(&buf[bp], e->instance_count);
  1013. bp += wq32(&buf[bp], e->agg_inst_count);
  1014. /* sort the children in order of highest aggregate hits first */
  1015. do {
  1016. struct lws_fts_entry **pe, *te1, *te2;
  1017. stasis = 1;
  1018. /* bubble sort keeps going until nothing changed */
  1019. pe = &e->child_list;
  1020. while (*pe) {
  1021. te1 = *pe;
  1022. te2 = te1->sibling;
  1023. if (te2 && te1->agg_inst_count <
  1024. te2->agg_inst_count) {
  1025. stasis = 0;
  1026. *pe = te2;
  1027. te1->sibling = te2->sibling;
  1028. te2->sibling = te1;
  1029. }
  1030. pe = &(*pe)->sibling;
  1031. }
  1032. } while (!stasis);
  1033. /* write the children */
  1034. e1 = e->child_list;
  1035. while (e1) {
  1036. spill((5 * MAX_VLI) + e1->suffix_len + 1, 0);
  1037. bp += wq32(&buf[bp], e1->ofs);
  1038. bp += wq32(&buf[bp], e1->instance_count);
  1039. bp += wq32(&buf[bp], e1->agg_inst_count);
  1040. bp += wq32(&buf[bp], e1->agg_child_count);
  1041. if (e1->suffix) { /* string */
  1042. bp += wq32(&buf[bp], e1->suffix_len);
  1043. memmove(&buf[bp], e1->suffix, e1->suffix_len);
  1044. bp += e1->suffix_len;
  1045. } else { /* char */
  1046. bp += wq32(&buf[bp], 1);
  1047. buf[bp++] = e1->c;
  1048. }
  1049. #if 0
  1050. if (e1->suffix && e1->suffix_len == 3 &&
  1051. !memcmp(e1->suffix, "cri", 3)) {
  1052. struct lws_fts_entry *e2;
  1053. e2 = e1;
  1054. while (e2){
  1055. if (e2->suffix)
  1056. lwsl_notice("%s\n", e2->suffix);
  1057. else
  1058. lwsl_notice("%c\n", e2->c);
  1059. e2 = e2->parent;
  1060. }
  1061. lwsl_err("*** %c CRI inst %d ch %d\n", e1->parent->c,
  1062. e1->instance_count, e1->child_count);
  1063. }
  1064. #endif
  1065. e1 = e1->sibling;
  1066. }
  1067. /* if there are siblings, do those next */
  1068. if (do_parent) {
  1069. do_parent = 0;
  1070. sp--;
  1071. }
  1072. if (s[sp]->sibling)
  1073. s[sp] = s[sp]->sibling;
  1074. else {
  1075. /* if there are no siblings, do the parent */
  1076. do_parent = 1;
  1077. s[sp] = s[sp]->parent;
  1078. }
  1079. }
  1080. spill(0, 1);
  1081. assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
  1082. /* drop the correct root trie offset + file length into the header */
  1083. if (lseek(t->fd, 4, SEEK_SET) < 0) {
  1084. lwsl_err("%s: unable to seek\n", __func__);
  1085. goto bail;
  1086. }
  1087. g32(buf, t->root->ofs);
  1088. g32(buf + 4, t->c);
  1089. if (write(t->fd, buf, 0x8) != 0x8)
  1090. goto bail;
  1091. lwsl_notice("%s: index %d files (%uMiB) cpu time %dms, "
  1092. "alloc: %dKiB + %dKiB, "
  1093. "serialize: %dms, file: %dKiB\n", __func__,
  1094. t->next_file_index,
  1095. (int)(t->agg_raw_input / (1024 * 1024)),
  1096. (int)(t->agg_trie_creation_us / 1000),
  1097. (int)(lwsac_total_alloc(t->lwsac_head) / 1024),
  1098. (int)(t->worst_lwsac_input_size / 1024),
  1099. (int)((lws_now_usecs() - tf) / 1000),
  1100. (int)(t->c / 1024));
  1101. return 0;
  1102. bail_seek:
  1103. lwsl_err("%s: problem seekings\n", __func__);
  1104. bail:
  1105. return 1;
  1106. }