| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372 |
- /*
- * libwebsockets - small server side websockets and web server implementation
- *
- * Copyright (C) 2010 - 2019 Andy Green <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * The functions allow
- *
- * - collecting a concordance of strings from one or more files (eg, a
- * directory of files) into a single in-memory, lac-backed trie;
- *
- * - to optimize and serialize the in-memory trie to an fd;
- *
- * - to very quickly report any instances of a string in any of the files
- * indexed by the trie, by a seeking around a serialized trie fd, without
- * having to load it all in memory
- */
- #include "private-lib-core.h"
- #include "private-lib-misc-fts.h"
- #include <stdio.h>
- #include <string.h>
- #include <assert.h>
- #include <fcntl.h>
- #include <errno.h>
- #include <sys/types.h>
- struct lws_fts_entry;
- /* notice these are stored in t->lwsac_input_head which has input file scope */
- struct lws_fts_filepath {
- struct lws_fts_filepath *next;
- struct lws_fts_filepath *prev;
- char filepath[256];
- jg2_file_offset ofs;
- jg2_file_offset line_table_ofs;
- int filepath_len;
- int file_index;
- int total_lines;
- int priority;
- };
- /* notice these are stored in t->lwsac_input_head which has input file scope */
- struct lws_fts_lines {
- struct lws_fts_lines *lines_next;
- /*
- * amount of line numbers needs to meet average count for best
- * efficiency.
- *
- * Line numbers are stored in VLI format since if we don't, around half
- * the total lac allocation consists of struct lws_fts_lines...
- * size chosen to maintain 8-byte struct alignment
- */
- uint8_t vli[119];
- char count;
- };
- /* this represents the instances of a symbol inside a given filepath */
- struct lws_fts_instance_file {
- /* linked-list of tifs generated for current file */
- struct lws_fts_instance_file *inst_file_next;
- struct lws_fts_entry *owner;
- struct lws_fts_lines *lines_list, *lines_tail;
- uint32_t file_index;
- uint32_t total;
- /*
- * optimization for the common case there's only 1 - ~3 matches, so we
- * don't have to allocate any lws_fts_lines struct
- *
- * Using 8 bytes total for this maintains 8-byte struct alignment...
- */
- uint8_t vli[7];
- char count;
- };
- /*
- * this is the main trie in-memory allocation object
- */
- struct lws_fts_entry {
- struct lws_fts_entry *parent;
- struct lws_fts_entry *child_list;
- struct lws_fts_entry *sibling;
- /*
- * care... this points to content in t->lwsac_input_head, it goes
- * out of scope when the input file being indexed completes
- */
- struct lws_fts_instance_file *inst_file_list;
- jg2_file_offset ofs_last_inst_file;
- char *suffix; /* suffix string or NULL if one char (in .c) */
- jg2_file_offset ofs;
- uint32_t child_count;
- uint32_t instance_count;
- uint32_t agg_inst_count;
- uint32_t agg_child_count;
- uint32_t suffix_len;
- unsigned char c;
- };
- /* there's only one of these per trie file */
- struct lws_fts {
- struct lwsac *lwsac_head;
- struct lwsac *lwsac_input_head;
- struct lws_fts_entry *root;
- struct lws_fts_filepath *filepath_list;
- struct lws_fts_filepath *fp;
- struct lws_fts_entry *parser;
- struct lws_fts_entry *root_lookup[256];
- /*
- * head of linked-list of tifs generated for current file
- * care... this points to content in t->lwsac_input_head
- */
- struct lws_fts_instance_file *tif_list;
- jg2_file_offset c; /* length of output file so far */
- uint64_t agg_trie_creation_us;
- uint64_t agg_raw_input;
- uint64_t worst_lwsac_input_size;
- int last_file_index;
- int chars_in_line;
- jg2_file_offset last_block_len_ofs;
- int line_number;
- int lines_in_unsealed_linetable;
- int next_file_index;
- int count_entries;
- int fd;
- unsigned int agg_pos;
- unsigned int str_match_pos;
- unsigned char aggregate;
- unsigned char agg[128];
- };
- /* since the kernel case allocates >300MB, no point keeping this too low */
- #define TRIE_LWSAC_BLOCK_SIZE (1024 * 1024)
- #define spill(margin, force) \
- if (bp && ((uint32_t)bp >= (sizeof(buf) - (margin)) || (force))) { \
- if (write(t->fd, buf, bp) != bp) { \
- lwsl_err("%s: write %d failed (%d)\n", __func__, \
- bp, errno); \
- return 1; \
- } \
- t->c += bp; \
- bp = 0; \
- }
- static int
- g32(unsigned char *b, uint32_t d)
- {
- *b++ = (d >> 24) & 0xff;
- *b++ = (d >> 16) & 0xff;
- *b++ = (d >> 8) & 0xff;
- *b = d & 0xff;
- return 4;
- }
- static int
- g16(unsigned char *b, int d)
- {
- *b++ = (d >> 8) & 0xff;
- *b = d & 0xff;
- return 2;
- }
- static int
- wq32(unsigned char *b, uint32_t d)
- {
- unsigned char *ob = b;
- if (d > (1 << 28) - 1)
- *b++ = ((d >> 28) | 0x80) & 0xff;
- if (d > (1 << 21) - 1)
- *b++ = ((d >> 21) | 0x80) & 0xff;
- if (d > (1 << 14) - 1)
- *b++ = ((d >> 14) | 0x80) & 0xff;
- if (d > (1 << 7) - 1)
- *b++ = ((d >> 7) | 0x80) & 0xff;
- *b++ = d & 0x7f;
- return (int)(b - ob);
- }
- /* read a VLI, return the number of bytes used */
- int
- rq32(unsigned char *b, uint32_t *d)
- {
- unsigned char *ob = b;
- uint32_t t = 0;
- t = *b & 0x7f;
- if (*(b++) & 0x80) {
- t = (t << 7) | (*b & 0x7f);
- if (*(b++) & 0x80) {
- t = (t << 7) | (*b & 0x7f);
- if (*(b++) & 0x80) {
- t = (t << 7) | (*b & 0x7f);
- if (*(b++) & 0x80) {
- t = (t << 7) | (*b & 0x7f);
- b++;
- }
- }
- }
- }
- *d = t;
- return (int)(b - ob);
- }
- struct lws_fts *
- lws_fts_create(int fd)
- {
- struct lws_fts *t;
- struct lwsac *lwsac_head = NULL;
- unsigned char buf[TRIE_FILE_HDR_SIZE];
- t = lwsac_use(&lwsac_head, sizeof(*t), TRIE_LWSAC_BLOCK_SIZE);
- if (!t)
- return NULL;
- memset(t, 0, sizeof(*t));
- t->fd = fd;
- t->lwsac_head = lwsac_head;
- t->root = lwsac_use(&lwsac_head, sizeof(*t->root),
- TRIE_LWSAC_BLOCK_SIZE);
- if (!t->root)
- goto unwind;
- memset(t->root, 0, sizeof(*t->root));
- t->parser = t->root;
- t->last_file_index = -1;
- t->line_number = 1;
- t->filepath_list = NULL;
- memset(t->root_lookup, 0, sizeof(*t->root_lookup));
- /* write the header */
- buf[0] = 0xca;
- buf[1] = 0x7a;
- buf[2] = 0x5f;
- buf[3] = 0x75;
- /* (these are filled in with correct data at the end) */
- /* file offset to root trie entry */
- g32(&buf[4], 0);
- /* file length when it was created */
- g32(&buf[8], 0);
- /* fileoffset to the filepath table */
- g32(&buf[0xc], 0);
- /* count of filepaths */
- g32(&buf[0x10], 0);
- if (write(t->fd, buf, TRIE_FILE_HDR_SIZE) != TRIE_FILE_HDR_SIZE) {
- lwsl_err("%s: trie header write failed\n", __func__);
- goto unwind;
- }
- t->c = TRIE_FILE_HDR_SIZE;
- return t;
- unwind:
- lwsac_free(&lwsac_head);
- return NULL;
- }
- void
- lws_fts_destroy(struct lws_fts **trie)
- {
- struct lwsac *lwsac_head = (*trie)->lwsac_head;
- lwsac_free(&(*trie)->lwsac_input_head);
- lwsac_free(&lwsac_head);
- *trie = NULL;
- }
- int
- lws_fts_file_index(struct lws_fts *t, const char *filepath, int filepath_len,
- int priority)
- {
- struct lws_fts_filepath *fp = t->filepath_list;
- #if 0
- while (fp) {
- if (fp->filepath_len == filepath_len &&
- !strcmp(fp->filepath, filepath))
- return fp->file_index;
- fp = fp->next;
- }
- #endif
- fp = lwsac_use(&t->lwsac_head, sizeof(*fp), TRIE_LWSAC_BLOCK_SIZE);
- if (!fp)
- return -1;
- fp->next = t->filepath_list;
- t->filepath_list = fp;
- strncpy(fp->filepath, filepath, sizeof(fp->filepath) - 1);
- fp->filepath[sizeof(fp->filepath) - 1] = '\0';
- fp->filepath_len = filepath_len;
- fp->file_index = t->next_file_index++;
- fp->line_table_ofs = t->c;
- fp->priority = priority;
- fp->total_lines = 0;
- t->fp = fp;
- return fp->file_index;
- }
- static struct lws_fts_entry *
- lws_fts_entry_child_add(struct lws_fts *t, unsigned char c,
- struct lws_fts_entry *parent)
- {
- struct lws_fts_entry *e, **pe;
- e = lwsac_use(&t->lwsac_head, sizeof(*e), TRIE_LWSAC_BLOCK_SIZE);
- if (!e)
- return NULL;
- memset(e, 0, sizeof(*e));
- e->c = c;
- parent->child_count++;
- e->parent = parent;
- t->count_entries++;
- /* keep the parent child list in ascending sort order for c */
- pe = &parent->child_list;
- while (*pe) {
- assert((*pe)->parent == parent);
- if ((*pe)->c > c) {
- /* add it before */
- e->sibling = *pe;
- *pe = e;
- break;
- }
- pe = &(*pe)->sibling;
- }
- if (!*pe) {
- /* add it at the end */
- e->sibling = NULL;
- *pe = e;
- }
- return e;
- }
- static int
- finalize_per_input(struct lws_fts *t)
- {
- struct lws_fts_instance_file *tif;
- unsigned char buf[8192];
- uint64_t lwsac_input_size;
- jg2_file_offset temp;
- int bp = 0;
- bp += g16(&buf[bp], 0);
- bp += g16(&buf[bp], 0);
- bp += g32(&buf[bp], 0);
- if (write(t->fd, buf, bp) != bp)
- return 1;
- t->c += bp;
- bp = 0;
- /*
- * Write the generated file index + instances (if any)
- *
- * Notice the next same-parent file instance fileoffset list is
- * backwards, so it does not require seeks to fill in. The first
- * entry has 0 but the second entry points to the first entry (whose
- * fileoffset is known).
- *
- * After all the file instance structs are finalized,
- * .ofs_last_inst_file contains the fileoffset of that child's tif
- * list head in the file.
- *
- * The file instances are written to disk in the order that the files
- * were indexed, along with their prev pointers inline.
- */
- tif = t->tif_list;
- while (tif) {
- struct lws_fts_lines *i;
- spill((3 * MAX_VLI) + tif->count, 0);
- temp = tif->owner->ofs_last_inst_file;
- if (tif->total)
- tif->owner->ofs_last_inst_file = t->c + bp;
- assert(!temp || (temp > TRIE_FILE_HDR_SIZE && temp < t->c));
- /* fileoffset of prev instance file for this entry, or 0 */
- bp += wq32(&buf[bp], temp);
- bp += wq32(&buf[bp], tif->file_index);
- bp += wq32(&buf[bp], tif->total);
- /* remove any pointers into this disposable lac footprint */
- tif->owner->inst_file_list = NULL;
- memcpy(&buf[bp], &tif->vli, tif->count);
- bp += tif->count;
- i = tif->lines_list;
- while (i) {
- spill(i->count, 0);
- memcpy(&buf[bp], &i->vli, i->count);
- bp += i->count;
- i = i->lines_next;
- }
- tif = tif->inst_file_next;
- }
- spill(0, 1);
- assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
- if (t->lwsac_input_head) {
- lwsac_input_size = lwsac_total_alloc(t->lwsac_input_head);
- if (lwsac_input_size > t->worst_lwsac_input_size)
- t->worst_lwsac_input_size = lwsac_input_size;
- }
- /*
- * those per-file allocations are all on a separate lac so we can
- * free it cleanly afterwards
- */
- lwsac_free(&t->lwsac_input_head);
- /* and lose the pointer into the deallocated lac */
- t->tif_list = NULL;
- return 0;
- }
- /*
- * 0 = punctuation, whitespace, brackets etc
- * 1 = character inside symbol set
- * 2 = upper-case character inside symbol set
- */
- static char classify[] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
- 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, //1,
- 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- };
- #if 0
- static const char *
- name_entry(struct lws_fts_entry *e1, char *s, int len)
- {
- struct lws_fts_entry *e2;
- int n = len;
- s[--n] = '\0';
- e2 = e1;
- while (e2) {
- if (e2->suffix) {
- if ((int)e2->suffix_len < n) {
- n -= e2->suffix_len;
- memcpy(&s[n], e2->suffix, e2->suffix_len);
- }
- } else {
- n--;
- s[n] = e2->c;
- }
- e2 = e2->parent;
- }
- return &s[n + 1];
- }
- #endif
- /*
- * as we parse the input, we create a line length table for the file index.
- * Only the file header has been written before we start doing this.
- */
- int
- lws_fts_fill(struct lws_fts *t, uint32_t file_index, const char *buf,
- size_t len)
- {
- unsigned long long tf = lws_now_usecs();
- unsigned char c, linetable[256], vlibuf[8];
- struct lws_fts_entry *e, *e1, *dcl;
- struct lws_fts_instance_file *tif;
- int bp = 0, sline, chars, m;
- char *osuff, skipline = 0;
- struct lws_fts_lines *tl;
- unsigned int olen, n;
- off_t lbh;
- if ((int)file_index != t->last_file_index) {
- if (t->last_file_index >= 0)
- finalize_per_input(t);
- t->last_file_index = file_index;
- t->line_number = 1;
- t->chars_in_line = 0;
- t->lines_in_unsealed_linetable = 0;
- }
- t->agg_raw_input += len;
- resume:
- chars = 0;
- lbh = t->c;
- sline = t->line_number;
- bp += g16(&linetable[bp], 0);
- bp += g16(&linetable[bp], 0);
- bp += g32(&linetable[bp], 0);
- while (len) {
- char go_around = 0;
- if (t->lines_in_unsealed_linetable >= LWS_FTS_LINES_PER_CHUNK)
- break;
- len--;
- c = (unsigned char)*buf++;
- t->chars_in_line++;
- if (c == '\n') {
- skipline = 0;
- t->filepath_list->total_lines++;
- t->lines_in_unsealed_linetable++;
- t->line_number++;
- bp += wq32(&linetable[bp], t->chars_in_line);
- if ((unsigned int)bp > sizeof(linetable) - 6) {
- if (write(t->fd, linetable, bp) != bp) {
- lwsl_err("%s: linetable write failed\n",
- __func__);
- return 1;
- }
- t->c += bp;
- bp = 0;
- // assert(lseek(t->fd, 0, SEEK_END) == t->c);
- }
- chars += t->chars_in_line;
- t->chars_in_line = 0;
- /*
- * Detect overlength lines and skip them (eg, BASE64
- * in css etc)
- */
- if (len > 200) {
- n = 0;
- m = 0;
- while (n < 200 && m < 80 && buf[n] != '\n') {
- if (buf[n] == ' ' || buf[n] == '\t')
- m = 0;
- n++;
- m++;
- }
- /* 80 lines no whitespace, or >=200-char line */
- if (m == 80 || n == 200)
- skipline = 1;
- }
- goto seal;
- }
- if (skipline)
- continue;
- m = classify[(int)c];
- if (!m)
- goto seal;
- if (m == 2)
- c += 'a' - 'A';
- if (t->aggregate) {
- /*
- * We created a trie entry for an earlier char in this
- * symbol already. So we know at the moment, any
- * further chars in the symbol are the only children.
- *
- * Aggregate them and add them as a string suffix to
- * the trie symbol at the end (when we know how much to
- * allocate).
- */
- if (t->agg_pos < sizeof(t->agg) - 1)
- /* symbol is not too long to stash */
- t->agg[t->agg_pos++] = c;
- continue;
- }
- if (t->str_match_pos) {
- go_around = 1;
- goto seal;
- }
- /* zeroth-iteration child matching */
- if (t->parser == t->root) {
- e = t->root_lookup[(int)c];
- if (e) {
- t->parser = e;
- continue;
- }
- } else {
- /* look for the char amongst the children */
- e = t->parser->child_list;
- while (e) {
- /* since they're alpha ordered... */
- if (e->c > c) {
- e = NULL;
- break;
- }
- if (e->c == c) {
- t->parser = e;
- if (e->suffix)
- t->str_match_pos = 1;
- break;
- }
- e = e->sibling;
- }
- if (e)
- continue;
- }
- /*
- * we are blazing a new trail, add a new child representing
- * the whole suffix that couldn't be matched until now.
- */
- e = lws_fts_entry_child_add(t, c, t->parser);
- if (!e) {
- lwsl_err("%s: lws_fts_entry_child_add failed\n",
- __func__);
- return 1;
- }
- /* if it's the root node, keep the root_lookup table in sync */
- if (t->parser == t->root)
- t->root_lookup[(int)c] = e;
- /* follow the new path */
- t->parser = e;
- {
- struct lws_fts_entry **pe = &e->child_list;
- while (*pe) {
- assert((*pe)->parent == e);
- pe = &(*pe)->sibling;
- }
- }
- /*
- * If there are any more symbol characters coming, just
- * create a suffix string on t->parser instead of what must
- * currently be single-child nodes, since we just created e
- * as a child with a single character due to no existing match
- * on that single character... so if no match on 'h' with this
- * guy's parent, we created e that matches on the single char
- * 'h'. If the symbol continues ... 'a' 'p' 'p' 'y', then
- * instead of creating singleton child nodes under e,
- * modify e to match on the whole string suffix "happy".
- *
- * If later "hoppy" appears, we will remove the suffix on e,
- * so it reverts to a char match for 'h', add singleton children
- * for 'a' and 'o', and attach a "ppy" suffix child to each of
- * those.
- *
- * We want to do this so we don't have to allocate trie entries
- * for every char in the string to save memory and consequently
- * time.
- *
- * Don't try this optimization if the parent is the root node...
- * it's not compatible with it's root_lookup table and it's
- * highly likely children off the root entry are going to have
- * to be fragmented.
- */
- if (e->parent != t->root) {
- t->aggregate = 1;
- t->agg_pos = 0;
- }
- continue;
- seal:
- if (t->str_match_pos) {
- /*
- * We're partway through matching an elaborated string
- * on a child, not just a character. String matches
- * only exist when we met a child entry that only had
- * one path until now... so we had an 'h', and the
- * only child had a string "hello".
- *
- * We are following the right path and will not need
- * to back up, but we may find as we go we have the
- * first instance of a second child path, eg, "help".
- *
- * When we get to the 'p', we have to split what was
- * the only string option "hello" into "hel" and then
- * two child entries, for "lo" and 'p'.
- */
- if (c == t->parser->suffix[t->str_match_pos++]) {
- if (t->str_match_pos < t->parser->suffix_len)
- continue;
- /*
- * We simply matched everything, continue
- * parsing normally from this trie entry.
- */
- t->str_match_pos = 0;
- continue;
- }
- /*
- * So... we hit a mismatch somewhere... it means we
- * have to split this string entry.
- *
- * We know the first char actually matched in order to
- * start down this road. So for the current trie entry,
- * we need to truncate his suffix at the char before
- * this mismatched one, where we diverged (if the
- * second char, simply remove the suffix string from the
- * current trie entry to turn it back to a 1-char match)
- *
- * The original entry, which becomes the lhs post-split,
- * is t->parser.
- */
- olen = t->parser->suffix_len;
- osuff = t->parser->suffix;
- if (t->str_match_pos == 2)
- t->parser->suffix = NULL;
- else
- t->parser->suffix_len = t->str_match_pos - 1;
- /*
- * Then we need to create a new child trie entry that
- * represents the remainder of the original string
- * path that we didn't match. For the "hello" /
- * "help" case, this guy will have "lo".
- *
- * Any instances or children (not siblings...) that were
- * attached to the original trie entry must be detached
- * first and then migrate to this new guy that completes
- * the original string.
- */
- dcl = t->parser->child_list;
- m = t->parser->child_count;
- t->parser->child_list = NULL;
- t->parser->child_count = 0;
- e = lws_fts_entry_child_add(t,
- osuff[t->str_match_pos - 1], t->parser);
- if (!e) {
- lwsl_err("%s: lws_fts_entry_child_add fail1\n",
- __func__);
- return 1;
- }
- e->child_list = dcl;
- e->child_count = m;
- /*
- * any children we took over must point to us as the
- * parent now they appear on our child list
- */
- e1 = e->child_list;
- while (e1) {
- e1->parent = e;
- e1 = e1->sibling;
- }
- /*
- * We detached any children, gave them to the new guy
- * and replaced them with just our new guy
- */
- t->parser->child_count = 1;
- t->parser->child_list = e;
- /*
- * any instances that belonged to the original entry we
- * are splitting now must be reassigned to the end
- * part
- */
- e->inst_file_list = t->parser->inst_file_list;
- if (e->inst_file_list)
- e->inst_file_list->owner = e;
- t->parser->inst_file_list = NULL;
- e->instance_count = t->parser->instance_count;
- t->parser->instance_count = 0;
- e->ofs_last_inst_file = t->parser->ofs_last_inst_file;
- t->parser->ofs_last_inst_file = 0;
- if (t->str_match_pos != olen) {
- /* we diverged partway */
- e->suffix = &osuff[t->str_match_pos - 1];
- e->suffix_len = olen - (t->str_match_pos - 1);
- }
- /*
- * if the current char is a terminal, skip creating a
- * new way forward.
- */
- if (classify[(int)c]) {
- /*
- * Lastly we need to create a new child trie
- * entry that represents the new way forward
- * from the point that we diverged. For the
- * "hello" / "help" case, this guy will start
- * as a child of "hel" with the single
- * character match 'p'.
- *
- * Since he becomes the current parser context,
- * more symbol characters may be coming to make
- * him into, eg, "helping", in which case he
- * will acquire a suffix eventually of "ping"
- * via the aggregation stuff
- */
- e = lws_fts_entry_child_add(t, c, t->parser);
- if (!e) {
- lwsl_err("%s: child_add fail2\n",
- __func__);
- return 1;
- }
- }
- /* go on following this path */
- t->parser = e;
- t->aggregate = 1;
- t->agg_pos = 0;
- t->str_match_pos = 0;
- if (go_around)
- continue;
- /* this is intended to be a seal */
- }
- /* end of token */
- if (t->aggregate && t->agg_pos) {
- /* if nothing in agg[]: leave as single char match */
- /* otherwise copy out the symbol aggregation */
- t->parser->suffix = lwsac_use(&t->lwsac_head,
- t->agg_pos + 1,
- TRIE_LWSAC_BLOCK_SIZE);
- if (!t->parser->suffix) {
- lwsl_err("%s: lac for suffix failed\n",
- __func__);
- return 1;
- }
- /* add the first char at the beginning */
- *t->parser->suffix = t->parser->c;
- /* and then add the agg buffer stuff */
- memcpy(t->parser->suffix + 1, t->agg, t->agg_pos);
- t->parser->suffix_len = t->agg_pos + 1;
- }
- t->aggregate = 0;
- if (t->parser == t->root) /* multiple terminal chars */
- continue;
- if (!t->parser->inst_file_list ||
- t->parser->inst_file_list->file_index != file_index) {
- tif = lwsac_use(&t->lwsac_input_head, sizeof(*tif),
- TRIE_LWSAC_BLOCK_SIZE);
- if (!tif) {
- lwsl_err("%s: lac for tif failed\n",
- __func__);
- return 1;
- }
- tif->file_index = file_index;
- tif->owner = t->parser;
- tif->lines_list = NULL;
- tif->lines_tail = NULL;
- tif->total = 0;
- tif->count = 0;
- tif->inst_file_next = t->tif_list;
- t->tif_list = tif;
- t->parser->inst_file_list = tif;
- }
- /*
- * A naive allocation strategy for this leads to 50% of the
- * total inmem lac allocation being for line numbers...
- *
- * It's mainly solved by only holding the instance and line
- * number tables for the duration of a file being input, as soon
- * as one input file is finished it is written to disk.
- *
- * For the common case of 1 - ~3 matches the line number are
- * stored in a small VLI array inside the filepath inst. If the
- * next one won't fit, it allocates a line number struct with
- * more vli space and continues chaining those if needed.
- */
- n = wq32(vlibuf, t->line_number);
- tif = t->parser->inst_file_list;
- if (!tif->lines_list) {
- /* we are still trying to use the file inst vli */
- if (LWS_ARRAY_SIZE(tif->vli) - tif->count >= n) {
- tif->count += wq32(tif->vli + tif->count,
- t->line_number);
- goto after;
- }
- /* we are going to have to allocate */
- }
- /* can we add to an existing line numbers struct? */
- if (tif->lines_tail &&
- LWS_ARRAY_SIZE(tif->lines_tail->vli) -
- tif->lines_tail->count >= n) {
- tif->lines_tail->count += wq32(tif->lines_tail->vli +
- tif->lines_tail->count,
- t->line_number);
- goto after;
- }
- /* either no existing line numbers struct at tail, or full */
- /* have to create a(nother) line numbers struct */
- tl = lwsac_use(&t->lwsac_input_head, sizeof(*tl),
- TRIE_LWSAC_BLOCK_SIZE);
- if (!tl) {
- lwsl_err("%s: lac for tl failed\n", __func__);
- return 1;
- }
- tl->lines_next = NULL;
- if (tif->lines_tail)
- tif->lines_tail->lines_next = tl;
- tif->lines_tail = tl;
- if (!tif->lines_list)
- tif->lines_list = tl;
- tl->count = wq32(tl->vli, t->line_number);
- after:
- tif->total++;
- #if 0
- {
- char s[128];
- const char *ne = name_entry(t->parser, s, sizeof(s));
- if (!strcmp(ne, "describ")) {
- lwsl_err(" %s %d\n", ne, t->str_match_pos);
- write(1, buf - 10, 20);
- }
- }
- #endif
- t->parser->instance_count++;
- t->parser = t->root;
- t->str_match_pos = 0;
- }
- /* seal off the line length table block */
- if (bp) {
- if (write(t->fd, linetable, bp) != bp)
- return 1;
- t->c += bp;
- bp = 0;
- }
- if (lseek(t->fd, lbh, SEEK_SET) < 0) {
- lwsl_err("%s: seek to 0x%llx failed\n", __func__,
- (unsigned long long)lbh);
- return 1;
- }
- g16(linetable, t->c - lbh);
- g16(linetable + 2, t->line_number - sline);
- g32(linetable + 4, chars);
- if (write(t->fd, linetable, 8) != 8) {
- lwsl_err("%s: write linetable header failed\n", __func__);
- return 1;
- }
- assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
- if (lseek(t->fd, t->c, SEEK_SET) < 0) {
- lwsl_err("%s: end seek failed\n", __func__);
- return 1;
- }
- bp = 0;
- if (len) {
- t->lines_in_unsealed_linetable = 0;
- goto resume;
- }
- /* dump the collected per-input instance and line data, and free it */
- t->agg_trie_creation_us += lws_now_usecs() - tf;
- return 0;
- }
- /* refer to ./README.md */
- int
- lws_fts_serialize(struct lws_fts *t)
- {
- struct lws_fts_filepath *fp = t->filepath_list, *ofp;
- unsigned long long tf = lws_now_usecs();
- struct lws_fts_entry *e, *e1, *s[256];
- unsigned char buf[8192], stasis;
- int n, bp, sp = 0, do_parent;
- (void)tf;
- finalize_per_input(t);
- /*
- * Compute aggregated instance counts (parents should know the total
- * number of instances below each child path)
- *
- *
- * If we have
- *
- * (root) -> (c1) -> (c2)
- * -> (c3)
- *
- * we need to visit the nodes in the order
- *
- * c2, c1, c3, root
- */
- sp = 0;
- s[0] = t->root;
- do_parent = 0;
- while (sp >= 0) {
- int n;
- /* aggregate in every antecedent */
- for (n = 0; n <= sp; n++) {
- s[n]->agg_inst_count += s[sp]->instance_count;
- s[n]->agg_child_count += s[sp]->child_count;
- }
- /* handle any children before the parent */
- if (s[sp]->child_list) {
- if (sp + 1 == LWS_ARRAY_SIZE(s)) {
- lwsl_err("Stack too deep\n");
- goto bail;
- }
- s[sp + 1] = s[sp]->child_list;
- sp++;
- continue;
- }
- do {
- if (s[sp]->sibling) {
- s[sp] = s[sp]->sibling;
- break;
- } else
- sp--;
- } while (sp >= 0);
- }
- /* dump the filepaths and set prev */
- fp = t->filepath_list;
- ofp = NULL;
- bp = 0;
- while (fp) {
- fp->ofs = t->c + bp;
- n = (int)strlen(fp->filepath);
- spill(15 + n, 0);
- bp += wq32(&buf[bp], fp->line_table_ofs);
- bp += wq32(&buf[bp], fp->total_lines);
- bp += wq32(&buf[bp], n);
- memcpy(&buf[bp], fp->filepath, n);
- bp += n;
- fp->prev = ofp;
- ofp = fp;
- fp = fp->next;
- }
- spill(0, 1);
- /* record the fileoffset of the filepath map and filepath count */
- if (lseek(t->fd, 0xc, SEEK_SET) < 0)
- goto bail_seek;
- g32(buf, t->c + bp);
- g32(buf + 4, t->next_file_index);
- if (write(t->fd, buf, 8) != 8)
- goto bail;
- if (lseek(t->fd, t->c + bp, SEEK_SET) < 0)
- goto bail_seek;
- /* dump the filepath map, starting from index 0, which is at the tail */
- fp = ofp;
- bp = 0;
- while (fp) {
- spill(5, 0);
- g32(buf + bp, fp->ofs);
- bp += 4;
- fp = fp->prev;
- }
- spill(0, 1);
- /*
- * The trie entries in reverse order... because of the reversal, we have
- * always written children first, and marked them with their file offset
- * before we come to refer to them.
- */
- bp = 0;
- sp = 0;
- s[0] = t->root;
- do_parent = 0;
- while (s[sp]) {
- /* handle any children before the parent */
- if (!do_parent && s[sp]->child_list) {
- if (sp + 1 == LWS_ARRAY_SIZE(s)) {
- lwsl_err("Stack too deep\n");
- goto bail;
- }
- s[sp + 1] = s[sp]->child_list;
- sp++;
- continue;
- }
- /* leaf nodes with no children */
- e = s[sp];
- e->ofs = t->c + bp;
- /* write the trie entry header */
- spill((3 * MAX_VLI), 0);
- bp += wq32(&buf[bp], e->ofs_last_inst_file);
- bp += wq32(&buf[bp], e->child_count);
- bp += wq32(&buf[bp], e->instance_count);
- bp += wq32(&buf[bp], e->agg_inst_count);
- /* sort the children in order of highest aggregate hits first */
- do {
- struct lws_fts_entry **pe, *te1, *te2;
- stasis = 1;
- /* bubble sort keeps going until nothing changed */
- pe = &e->child_list;
- while (*pe) {
- te1 = *pe;
- te2 = te1->sibling;
- if (te2 && te1->agg_inst_count <
- te2->agg_inst_count) {
- stasis = 0;
- *pe = te2;
- te1->sibling = te2->sibling;
- te2->sibling = te1;
- }
- pe = &(*pe)->sibling;
- }
- } while (!stasis);
- /* write the children */
- e1 = e->child_list;
- while (e1) {
- spill((5 * MAX_VLI) + e1->suffix_len + 1, 0);
- bp += wq32(&buf[bp], e1->ofs);
- bp += wq32(&buf[bp], e1->instance_count);
- bp += wq32(&buf[bp], e1->agg_inst_count);
- bp += wq32(&buf[bp], e1->agg_child_count);
- if (e1->suffix) { /* string */
- bp += wq32(&buf[bp], e1->suffix_len);
- memmove(&buf[bp], e1->suffix, e1->suffix_len);
- bp += e1->suffix_len;
- } else { /* char */
- bp += wq32(&buf[bp], 1);
- buf[bp++] = e1->c;
- }
- #if 0
- if (e1->suffix && e1->suffix_len == 3 &&
- !memcmp(e1->suffix, "cri", 3)) {
- struct lws_fts_entry *e2;
- e2 = e1;
- while (e2){
- if (e2->suffix)
- lwsl_notice("%s\n", e2->suffix);
- else
- lwsl_notice("%c\n", e2->c);
- e2 = e2->parent;
- }
- lwsl_err("*** %c CRI inst %d ch %d\n", e1->parent->c,
- e1->instance_count, e1->child_count);
- }
- #endif
- e1 = e1->sibling;
- }
- /* if there are siblings, do those next */
- if (do_parent) {
- do_parent = 0;
- sp--;
- }
- if (s[sp]->sibling)
- s[sp] = s[sp]->sibling;
- else {
- /* if there are no siblings, do the parent */
- do_parent = 1;
- s[sp] = s[sp]->parent;
- }
- }
- spill(0, 1);
- assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
- /* drop the correct root trie offset + file length into the header */
- if (lseek(t->fd, 4, SEEK_SET) < 0) {
- lwsl_err("%s: unable to seek\n", __func__);
- goto bail;
- }
- g32(buf, t->root->ofs);
- g32(buf + 4, t->c);
- if (write(t->fd, buf, 0x8) != 0x8)
- goto bail;
- lwsl_notice("%s: index %d files (%uMiB) cpu time %dms, "
- "alloc: %dKiB + %dKiB, "
- "serialize: %dms, file: %dKiB\n", __func__,
- t->next_file_index,
- (int)(t->agg_raw_input / (1024 * 1024)),
- (int)(t->agg_trie_creation_us / 1000),
- (int)(lwsac_total_alloc(t->lwsac_head) / 1024),
- (int)(t->worst_lwsac_input_size / 1024),
- (int)((lws_now_usecs() - tf) / 1000),
- (int)(t->c / 1024));
- return 0;
- bail_seek:
- lwsl_err("%s: problem seekings\n", __func__);
- bail:
- return 1;
- }
|