You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1373 lines
31 KiB

/*
* libwebsockets - small server side websockets and web server implementation
*
* Copyright (C) 2010 - 2019 Andy Green <andy@warmcat.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* The functions allow
*
* - collecting a concordance of strings from one or more files (eg, a
* directory of files) into a single in-memory, lac-backed trie;
*
* - to optimize and serialize the in-memory trie to an fd;
*
* - to very quickly report any instances of a string in any of the files
* indexed by the trie, by seeking around a serialized trie fd, without
* having to load it all into memory
*/
#include "private-lib-core.h"
#include "private-lib-misc-fts.h"
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/types.h>
struct lws_fts_entry;
/*
 * One record per indexed input file.
 *
 * Notice lws_fts_file_index() allocates these from t->lwsac_head (NOT the
 * per-input t->lwsac_input_head): lws_fts_serialize() still walks the list
 * after finalize_per_input() has freed the per-input allocations.
 */
struct lws_fts_filepath {
/* next-older record; lws_fts_file_index() pushes new records on the head */
struct lws_fts_filepath *next;
/* only filled in by lws_fts_serialize(), so it can walk oldest-first */
struct lws_fts_filepath *prev;
/* NUL-terminated copy of the path, truncated to 255 chars if longer */
char filepath[256];
/* file offset of this filepath's serialized record, set at serialize time */
jg2_file_offset ofs;
/* output file length (t->c) at the time this file was registered */
jg2_file_offset line_table_ofs;
/* length as given by the caller (not adjusted if the copy was truncated) */
int filepath_len;
/* sequential index handed out from t->next_file_index */
int file_index;
/* count of '\n' seen by lws_fts_fill() for this file */
int total_lines;
/* caller-supplied value, stored as-is */
int priority;
};
/*
 * Overflow storage for VLI-coded line numbers belonging to one
 * lws_fts_instance_file, chained as a singly-linked list.
 *
 * notice these are stored in t->lwsac_input_head which has input file scope
 */
struct lws_fts_lines {
/* next overflow chunk, or NULL at the tail */
struct lws_fts_lines *lines_next;
/*
* amount of line numbers needs to meet average count for best
* efficiency.
*
* Line numbers are stored in VLI format since if we don't, around half
* the total lac allocation consists of struct lws_fts_lines...
* size chosen to maintain 8-byte struct alignment
*/
uint8_t vli[119];
/* number of bytes of vli[] currently in use */
char count;
};
/*
 * this represents the instances of a symbol inside a given filepath
 *
 * These are allocated by lws_fts_fill() from t->lwsac_input_head and written
 * out / discarded by finalize_per_input().
 */
struct lws_fts_instance_file {
/* linked-list of tifs generated for current file */
struct lws_fts_instance_file *inst_file_next;
/* the trie entry (symbol) these instances belong to */
struct lws_fts_entry *owner;
/* overflow line-number chunks, used once the inline vli[] is full */
struct lws_fts_lines *lines_list, *lines_tail;
/* index of the input file the instances were found in */
uint32_t file_index;
/* number of instances recorded (inline + overflow) */
uint32_t total;
/*
* optimization for the common case there's only 1 - ~3 matches, so we
* don't have to allocate any lws_fts_lines struct
*
* Using 8 bytes total for this maintains 8-byte struct alignment...
*/
uint8_t vli[7];
/* number of bytes of the inline vli[] currently in use */
char count;
};
/*
* this is the main trie in-memory allocation object
*
* Each entry matches either the single character .c, or, when .suffix is
* non-NULL, the .suffix_len characters at .suffix.
*/
struct lws_fts_entry {
/* entry one level up; the root entry's parent is NULL */
struct lws_fts_entry *parent;
/* head of this entry's children, linked through .sibling */
struct lws_fts_entry *child_list;
/* next entry sharing the same parent */
struct lws_fts_entry *sibling;
/*
* care... this points to content in t->lwsac_input_head, it goes
* out of scope when the input file being indexed completes
*/
struct lws_fts_instance_file *inst_file_list;
/* file offset of the most recently written instance-file record, or 0 */
jg2_file_offset ofs_last_inst_file;
char *suffix; /* suffix string or NULL if one char (in .c) */
/* file offset this entry was serialized at (set by lws_fts_serialize()) */
jg2_file_offset ofs;
/* number of entries on child_list */
uint32_t child_count;
/* number of times this exact symbol was seen */
uint32_t instance_count;
/* instance_count summed over this entry and everything below it */
uint32_t agg_inst_count;
/* child_count summed over this entry and everything below it */
uint32_t agg_child_count;
/* length of .suffix; only meaningful while .suffix is non-NULL */
uint32_t suffix_len;
/* the (lower-cased) character this entry matches on */
unsigned char c;
};
/* there's only one of these per trie file */
struct lws_fts {
/* allocations living as long as the trie: this struct, entries, filepaths, suffixes */
struct lwsac *lwsac_head;
/* per-input-file allocations (tifs, line chunks); freed by finalize_per_input() */
struct lwsac *lwsac_input_head;
/* root trie entry */
struct lws_fts_entry *root;
/* list of indexed files, most recently registered first */
struct lws_fts_filepath *filepath_list;
/* the filepath most recently registered by lws_fts_file_index() */
struct lws_fts_filepath *fp;
/* current position in the trie while parsing a symbol; root between symbols */
struct lws_fts_entry *parser;
/* direct lookup of the root's child for a given first character */
struct lws_fts_entry *root_lookup[256];
/*
* head of linked-list of tifs generated for current file
* care... this points to content in t->lwsac_input_head
*/
struct lws_fts_instance_file *tif_list;
jg2_file_offset c; /* length of output file so far */
/* accumulated time spent inside lws_fts_fill(), in us */
uint64_t agg_trie_creation_us;
/* total number of input bytes passed to lws_fts_fill() */
uint64_t agg_raw_input;
/* largest per-input lac footprint seen at finalize time */
uint64_t worst_lwsac_input_size;
/* file index given to the previous lws_fts_fill() call, -1 before the first */
int last_file_index;
/* characters seen so far on the current input line */
int chars_in_line;
/* NOTE(review): not referenced by the code visible in this file */
jg2_file_offset last_block_len_ofs;
/* current 1-based line number in the current input file */
int line_number;
/* lines added to the line table chunk currently being built */
int lines_in_unsealed_linetable;
/* file index the next lws_fts_file_index() call will hand out */
int next_file_index;
/* number of trie entries created by lws_fts_entry_child_add() */
int count_entries;
/* output fd the serialized trie is written to */
int fd;
/* number of characters currently stashed in agg[] */
unsigned int agg_pos;
/* index into parser->suffix being matched; 0 when not matching a suffix */
unsigned int str_match_pos;
/* nonzero while the rest of the current symbol is being collected in agg[] */
unsigned char aggregate;
/* staging for the suffix characters of a newly created entry */
unsigned char agg[128];
};
/*
 * Chunk size passed to every lwsac_use() call in this file.
 *
 * since the kernel case allocates >300MB, no point keeping this too low
 */
#define TRIE_LWSAC_BLOCK_SIZE (1024 * 1024)
/*
 * spill(margin, force): flush the caller's staging buffer to the trie fd.
 *
 * The macro relies on these names being in scope at the use site:
 *
 *   t   - struct lws_fts *, whose ->fd is written and ->c advanced
 *   buf - the staging buffer; must be a true array since sizeof(buf) is used
 *   bp  - int, number of bytes currently staged in buf
 *
 * If anything is staged and either `force` is nonzero or `margin` bytes or
 * fewer remain free in buf, buf[0..bp) is written out, t->c grows by bp and
 * bp is reset to 0.
 *
 * Care: if the write fails or is short, this logs and makes the ENCLOSING
 * FUNCTION return 1.
 *
 * It is wrapped in do { } while (0) so that `spill(x, y);` is exactly one
 * statement and stays safe inside unbraced if / else bodies.
 */
#define spill(margin, force) \
	do { \
		if (bp && ((uint32_t)bp >= (sizeof(buf) - (margin)) || \
			   (force))) { \
			if (write(t->fd, buf, bp) != bp) { \
				lwsl_err("%s: write %d failed (%d)\n", \
					 __func__, bp, errno); \
				return 1; \
			} \
			t->c += bp; \
			bp = 0; \
		} \
	} while (0)
/*
 * Store the 32-bit value d at b in big-endian byte order.
 *
 * Returns the number of bytes written, which is always 4.
 */
static int
g32(unsigned char *b, uint32_t d)
{
	int shift, i = 0;

	/* most significant byte first */
	for (shift = 24; shift >= 0; shift -= 8)
		b[i++] = (unsigned char)((d >> shift) & 0xff);

	return i;
}
/*
 * Store the low 16 bits of d at b in big-endian byte order.
 *
 * Returns the number of bytes written, which is always 2.
 */
static int
g16(unsigned char *b, int d)
{
	b[0] = (unsigned char)((d >> 8) & 0xff);
	b[1] = (unsigned char)(d & 0xff);

	return 2;
}
/*
 * Write d at b as a VLI: 7 bits per byte, most significant group first,
 * with bit 7 set on every byte except the last.  Leading groups that would
 * not be needed to represent d are omitted, so 1 - 5 bytes are produced.
 *
 * Returns the number of bytes written.
 */
static int
wq32(unsigned char *b, uint32_t d)
{
	unsigned char *p = b;
	int shift;

	/* emit each continuation group only if d reaches that far up */
	for (shift = 28; shift > 0; shift -= 7)
		if (d >> shift)
			*p++ = (unsigned char)((d >> shift) | 0x80);

	/* final group has the continuation bit clear */
	*p++ = (unsigned char)(d & 0x7f);

	return (int)(p - b);
}
/*
 * Decode a VLI starting at b into *d.
 *
 * Each byte contributes its low 7 bits, most significant group first; bit 7
 * set means another byte follows.  At most 5 bytes are consumed: the fifth
 * byte always ends the value, whatever its bit 7 says.
 *
 * Returns the number of bytes consumed (1 - 5).
 */
int
rq32(unsigned char *b, uint32_t *d)
{
	unsigned char *p = b;
	uint32_t acc = 0;
	int remaining = 5;
	unsigned char v;

	for (;;) {
		v = *p++;
		acc = (acc << 7) | (uint32_t)(v & 0x7f);
		/* stop on a terminating byte, or after the fifth byte */
		if (!(v & 0x80) || !--remaining)
			break;
	}

	*d = acc;

	return (int)(p - b);
}
/*
 * Create an empty in-memory trie that will be serialized to fd.
 *
 * The trie object and its root entry are allocated from a fresh lwsac, and a
 * placeholder file header (magic + four zeroed 32-bit fields) is written to
 * fd; lws_fts_serialize() seeks back and fills the fields in at the end.
 *
 * fd: output file descriptor, expected to be positioned at offset 0
 *
 * Returns the new trie, or NULL if allocation or the header write failed (in
 * which case anything allocated here has been freed again).
 */
struct lws_fts *
lws_fts_create(int fd)
{
	struct lws_fts *t;
	struct lwsac *lwsac_head = NULL;
	unsigned char buf[TRIE_FILE_HDR_SIZE];

	t = lwsac_use(&lwsac_head, sizeof(*t), TRIE_LWSAC_BLOCK_SIZE);
	if (!t)
		return NULL;

	memset(t, 0, sizeof(*t));

	t->fd = fd;
	t->lwsac_head = lwsac_head;

	t->root = lwsac_use(&lwsac_head, sizeof(*t->root),
			    TRIE_LWSAC_BLOCK_SIZE);
	if (!t->root)
		goto unwind;

	memset(t->root, 0, sizeof(*t->root));

	t->parser = t->root;
	t->last_file_index = -1;
	t->line_number = 1;
	t->filepath_list = NULL;

	/* clear the whole lookup table, not just its first element */
	memset(t->root_lookup, 0, sizeof(t->root_lookup));

	/* write the header */

	buf[0] = 0xca;
	buf[1] = 0x7a;
	buf[2] = 0x5f;
	buf[3] = 0x75;

	/* (these are filled in with correct data at the end) */

	/* file offset to root trie entry */
	g32(&buf[4], 0);
	/* file length when it was created */
	g32(&buf[8], 0);
	/* fileoffset to the filepath table */
	g32(&buf[0xc], 0);
	/* count of filepaths */
	g32(&buf[0x10], 0);

	if (write(t->fd, buf, TRIE_FILE_HDR_SIZE) != TRIE_FILE_HDR_SIZE) {
		lwsl_err("%s: trie header write failed\n", __func__);
		goto unwind;
	}

	t->c = TRIE_FILE_HDR_SIZE;

	return t;

unwind:
	/* t itself lives inside this lac, so this releases everything */
	lwsac_free(&lwsac_head);

	return NULL;
}
/*
 * Free everything belonging to the in-memory trie and NULL the caller's
 * pointer.  The output fd is not touched.
 *
 * trie: address of the trie pointer; passing NULL, or the address of a NULL
 *       pointer, is a harmless no-op
 */
void
lws_fts_destroy(struct lws_fts **trie)
{
	struct lwsac *lwsac_head;

	if (!trie || !*trie)
		return;

	/*
	 * the trie struct itself was allocated inside lwsac_head, so take a
	 * copy of the head pointer before that memory goes away
	 */
	lwsac_head = (*trie)->lwsac_head;

	lwsac_free(&(*trie)->lwsac_input_head);
	lwsac_free(&lwsac_head);

	*trie = NULL;
}
/*
 * Register a new input file with the trie and return its file index.
 *
 * A filepath record is allocated from the trie's long-lived lac, filled in
 * and pushed on the head of t->filepath_list; it also becomes t->fp.  No
 * check is made for the same path having been registered before.
 *
 * filepath:     NUL-terminated path; at most 255 chars of it are kept
 * filepath_len: stored in the record as given
 * priority:     stored in the record as given
 *
 * Returns the index assigned to the file, or -1 if allocation failed.
 */
int
lws_fts_file_index(struct lws_fts *t, const char *filepath, int filepath_len,
		   int priority)
{
	struct lws_fts_filepath *rec;

	rec = lwsac_use(&t->lwsac_head, sizeof(*rec), TRIE_LWSAC_BLOCK_SIZE);
	if (!rec)
		return -1;

	/* bounded copy that is always NUL-terminated */
	strncpy(rec->filepath, filepath, sizeof(rec->filepath) - 1);
	rec->filepath[sizeof(rec->filepath) - 1] = '\0';
	rec->filepath_len = filepath_len;

	rec->priority = priority;
	rec->total_lines = 0;
	/* remember how long the output was when this file started */
	rec->line_table_ofs = t->c;
	rec->file_index = t->next_file_index++;

	/* newest file goes on the front of the list */
	rec->next = t->filepath_list;
	t->filepath_list = rec;
	t->fp = rec;

	return rec->file_index;
}
/*
 * Allocate a zeroed trie entry matching character c and link it in as a
 * child of parent, keeping the parent's child list in ascending order of c.
 * Among children with an equal c, the new entry goes after the existing ones.
 *
 * Also bumps parent->child_count and t->count_entries.
 *
 * Returns the new entry, or NULL if the allocation failed.
 */
static struct lws_fts_entry *
lws_fts_entry_child_add(struct lws_fts *t, unsigned char c,
			struct lws_fts_entry *parent)
{
	struct lws_fts_entry *fresh, **link;

	fresh = lwsac_use(&t->lwsac_head, sizeof(*fresh),
			  TRIE_LWSAC_BLOCK_SIZE);
	if (!fresh)
		return NULL;

	memset(fresh, 0, sizeof(*fresh));

	fresh->c = c;
	fresh->parent = parent;
	parent->child_count++;
	t->count_entries++;

	/* find the first child sorting after c, or the end of the list */
	for (link = &parent->child_list; *link; link = &(*link)->sibling) {
		assert((*link)->parent == parent);
		if ((*link)->c > c)
			break;
	}

	/* splice in front of whatever we stopped at (NULL at the end) */
	fresh->sibling = *link;
	*link = fresh;

	return fresh;
}
/*
 * Flush everything collected for the input file that just finished.
 *
 * Writes an all-zero 8-byte block header, then one record per instance-file
 * (tif) on t->tif_list, then frees the per-input lac (t->lwsac_input_head)
 * and clears t->tif_list.
 *
 * Returns 0 on success, or 1 if a write to t->fd failed or was short.
 */
static int
finalize_per_input(struct lws_fts *t)
{
struct lws_fts_instance_file *tif;
unsigned char buf[8192];
uint64_t lwsac_input_size;
jg2_file_offset temp;
int bp = 0;
/*
 * 8 zero bytes, same layout as the line table block header that
 * lws_fts_fill() writes (16-bit, 16-bit, 32-bit).
 * NOTE(review): presumably this marks the end of the file's line table
 * blocks for the reader -- confirm against the query side.
 */
bp += g16(&buf[bp], 0);
bp += g16(&buf[bp], 0);
bp += g32(&buf[bp], 0);
if (write(t->fd, buf, bp) != bp)
return 1;
t->c += bp;
bp = 0;
/*
* Write the generated file index + instances (if any)
*
* Notice the next same-parent file instance fileoffset list is
* backwards, so it does not require seeks to fill in. The first
* entry has 0 but the second entry points to the first entry (whose
* fileoffset is known).
*
* After all the file instance structs are finalized,
* .ofs_last_inst_file contains the fileoffset of that child's tif
* list head in the file.
*
* The file instances are written to disk in the order that the files
* were indexed, along with their prev pointers inline.
*/
tif = t->tif_list;
while (tif) {
struct lws_fts_lines *i;
/* make sure the three VLIs + the inline line numbers will fit */
spill((3 * MAX_VLI) + tif->count, 0);
temp = tif->owner->ofs_last_inst_file;
/* t->c + bp is where this record is about to land in the file */
if (tif->total)
tif->owner->ofs_last_inst_file = t->c + bp;
assert(!temp || (temp > TRIE_FILE_HDR_SIZE && temp < t->c));
/* fileoffset of prev instance file for this entry, or 0 */
bp += wq32(&buf[bp], temp);
bp += wq32(&buf[bp], tif->file_index);
bp += wq32(&buf[bp], tif->total);
/* remove any pointers into this disposable lac footprint */
tif->owner->inst_file_list = NULL;
/* the line numbers held inline in the tif come first... */
memcpy(&buf[bp], &tif->vli, tif->count);
bp += tif->count;
/* ...followed by any overflow chunks, in the order they were added */
i = tif->lines_list;
while (i) {
spill(i->count, 0);
memcpy(&buf[bp], &i->vli, i->count);
bp += i->count;
i = i->lines_next;
}
tif = tif->inst_file_next;
}
/* force out whatever is still staged */
spill(0, 1);
/* sanity: our idea of the file length should match the fd's */
assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
/* track the largest per-input lac footprint, for the final stats */
if (t->lwsac_input_head) {
lwsac_input_size = lwsac_total_alloc(t->lwsac_input_head);
if (lwsac_input_size > t->worst_lwsac_input_size)
t->worst_lwsac_input_size = lwsac_input_size;
}
/*
* those per-file allocations are all on a separate lac so we can
* free it cleanly afterwards
*/
lwsac_free(&t->lwsac_input_head);
/* and lose the pointer into the deallocated lac */
t->tif_list = NULL;
return 0;
}
/*
 * Character class for every possible input byte, indexed by the byte value:
 *
 * 0 = punctuation, whitespace, brackets etc (terminates a symbol)
 * 1 = character inside symbol set
 * 2 = upper-case character inside symbol set (folded to lower-case on input)
 *
 * Symbol characters are the digits '0' - '9', the letters, '_', and every
 * byte >= 0x80.
 */
static const char classify[] = {
	/* 0x00 - 0x2f: controls, space and punctuation */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x30: all ten digits '0' - '9', then : ; < = > ? */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
	/* 0x40: @ then 'A' - 'O' */
	0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0x50: 'P' - 'Z', then [ \ ] ^ and '_' */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1,
	/* 0x60: ` then 'a' - 'o' */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70: 'p' - 'z', then { | } ~ DEL */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
	/* 0x80 - 0xff: treated as symbol characters */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
#if 0
/*
 * Debug-only helper (compiled out): reconstruct the symbol text that trie
 * entry e1 represents by walking up the parent chain and filling the buffer
 * s (of size len) from its end backwards.
 *
 * NOTE(review): the result is returned as &s[n + 1] although the last write
 * went to s[n]; check which is intended before re-enabling this.
 */
static const char *
name_entry(struct lws_fts_entry *e1, char *s, int len)
{
struct lws_fts_entry *e2;
int n = len;
/* terminate at the very end of the buffer and build leftwards */
s[--n] = '\0';
e2 = e1;
while (e2) {
if (e2->suffix) {
/* a suffix that would not fit is silently left out */
if ((int)e2->suffix_len < n) {
n -= e2->suffix_len;
memcpy(&s[n], e2->suffix, e2->suffix_len);
}
} else {
n--;
s[n] = e2->c;
}
e2 = e2->parent;
}
return &s[n + 1];
}
#endif
/*
 * Feed the next len bytes of input file file_index into the trie.
 *
 * as we parse the input, we create a line length table for the file index.
 * Only the file header has been written before we start doing this.
 *
 * When file_index differs from the one given on the previous call, the
 * previous file's instance data is first flushed via finalize_per_input()
 * and the per-file line state is reset.
 *
 * The line length table is written in blocks: an 8-byte header (16-bit block
 * length, 16-bit line count, 32-bit character count) followed by one VLI per
 * line.  A block is sealed, and its header backfilled, when
 * LWS_FTS_LINES_PER_CHUNK lines have gone into it or the input runs out.
 *
 * Symbols are runs of characters with a nonzero classify[] entry; upper-case
 * letters are folded to lower-case before they are added to the trie.
 *
 * t:          the trie being built
 * file_index: index previously returned by lws_fts_file_index()
 * buf, len:   the chunk of file content to index
 *
 * Returns 0 on success, or 1 on allocation or file write / seek failure.
 */
int
lws_fts_fill(struct lws_fts *t, uint32_t file_index, const char *buf,
	     size_t len)
{
	unsigned long long tf = lws_now_usecs();
	unsigned char c, linetable[256], vlibuf[8];
	struct lws_fts_entry *e, *e1, *dcl;
	struct lws_fts_instance_file *tif;
	int bp = 0, sline, chars, m;
	char *osuff, skipline = 0;
	struct lws_fts_lines *tl;
	unsigned int olen, n;
	off_t lbh;

	if ((int)file_index != t->last_file_index) {
		/*
		 * A new input file is starting: the previous one's instance
		 * data must reach the output first.  If that fails the output
		 * is incomplete, so report it rather than carrying on.
		 */
		if (t->last_file_index >= 0 && finalize_per_input(t)) {
			lwsl_err("%s: finalize_per_input failed\n", __func__);
			return 1;
		}
		t->last_file_index = file_index;
		t->line_number = 1;
		t->chars_in_line = 0;
		t->lines_in_unsealed_linetable = 0;
	}

	t->agg_raw_input += len;

resume:

	chars = 0;
	/* remember where this block's header goes, to backfill it at seal */
	lbh = t->c;
	sline = t->line_number;
	bp += g16(&linetable[bp], 0);
	bp += g16(&linetable[bp], 0);
	bp += g32(&linetable[bp], 0);

	while (len) {
		char go_around = 0;

		if (t->lines_in_unsealed_linetable >= LWS_FTS_LINES_PER_CHUNK)
			break;

		len--;

		c = (unsigned char)*buf++;
		t->chars_in_line++;
		if (c == '\n') {
			skipline = 0;
			t->filepath_list->total_lines++;
			t->lines_in_unsealed_linetable++;
			t->line_number++;

			bp += wq32(&linetable[bp], t->chars_in_line);
			/* keep room for at least one more VLI */
			if ((unsigned int)bp > sizeof(linetable) - 6) {
				if (write(t->fd, linetable, bp) != bp) {
					lwsl_err("%s: linetable write failed\n",
						 __func__);
					return 1;
				}
				t->c += bp;
				bp = 0;
			}

			chars += t->chars_in_line;
			t->chars_in_line = 0;

			/*
			 * Detect overlength lines and skip them (eg, BASE64
			 * in css etc)
			 */

			if (len > 200) {
				n = 0;
				m = 0;
				while (n < 200 && m < 80 && buf[n] != '\n') {
					if (buf[n] == ' ' || buf[n] == '\t')
						m = 0;
					n++;
					m++;
				}

				/* 80 chars no whitespace, or >=200-char line */

				if (m == 80 || n == 200)
					skipline = 1;
			}

			goto seal;
		}
		if (skipline)
			continue;

		m = classify[(int)c];
		if (!m)
			goto seal;
		if (m == 2)
			c += 'a' - 'A';

		if (t->aggregate) {

			/*
			 * We created a trie entry for an earlier char in this
			 * symbol already. So we know at the moment, any
			 * further chars in the symbol are the only children.
			 *
			 * Aggregate them and add them as a string suffix to
			 * the trie symbol at the end (when we know how much to
			 * allocate).
			 */

			if (t->agg_pos < sizeof(t->agg) - 1)
				/* symbol is not too long to stash */
				t->agg[t->agg_pos++] = c;

			continue;
		}

		if (t->str_match_pos) {
			go_around = 1;
			goto seal;
		}

		/* zeroth-iteration child matching */

		if (t->parser == t->root) {
			e = t->root_lookup[(int)c];
			if (e) {
				t->parser = e;
				continue;
			}
		} else {

			/* look for the char amongst the children */

			e = t->parser->child_list;
			while (e) {

				/* since they're alpha ordered... */
				if (e->c > c) {
					e = NULL;
					break;
				}
				if (e->c == c) {
					t->parser = e;

					if (e->suffix)
						t->str_match_pos = 1;

					break;
				}

				e = e->sibling;
			}

			if (e)
				continue;
		}

		/*
		 * we are blazing a new trail, add a new child representing
		 * the whole suffix that couldn't be matched until now.
		 */

		e = lws_fts_entry_child_add(t, c, t->parser);
		if (!e) {
			lwsl_err("%s: lws_fts_entry_child_add failed\n",
				 __func__);
			return 1;
		}

		/* if it's the root node, keep the root_lookup table in sync */

		if (t->parser == t->root)
			t->root_lookup[(int)c] = e;

		/* follow the new path */
		t->parser = e;

		/* sanity check only: any children must point back at e */
		{
			struct lws_fts_entry **pe = &e->child_list;

			while (*pe) {
				assert((*pe)->parent == e);

				pe = &(*pe)->sibling;
			}
		}

		/*
		 * If there are any more symbol characters coming, just
		 * create a suffix string on t->parser instead of what must
		 * currently be single-child nodes, since we just created e
		 * as a child with a single character due to no existing match
		 * on that single character... so if no match on 'h' with this
		 * guy's parent, we created e that matches on the single char
		 * 'h'.  If the symbol continues ... 'a' 'p' 'p' 'y', then
		 * instead of creating singleton child nodes under e,
		 * modify e to match on the whole string suffix "happy".
		 *
		 * If later "hoppy" appears, we will remove the suffix on e,
		 * so it reverts to a char match for 'h', add singleton children
		 * for 'a' and 'o', and attach a "ppy" suffix child to each of
		 * those.
		 *
		 * We want to do this so we don't have to allocate trie entries
		 * for every char in the string to save memory and consequently
		 * time.
		 *
		 * Don't try this optimization if the parent is the root node...
		 * it's not compatible with it's root_lookup table and it's
		 * highly likely children off the root entry are going to have
		 * to be fragmented.
		 */

		if (e->parent != t->root) {
			t->aggregate = 1;
			t->agg_pos = 0;
		}

		continue;

seal:
		if (t->str_match_pos) {

			/*
			 * We're partway through matching an elaborated string
			 * on a child, not just a character.  String matches
			 * only exist when we met a child entry that only had
			 * one path until now... so we had an 'h', and the
			 * only child had a string "hello".
			 *
			 * We are following the right path and will not need
			 * to back up, but we may find as we go we have the
			 * first instance of a second child path, eg, "help".
			 *
			 * When we get to the 'p', we have to split what was
			 * the only string option "hello" into "hel" and then
			 * two child entries, for "lo" and 'p'.
			 */

			if (c == t->parser->suffix[t->str_match_pos++]) {
				if (t->str_match_pos < t->parser->suffix_len)
					continue;

				/*
				 * We simply matched everything, continue
				 * parsing normally from this trie entry.
				 */

				t->str_match_pos = 0;
				continue;
			}

			/*
			 * So... we hit a mismatch somewhere... it means we
			 * have to split this string entry.
			 *
			 * We know the first char actually matched in order to
			 * start down this road.  So for the current trie entry,
			 * we need to truncate his suffix at the char before
			 * this mismatched one, where we diverged (if the
			 * second char, simply remove the suffix string from the
			 * current trie entry to turn it back to a 1-char match)
			 *
			 * The original entry, which becomes the lhs post-split,
			 * is t->parser.
			 */

			olen = t->parser->suffix_len;
			osuff = t->parser->suffix;

			if (t->str_match_pos == 2)
				t->parser->suffix = NULL;
			else
				t->parser->suffix_len = t->str_match_pos - 1;

			/*
			 * Then we need to create a new child trie entry that
			 * represents the remainder of the original string
			 * path that we didn't match.  For the "hello" /
			 * "help" case, this guy will have "lo".
			 *
			 * Any instances or children (not siblings...) that were
			 * attached to the original trie entry must be detached
			 * first and then migrate to this new guy that completes
			 * the original string.
			 */

			dcl = t->parser->child_list;
			m = t->parser->child_count;

			t->parser->child_list = NULL;
			t->parser->child_count = 0;

			e = lws_fts_entry_child_add(t,
					osuff[t->str_match_pos - 1], t->parser);
			if (!e) {
				lwsl_err("%s: lws_fts_entry_child_add fail1\n",
					 __func__);
				return 1;
			}

			e->child_list = dcl;
			e->child_count = m;
			/*
			 * any children we took over must point to us as the
			 * parent now they appear on our child list
			 */
			e1 = e->child_list;
			while (e1) {
				e1->parent = e;
				e1 = e1->sibling;
			}

			/*
			 * We detached any children, gave them to the new guy
			 * and replaced them with just our new guy
			 */
			t->parser->child_count = 1;
			t->parser->child_list = e;

			/*
			 * any instances that belonged to the original entry we
			 * are splitting now must be reassigned to the end
			 * part
			 */

			e->inst_file_list = t->parser->inst_file_list;
			if (e->inst_file_list)
				e->inst_file_list->owner = e;
			t->parser->inst_file_list = NULL;
			e->instance_count = t->parser->instance_count;
			t->parser->instance_count = 0;

			e->ofs_last_inst_file = t->parser->ofs_last_inst_file;
			t->parser->ofs_last_inst_file = 0;

			if (t->str_match_pos != olen) {
				/* we diverged partway */
				e->suffix = &osuff[t->str_match_pos - 1];
				e->suffix_len = olen - (t->str_match_pos - 1);
			}

			/*
			 * if the current char is a terminal, skip creating a
			 * new way forward.
			 */

			if (classify[(int)c]) {

				/*
				 * Lastly we need to create a new child trie
				 * entry that represents the new way forward
				 * from the point that we diverged.  For the
				 * "hello" / "help" case, this guy will start
				 * as a child of "hel" with the single
				 * character match 'p'.
				 *
				 * Since he becomes the current parser context,
				 * more symbol characters may be coming to make
				 * him into, eg, "helping", in which case he
				 * will acquire a suffix eventually of "ping"
				 * via the aggregation stuff
				 */

				e = lws_fts_entry_child_add(t, c, t->parser);
				if (!e) {
					lwsl_err("%s: child_add fail2\n",
						 __func__);
					return 1;
				}
			}

			/* go on following this path */
			t->parser = e;

			t->aggregate = 1;
			t->agg_pos = 0;

			t->str_match_pos = 0;

			if (go_around)
				continue;

			/* this is intended to be a seal */
		}

		/* end of token */

		if (t->aggregate && t->agg_pos) {

			/* if nothing in agg[]: leave as single char match */

			/* otherwise copy out the symbol aggregation */
			t->parser->suffix = lwsac_use(&t->lwsac_head,
						      t->agg_pos + 1,
						      TRIE_LWSAC_BLOCK_SIZE);
			if (!t->parser->suffix) {
				lwsl_err("%s: lac for suffix failed\n",
					 __func__);
				return 1;
			}

			/* add the first char at the beginning */
			*t->parser->suffix = t->parser->c;
			/* and then add the agg buffer stuff */
			memcpy(t->parser->suffix + 1, t->agg, t->agg_pos);
			t->parser->suffix_len = t->agg_pos + 1;
		}
		t->aggregate = 0;

		if (t->parser == t->root) /* multiple terminal chars */
			continue;

		if (!t->parser->inst_file_list ||
		    t->parser->inst_file_list->file_index != file_index) {
			tif = lwsac_use(&t->lwsac_input_head, sizeof(*tif),
					TRIE_LWSAC_BLOCK_SIZE);
			if (!tif) {
				lwsl_err("%s: lac for tif failed\n",
					 __func__);
				return 1;
			}

			tif->file_index = file_index;
			tif->owner = t->parser;
			tif->lines_list = NULL;
			tif->lines_tail = NULL;
			tif->total = 0;
			tif->count = 0;
			tif->inst_file_next = t->tif_list;
			t->tif_list = tif;

			t->parser->inst_file_list = tif;
		}

		/*
		 * A naive allocation strategy for this leads to 50% of the
		 * total inmem lac allocation being for line numbers...
		 *
		 * It's mainly solved by only holding the instance and line
		 * number tables for the duration of a file being input, as soon
		 * as one input file is finished it is written to disk.
		 *
		 * For the common case of 1 - ~3 matches the line number are
		 * stored in a small VLI array inside the filepath inst.  If the
		 * next one won't fit, it allocates a line number struct with
		 * more vli space and continues chaining those if needed.
		 */

		/* trial encode, just to learn how many bytes are needed */
		n = wq32(vlibuf, t->line_number);
		tif = t->parser->inst_file_list;

		if (!tif->lines_list) {
			/* we are still trying to use the file inst vli */
			if (LWS_ARRAY_SIZE(tif->vli) - tif->count >= n) {
				tif->count += wq32(tif->vli + tif->count,
						   t->line_number);
				goto after;
			}
			/* we are going to have to allocate */
		}

		/* can we add to an existing line numbers struct? */
		if (tif->lines_tail &&
		    LWS_ARRAY_SIZE(tif->lines_tail->vli) -
				tif->lines_tail->count >= n) {
			tif->lines_tail->count += wq32(tif->lines_tail->vli +
						       tif->lines_tail->count,
						       t->line_number);
			goto after;
		}

		/* either no existing line numbers struct at tail, or full */

		/* have to create a(nother) line numbers struct */
		tl = lwsac_use(&t->lwsac_input_head, sizeof(*tl),
			       TRIE_LWSAC_BLOCK_SIZE);
		if (!tl) {
			lwsl_err("%s: lac for tl failed\n", __func__);
			return 1;
		}
		tl->lines_next = NULL;
		if (tif->lines_tail)
			tif->lines_tail->lines_next = tl;

		tif->lines_tail = tl;
		if (!tif->lines_list)
			tif->lines_list = tl;

		tl->count = wq32(tl->vli, t->line_number);
after:
		tif->total++;
		t->parser->instance_count++;
		t->parser = t->root;
		t->str_match_pos = 0;
	}

	/* seal off the line length table block */

	if (bp) {
		if (write(t->fd, linetable, bp) != bp)
			return 1;
		t->c += bp;
		bp = 0;
	}

	/* go back and fill in the block header now the totals are known */

	if (lseek(t->fd, lbh, SEEK_SET) < 0) {
		lwsl_err("%s: seek to 0x%llx failed\n", __func__,
			 (unsigned long long)lbh);
		return 1;
	}

	g16(linetable, t->c - lbh);
	g16(linetable + 2, t->line_number - sline);
	g32(linetable + 4, chars);
	if (write(t->fd, linetable, 8) != 8) {
		lwsl_err("%s: write linetable header failed\n", __func__);
		return 1;
	}

	assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);

	/* and return to the end of the output for whatever is written next */

	if (lseek(t->fd, t->c, SEEK_SET) < 0) {
		lwsl_err("%s: end seek failed\n", __func__);
		return 1;
	}

	bp = 0;

	/* input left over means we stopped for a full block: start another */
	if (len) {
		t->lines_in_unsealed_linetable = 0;
		goto resume;
	}

	t->agg_trie_creation_us += lws_now_usecs() - tf;

	return 0;
}
/*
 * Finish the index: write everything still held in memory to t->fd and
 * backfill the file header.
 *
 * refer to ./README.md
 *
 * In order, this
 *
 *  - flushes the last input file's instance data;
 *  - computes aggregated instance and child counts for every trie entry;
 *  - writes one record per filepath, then a table of 32-bit offsets to those
 *    records in file-index order, recording the table's offset and the
 *    filepath count in the header at 0xc;
 *  - writes every trie entry children-first, so each child's file offset is
 *    known by the time its parent refers to it;
 *  - records the root entry offset and total file length in the header at 4.
 *
 * Returns 0 on success, or 1 on any write / seek failure or if the trie is
 * deeper than the fixed traversal stack.
 */
int
lws_fts_serialize(struct lws_fts *t)
{
	struct lws_fts_filepath *fp = t->filepath_list, *ofp;
	unsigned long long tf = lws_now_usecs();
	struct lws_fts_entry *e, *e1, *s[256];
	unsigned char buf[8192], stasis;
	int n, bp, sp = 0, do_parent;

	(void)tf;

	/* the last input's instances must be on disk before the trie is */
	if (finalize_per_input(t)) {
		lwsl_err("%s: finalize_per_input failed\n", __func__);
		goto bail;
	}

	/*
	 * Compute aggregated instance counts (parents should know the total
	 * number of instances below each child path)
	 *
	 *
	 * If we have
	 *
	 * (root) -> (c1) -> (c2)
	 *        -> (c3)
	 *
	 * we need to visit the nodes in the order
	 *
	 * c2, c1, c3, root
	 */

	sp = 0;
	s[0] = t->root;
	do_parent = 0;
	while (sp >= 0) {

		/* aggregate in every antecedent */

		for (n = 0; n <= sp; n++) {
			s[n]->agg_inst_count += s[sp]->instance_count;
			s[n]->agg_child_count += s[sp]->child_count;
		}

		/* handle any children before the parent */

		if (s[sp]->child_list) {
			if (sp + 1 == LWS_ARRAY_SIZE(s)) {
				lwsl_err("Stack too deep\n");

				goto bail;
			}

			s[sp + 1] = s[sp]->child_list;
			sp++;
			continue;
		}

		/* back up until there is an unvisited sibling to move to */
		do {
			if (s[sp]->sibling) {
				s[sp] = s[sp]->sibling;
				break;
			} else
				sp--;
		} while (sp >= 0);
	}

	/* dump the filepaths and set prev */

	fp = t->filepath_list;
	ofp = NULL;
	bp = 0;
	while (fp) {

		fp->ofs = t->c + bp;
		n = (int)strlen(fp->filepath);
		spill(15 + n, 0);

		bp += wq32(&buf[bp], fp->line_table_ofs);
		bp += wq32(&buf[bp], fp->total_lines);
		bp += wq32(&buf[bp], n);
		memcpy(&buf[bp], fp->filepath, n);
		bp += n;

		fp->prev = ofp;
		ofp = fp;
		fp = fp->next;
	}

	spill(0, 1);

	/* record the fileoffset of the filepath map and filepath count */

	if (lseek(t->fd, 0xc, SEEK_SET) < 0)
		goto bail_seek;

	g32(buf, t->c + bp);
	g32(buf + 4, t->next_file_index);
	if (write(t->fd, buf, 8) != 8)
		goto bail;

	if (lseek(t->fd, t->c + bp, SEEK_SET) < 0)
		goto bail_seek;

	/* dump the filepath map, starting from index 0, which is at the tail */

	fp = ofp;
	bp = 0;
	while (fp) {
		spill(5, 0);
		g32(buf + bp, fp->ofs);
		bp += 4;
		fp = fp->prev;
	}
	spill(0, 1);

	/*
	 * The trie entries in reverse order... because of the reversal, we have
	 * always written children first, and marked them with their file offset
	 * before we come to refer to them.
	 */

	bp = 0;
	sp = 0;
	s[0] = t->root;
	do_parent = 0;
	while (s[sp]) {

		/* handle any children before the parent */

		if (!do_parent && s[sp]->child_list) {

			if (sp + 1 == LWS_ARRAY_SIZE(s)) {
				lwsl_err("Stack too deep\n");

				goto bail;
			}

			s[sp + 1] = s[sp]->child_list;
			sp++;
			continue;
		}

		/* leaf nodes with no children */

		e = s[sp];
		e->ofs = t->c + bp;

		/*
		 * write the trie entry header: four VLIs follow, so make sure
		 * there is room for all four before starting
		 */

		spill((4 * MAX_VLI), 0);

		bp += wq32(&buf[bp], e->ofs_last_inst_file);
		bp += wq32(&buf[bp], e->child_count);
		bp += wq32(&buf[bp], e->instance_count);
		bp += wq32(&buf[bp], e->agg_inst_count);

		/* sort the children in order of highest aggregate hits first */

		do {
			struct lws_fts_entry **pe, *te1, *te2;

			stasis = 1;

			/* bubble sort keeps going until nothing changed */

			pe = &e->child_list;
			while (*pe) {

				te1 = *pe;
				te2 = te1->sibling;

				if (te2 && te1->agg_inst_count <
					   te2->agg_inst_count) {
					stasis = 0;

					*pe = te2;
					te1->sibling = te2->sibling;
					te2->sibling = te1;
				}

				pe = &(*pe)->sibling;
			}

		} while (!stasis);

		/* write the children */

		e1 = e->child_list;
		while (e1) {
			spill((5 * MAX_VLI) + e1->suffix_len + 1, 0);

			bp += wq32(&buf[bp], e1->ofs);
			bp += wq32(&buf[bp], e1->instance_count);
			bp += wq32(&buf[bp], e1->agg_inst_count);
			bp += wq32(&buf[bp], e1->agg_child_count);

			if (e1->suffix) { /* string */
				bp += wq32(&buf[bp], e1->suffix_len);
				memmove(&buf[bp], e1->suffix, e1->suffix_len);
				bp += e1->suffix_len;
			} else { /* char */
				bp += wq32(&buf[bp], 1);
				buf[bp++] = e1->c;
			}

			e1 = e1->sibling;
		}

		/* if there are siblings, do those next */

		if (do_parent) {
			/* we just wrote a parent: drop back to its own level */
			do_parent = 0;
			sp--;
		}

		if (s[sp]->sibling)
			s[sp] = s[sp]->sibling;
		else {
			/* if there are no siblings, do the parent */
			do_parent = 1;
			s[sp] = s[sp]->parent;
		}
	}

	spill(0, 1);

	assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);

	/* drop the correct root trie offset + file length into the header */

	if (lseek(t->fd, 4, SEEK_SET) < 0) {
		lwsl_err("%s: unable to seek\n", __func__);

		goto bail;
	}

	g32(buf, t->root->ofs);
	g32(buf + 4, t->c);
	if (write(t->fd, buf, 0x8) != 0x8)
		goto bail;

	lwsl_notice("%s: index %d files (%uMiB) cpu time %dms, "
		    "alloc: %dKiB + %dKiB, "
		    "serialize: %dms, file: %dKiB\n", __func__,
		    t->next_file_index,
		    (int)(t->agg_raw_input / (1024 * 1024)),
		    (int)(t->agg_trie_creation_us / 1000),
		    (int)(lwsac_total_alloc(t->lwsac_head) / 1024),
		    (int)(t->worst_lwsac_input_size / 1024),
		    (int)((lws_now_usecs() - tf) / 1000),
		    (int)(t->c / 1024));

	return 0;

bail_seek:
	lwsl_err("%s: problem seekings\n", __func__);

bail:
	return 1;
}