You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1373 lines
31 KiB
1373 lines
31 KiB
/*
|
|
* libwebsockets - small server side websockets and web server implementation
|
|
*
|
|
* Copyright (C) 2010 - 2019 Andy Green <andy@warmcat.com>
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
* of this software and associated documentation files (the "Software"), to
|
|
* deal in the Software without restriction, including without limitation the
|
|
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
|
* sell copies of the Software, and to permit persons to whom the Software is
|
|
* furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be included in
|
|
* all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
* IN THE SOFTWARE.
|
|
*
|
|
* The functions allow
|
|
*
|
|
* - collecting a concordance of strings from one or more files (eg, a
|
|
* directory of files) into a single in-memory, lac-backed trie;
|
|
*
|
|
* - to optimize and serialize the in-memory trie to an fd;
|
|
*
|
|
* - to very quickly report any instances of a string in any of the files
|
|
 *   indexed by the trie, by seeking around a serialized trie fd, without
|
|
* having to load it all in memory
|
|
*/
|
|
|
|
#include "private-lib-core.h"
|
|
#include "private-lib-misc-fts.h"
|
|
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <assert.h>
|
|
#include <fcntl.h>
|
|
#include <errno.h>
|
|
#include <sys/types.h>
|
|
|
|
struct lws_fts_entry;
|
|
|
|
/* notice these are stored in t->lwsac_head, so they live as long as the trie */
|
|
|
|
struct lws_fts_filepath {
|
|
struct lws_fts_filepath *next;
|
|
struct lws_fts_filepath *prev;
|
|
char filepath[256];
|
|
jg2_file_offset ofs;
|
|
jg2_file_offset line_table_ofs;
|
|
int filepath_len;
|
|
int file_index;
|
|
int total_lines;
|
|
int priority;
|
|
};
|
|
|
|
/* notice these are stored in t->lwsac_input_head which has input file scope */
|
|
|
|
/*
 * Overflow storage for the line numbers of one symbol in one input file,
 * chained when the small vli[] inside the instance-file struct is full.
 */
struct lws_fts_lines {
	/* next chunk in the chain, or NULL at the tail */
	struct lws_fts_lines *lines_next;

	/*
	 * amount of line numbers needs to meet average count for best
	 * efficiency.
	 *
	 * Line numbers are stored in VLI format since if we don't, around half
	 * the total lac allocation consists of struct lws_fts_lines...
	 * size chosen to maintain 8-byte struct alignment
	 */
	uint8_t vli[119];
	/* number of bytes of vli[] currently in use */
	char count;
};
|
|
|
|
/* this represents the instances of a symbol inside a given filepath */
|
|
|
|
/*
 * All the instances of one symbol (trie entry) found in one input file.
 */
struct lws_fts_instance_file {
	/* linked-list of tifs generated for current file */
	struct lws_fts_instance_file *inst_file_next;
	/* the trie entry these instances belong to */
	struct lws_fts_entry *owner;
	/* chain of overflow line-number chunks, and its last element */
	struct lws_fts_lines *lines_list, *lines_tail;
	/* index of the input file the instances were found in */
	uint32_t file_index;
	/* number of instances recorded so far */
	uint32_t total;

	/*
	 * optimization for the common case there's only 1 - ~3 matches, so we
	 * don't have to allocate any lws_fts_lines struct
	 *
	 * Using 8 bytes total for this maintains 8-byte struct alignment...
	 */

	uint8_t vli[7];
	/* number of bytes of vli[] currently in use */
	char count;
};
|
|
|
|
/*
|
|
* this is the main trie in-memory allocation object
|
|
*/
|
|
|
|
struct lws_fts_entry {
|
|
struct lws_fts_entry *parent;
|
|
|
|
struct lws_fts_entry *child_list;
|
|
struct lws_fts_entry *sibling;
|
|
|
|
/*
|
|
* care... this points to content in t->lwsac_input_head, it goes
|
|
* out of scope when the input file being indexed completes
|
|
*/
|
|
struct lws_fts_instance_file *inst_file_list;
|
|
|
|
jg2_file_offset ofs_last_inst_file;
|
|
|
|
char *suffix; /* suffix string or NULL if one char (in .c) */
|
|
jg2_file_offset ofs;
|
|
uint32_t child_count;
|
|
uint32_t instance_count;
|
|
uint32_t agg_inst_count;
|
|
uint32_t agg_child_count;
|
|
uint32_t suffix_len;
|
|
unsigned char c;
|
|
};
|
|
|
|
/* there's only one of these per trie file */
|
|
|
|
struct lws_fts {
|
|
struct lwsac *lwsac_head;
|
|
struct lwsac *lwsac_input_head;
|
|
struct lws_fts_entry *root;
|
|
struct lws_fts_filepath *filepath_list;
|
|
struct lws_fts_filepath *fp;
|
|
|
|
struct lws_fts_entry *parser;
|
|
struct lws_fts_entry *root_lookup[256];
|
|
|
|
/*
|
|
* head of linked-list of tifs generated for current file
|
|
* care... this points to content in t->lwsac_input_head
|
|
*/
|
|
struct lws_fts_instance_file *tif_list;
|
|
|
|
jg2_file_offset c; /* length of output file so far */
|
|
|
|
uint64_t agg_trie_creation_us;
|
|
uint64_t agg_raw_input;
|
|
uint64_t worst_lwsac_input_size;
|
|
int last_file_index;
|
|
int chars_in_line;
|
|
jg2_file_offset last_block_len_ofs;
|
|
int line_number;
|
|
int lines_in_unsealed_linetable;
|
|
int next_file_index;
|
|
int count_entries;
|
|
|
|
int fd;
|
|
unsigned int agg_pos;
|
|
unsigned int str_match_pos;
|
|
|
|
unsigned char aggregate;
|
|
unsigned char agg[128];
|
|
};
|
|
|
|
/* since the kernel case allocates >300MB, no point keeping this too low */
|
|
|
|
#define TRIE_LWSAC_BLOCK_SIZE (1024 * 1024)

/*
 * spill(margin, force): flush the local staging buffer to the output fd.
 *
 * Expects these names in the calling scope: `t` (struct lws_fts *), `buf`
 * (an array, since sizeof(buf) is used) and `bp` (int count of staged
 * bytes).  Nothing is written while bp is 0.  Otherwise the staged bytes
 * are written when fewer than `margin` bytes of room remain in buf, or
 * when `force` is nonzero; on success t->c advances by bp and bp restarts
 * at 0.
 *
 * NB: on a failed or short write this logs and does `return 1` from the
 * CALLING function, so it may only be used in functions returning int.
 *
 * Wrapped in do { } while (0) so it behaves as one statement, eg, as the
 * body of an if / else.
 */
#define spill(margin, force) \
	do { \
		if (bp && ((uint32_t)bp >= (sizeof(buf) - (margin)) || \
			   (force))) { \
			if (write(t->fd, buf, bp) != bp) { \
				lwsl_err("%s: write %d failed (%d)\n", \
					 __func__, bp, errno); \
				return 1; \
			} \
			t->c += bp; \
			bp = 0; \
		} \
	} while (0)
|
|
|
|
/*
 * Store d at b as four bytes, most-significant byte first.
 *
 * Returns the number of bytes written, always 4.
 */
static int
g32(unsigned char *b, uint32_t d)
{
	int shift;

	for (shift = 24; shift >= 0; shift -= 8)
		*b++ = (unsigned char)(d >> shift);

	return 4;
}
|
|
|
|
/*
 * Store the low 16 bits of d at b as two bytes, most-significant first.
 *
 * Returns the number of bytes written, always 2.
 */
static int
g16(unsigned char *b, int d)
{
	b[0] = (unsigned char)((d >> 8) & 0xff);
	b[1] = (unsigned char)(d & 0xff);

	return 2;
}
|
|
|
|
/*
 * Write d at b as a variable-length integer (VLI): groups of 7 bits,
 * most-significant group first, every byte except the last having its top
 * bit set.  Leading all-zero groups are not emitted.
 *
 * Returns the number of bytes written (1 .. 5).
 */
static int
wq32(unsigned char *b, uint32_t d)
{
	unsigned char *p = b;
	int shift;

	/* a group is needed once anything at or above it is nonzero */
	for (shift = 28; shift > 0; shift -= 7)
		if (d >> shift)
			*p++ = (unsigned char)((d >> shift) | 0x80);

	*p++ = (unsigned char)(d & 0x7f);

	return (int)(p - b);
}
|
|
|
|
|
|
/* read a VLI, return the number of bytes used */
|
|
|
|
/*
 * Read a VLI (7 bits per byte, most-significant group first, top bit set
 * on every byte but the last) from b into *d.
 *
 * At most five bytes are consumed; the fifth byte ends the value whether
 * or not its top bit is set.
 *
 * Returns the number of bytes consumed.
 */
int
rq32(unsigned char *b, uint32_t *d)
{
	unsigned char *p = b;
	uint32_t v = 0;
	int i;

	for (i = 0; i < 5; i++) {
		unsigned char byte = *p++;

		v = (v << 7) | (byte & 0x7f);
		if (!(byte & 0x80))
			break;
	}

	*d = v;

	return (int)(p - b);
}
|
|
|
|
struct lws_fts *
|
|
lws_fts_create(int fd)
|
|
{
|
|
struct lws_fts *t;
|
|
struct lwsac *lwsac_head = NULL;
|
|
unsigned char buf[TRIE_FILE_HDR_SIZE];
|
|
|
|
t = lwsac_use(&lwsac_head, sizeof(*t), TRIE_LWSAC_BLOCK_SIZE);
|
|
if (!t)
|
|
return NULL;
|
|
|
|
memset(t, 0, sizeof(*t));
|
|
|
|
t->fd = fd;
|
|
t->lwsac_head = lwsac_head;
|
|
t->root = lwsac_use(&lwsac_head, sizeof(*t->root),
|
|
TRIE_LWSAC_BLOCK_SIZE);
|
|
if (!t->root)
|
|
goto unwind;
|
|
|
|
memset(t->root, 0, sizeof(*t->root));
|
|
t->parser = t->root;
|
|
t->last_file_index = -1;
|
|
t->line_number = 1;
|
|
t->filepath_list = NULL;
|
|
|
|
memset(t->root_lookup, 0, sizeof(*t->root_lookup));
|
|
|
|
/* write the header */
|
|
|
|
buf[0] = 0xca;
|
|
buf[1] = 0x7a;
|
|
buf[2] = 0x5f;
|
|
buf[3] = 0x75;
|
|
|
|
/* (these are filled in with correct data at the end) */
|
|
|
|
/* file offset to root trie entry */
|
|
g32(&buf[4], 0);
|
|
/* file length when it was created */
|
|
g32(&buf[8], 0);
|
|
/* fileoffset to the filepath table */
|
|
g32(&buf[0xc], 0);
|
|
/* count of filepaths */
|
|
g32(&buf[0x10], 0);
|
|
|
|
if (write(t->fd, buf, TRIE_FILE_HDR_SIZE) != TRIE_FILE_HDR_SIZE) {
|
|
lwsl_err("%s: trie header write failed\n", __func__);
|
|
goto unwind;
|
|
}
|
|
|
|
t->c = TRIE_FILE_HDR_SIZE;
|
|
|
|
return t;
|
|
|
|
unwind:
|
|
lwsac_free(&lwsac_head);
|
|
|
|
return NULL;
|
|
}
|
|
|
|
void
|
|
lws_fts_destroy(struct lws_fts **trie)
|
|
{
|
|
struct lwsac *lwsac_head = (*trie)->lwsac_head;
|
|
lwsac_free(&(*trie)->lwsac_input_head);
|
|
lwsac_free(&lwsac_head);
|
|
*trie = NULL;
|
|
}
|
|
|
|
int
|
|
lws_fts_file_index(struct lws_fts *t, const char *filepath, int filepath_len,
|
|
int priority)
|
|
{
|
|
struct lws_fts_filepath *fp = t->filepath_list;
|
|
#if 0
|
|
while (fp) {
|
|
if (fp->filepath_len == filepath_len &&
|
|
!strcmp(fp->filepath, filepath))
|
|
return fp->file_index;
|
|
|
|
fp = fp->next;
|
|
}
|
|
#endif
|
|
fp = lwsac_use(&t->lwsac_head, sizeof(*fp), TRIE_LWSAC_BLOCK_SIZE);
|
|
if (!fp)
|
|
return -1;
|
|
|
|
fp->next = t->filepath_list;
|
|
t->filepath_list = fp;
|
|
strncpy(fp->filepath, filepath, sizeof(fp->filepath) - 1);
|
|
fp->filepath[sizeof(fp->filepath) - 1] = '\0';
|
|
fp->filepath_len = filepath_len;
|
|
fp->file_index = t->next_file_index++;
|
|
fp->line_table_ofs = t->c;
|
|
fp->priority = priority;
|
|
fp->total_lines = 0;
|
|
t->fp = fp;
|
|
|
|
return fp->file_index;
|
|
}
|
|
|
|
static struct lws_fts_entry *
|
|
lws_fts_entry_child_add(struct lws_fts *t, unsigned char c,
|
|
struct lws_fts_entry *parent)
|
|
{
|
|
struct lws_fts_entry *e, **pe;
|
|
|
|
e = lwsac_use(&t->lwsac_head, sizeof(*e), TRIE_LWSAC_BLOCK_SIZE);
|
|
if (!e)
|
|
return NULL;
|
|
|
|
memset(e, 0, sizeof(*e));
|
|
|
|
e->c = c;
|
|
parent->child_count++;
|
|
e->parent = parent;
|
|
t->count_entries++;
|
|
|
|
/* keep the parent child list in ascending sort order for c */
|
|
|
|
pe = &parent->child_list;
|
|
while (*pe) {
|
|
assert((*pe)->parent == parent);
|
|
if ((*pe)->c > c) {
|
|
/* add it before */
|
|
e->sibling = *pe;
|
|
*pe = e;
|
|
break;
|
|
}
|
|
pe = &(*pe)->sibling;
|
|
}
|
|
|
|
if (!*pe) {
|
|
/* add it at the end */
|
|
e->sibling = NULL;
|
|
*pe = e;
|
|
}
|
|
|
|
return e;
|
|
}
|
|
|
|
static int
|
|
finalize_per_input(struct lws_fts *t)
|
|
{
|
|
struct lws_fts_instance_file *tif;
|
|
unsigned char buf[8192];
|
|
uint64_t lwsac_input_size;
|
|
jg2_file_offset temp;
|
|
int bp = 0;
|
|
|
|
bp += g16(&buf[bp], 0);
|
|
bp += g16(&buf[bp], 0);
|
|
bp += g32(&buf[bp], 0);
|
|
if (write(t->fd, buf, bp) != bp)
|
|
return 1;
|
|
t->c += bp;
|
|
bp = 0;
|
|
|
|
/*
|
|
* Write the generated file index + instances (if any)
|
|
*
|
|
* Notice the next same-parent file instance fileoffset list is
|
|
* backwards, so it does not require seeks to fill in. The first
|
|
* entry has 0 but the second entry points to the first entry (whose
|
|
* fileoffset is known).
|
|
*
|
|
* After all the file instance structs are finalized,
|
|
* .ofs_last_inst_file contains the fileoffset of that child's tif
|
|
* list head in the file.
|
|
*
|
|
* The file instances are written to disk in the order that the files
|
|
* were indexed, along with their prev pointers inline.
|
|
*/
|
|
|
|
tif = t->tif_list;
|
|
while (tif) {
|
|
struct lws_fts_lines *i;
|
|
|
|
spill((3 * MAX_VLI) + tif->count, 0);
|
|
|
|
temp = tif->owner->ofs_last_inst_file;
|
|
if (tif->total)
|
|
tif->owner->ofs_last_inst_file = t->c + bp;
|
|
|
|
assert(!temp || (temp > TRIE_FILE_HDR_SIZE && temp < t->c));
|
|
|
|
/* fileoffset of prev instance file for this entry, or 0 */
|
|
bp += wq32(&buf[bp], temp);
|
|
bp += wq32(&buf[bp], tif->file_index);
|
|
bp += wq32(&buf[bp], tif->total);
|
|
|
|
/* remove any pointers into this disposable lac footprint */
|
|
tif->owner->inst_file_list = NULL;
|
|
|
|
memcpy(&buf[bp], &tif->vli, tif->count);
|
|
bp += tif->count;
|
|
|
|
i = tif->lines_list;
|
|
while (i) {
|
|
spill(i->count, 0);
|
|
memcpy(&buf[bp], &i->vli, i->count);
|
|
bp += i->count;
|
|
|
|
i = i->lines_next;
|
|
}
|
|
|
|
tif = tif->inst_file_next;
|
|
}
|
|
|
|
spill(0, 1);
|
|
|
|
assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
|
|
|
|
if (t->lwsac_input_head) {
|
|
lwsac_input_size = lwsac_total_alloc(t->lwsac_input_head);
|
|
if (lwsac_input_size > t->worst_lwsac_input_size)
|
|
t->worst_lwsac_input_size = lwsac_input_size;
|
|
}
|
|
|
|
/*
|
|
* those per-file allocations are all on a separate lac so we can
|
|
* free it cleanly afterwards
|
|
*/
|
|
lwsac_free(&t->lwsac_input_head);
|
|
|
|
/* and lose the pointer into the deallocated lac */
|
|
t->tif_list = NULL;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* 0 = punctuation, whitespace, brackets etc
|
|
* 1 = character inside symbol set
|
|
* 2 = upper-case character inside symbol set
|
|
*/
|
|
|
|
/*
 * Per-byte classification table, indexed by the unsigned byte value (see
 * the legend above).  '0'..'9', 'a'..'z', '_' and every byte >= 0x80 are
 * class 1, 'A'..'Z' are class 2, everything else is class 0.
 *
 * Row 0x30 carries ten 1s so that '9' is a symbol character like the
 * other digits.
 */
static const char classify[] = {
	/* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
	/* 0x40 */ 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0x50 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1,
	/* 0x60 */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
	/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
|
|
|
|
#if 0
/*
 * Debug helper (compiled out): rebuild the string a trie node stands for
 * by walking from e1 up through its parents, filling s[] backwards from
 * its end.  len is the size of s.
 *
 * The root is walked too and contributes one byte (its .c), which the
 * "+ 1" on the returned pointer steps over.  A suffix that does not fit in
 * the space left is silently skipped; single bytes are stored without a
 * bounds check.
 */
static const char *
name_entry(struct lws_fts_entry *e1, char *s, int len)
{
	struct lws_fts_entry *e;
	int pos = len - 1;

	s[pos] = '\0';

	for (e = e1; e; e = e->parent) {
		if (!e->suffix) {
			s[--pos] = e->c;
			continue;
		}

		if ((int)e->suffix_len < pos) {
			pos -= e->suffix_len;
			memcpy(&s[pos], e->suffix, e->suffix_len);
		}
	}

	return &s[pos + 1];
}
#endif
|
|
|
|
/*
|
|
* as we parse the input, we create a line length table for the file index.
|
|
* Only the file header has been written before we start doing this.
|
|
*/
|
|
|
|
int
|
|
lws_fts_fill(struct lws_fts *t, uint32_t file_index, const char *buf,
|
|
size_t len)
|
|
{
|
|
unsigned long long tf = lws_now_usecs();
|
|
unsigned char c, linetable[256], vlibuf[8];
|
|
struct lws_fts_entry *e, *e1, *dcl;
|
|
struct lws_fts_instance_file *tif;
|
|
int bp = 0, sline, chars, m;
|
|
char *osuff, skipline = 0;
|
|
struct lws_fts_lines *tl;
|
|
unsigned int olen, n;
|
|
off_t lbh;
|
|
|
|
if ((int)file_index != t->last_file_index) {
|
|
if (t->last_file_index >= 0)
|
|
finalize_per_input(t);
|
|
t->last_file_index = file_index;
|
|
t->line_number = 1;
|
|
t->chars_in_line = 0;
|
|
t->lines_in_unsealed_linetable = 0;
|
|
}
|
|
|
|
t->agg_raw_input += len;
|
|
|
|
resume:
|
|
|
|
chars = 0;
|
|
lbh = t->c;
|
|
sline = t->line_number;
|
|
bp += g16(&linetable[bp], 0);
|
|
bp += g16(&linetable[bp], 0);
|
|
bp += g32(&linetable[bp], 0);
|
|
|
|
while (len) {
|
|
char go_around = 0;
|
|
|
|
if (t->lines_in_unsealed_linetable >= LWS_FTS_LINES_PER_CHUNK)
|
|
break;
|
|
|
|
len--;
|
|
|
|
c = (unsigned char)*buf++;
|
|
t->chars_in_line++;
|
|
if (c == '\n') {
|
|
skipline = 0;
|
|
t->filepath_list->total_lines++;
|
|
t->lines_in_unsealed_linetable++;
|
|
t->line_number++;
|
|
|
|
bp += wq32(&linetable[bp], t->chars_in_line);
|
|
if ((unsigned int)bp > sizeof(linetable) - 6) {
|
|
if (write(t->fd, linetable, bp) != bp) {
|
|
lwsl_err("%s: linetable write failed\n",
|
|
__func__);
|
|
return 1;
|
|
}
|
|
t->c += bp;
|
|
bp = 0;
|
|
// assert(lseek(t->fd, 0, SEEK_END) == t->c);
|
|
}
|
|
|
|
chars += t->chars_in_line;
|
|
t->chars_in_line = 0;
|
|
|
|
/*
|
|
* Detect overlength lines and skip them (eg, BASE64
|
|
* in css etc)
|
|
*/
|
|
|
|
if (len > 200) {
|
|
n = 0;
|
|
m = 0;
|
|
while (n < 200 && m < 80 && buf[n] != '\n') {
|
|
if (buf[n] == ' ' || buf[n] == '\t')
|
|
m = 0;
|
|
n++;
|
|
m++;
|
|
}
|
|
|
|
/* 80 lines no whitespace, or >=200-char line */
|
|
|
|
if (m == 80 || n == 200)
|
|
skipline = 1;
|
|
}
|
|
|
|
goto seal;
|
|
}
|
|
if (skipline)
|
|
continue;
|
|
|
|
m = classify[(int)c];
|
|
if (!m)
|
|
goto seal;
|
|
if (m == 2)
|
|
c += 'a' - 'A';
|
|
|
|
if (t->aggregate) {
|
|
|
|
/*
|
|
* We created a trie entry for an earlier char in this
|
|
* symbol already. So we know at the moment, any
|
|
* further chars in the symbol are the only children.
|
|
*
|
|
* Aggregate them and add them as a string suffix to
|
|
* the trie symbol at the end (when we know how much to
|
|
* allocate).
|
|
*/
|
|
|
|
if (t->agg_pos < sizeof(t->agg) - 1)
|
|
/* symbol is not too long to stash */
|
|
t->agg[t->agg_pos++] = c;
|
|
|
|
continue;
|
|
}
|
|
|
|
if (t->str_match_pos) {
|
|
go_around = 1;
|
|
goto seal;
|
|
}
|
|
|
|
/* zeroth-iteration child matching */
|
|
|
|
if (t->parser == t->root) {
|
|
e = t->root_lookup[(int)c];
|
|
if (e) {
|
|
t->parser = e;
|
|
continue;
|
|
}
|
|
} else {
|
|
|
|
/* look for the char amongst the children */
|
|
|
|
e = t->parser->child_list;
|
|
while (e) {
|
|
|
|
/* since they're alpha ordered... */
|
|
if (e->c > c) {
|
|
e = NULL;
|
|
break;
|
|
}
|
|
if (e->c == c) {
|
|
t->parser = e;
|
|
|
|
if (e->suffix)
|
|
t->str_match_pos = 1;
|
|
|
|
break;
|
|
}
|
|
|
|
e = e->sibling;
|
|
}
|
|
|
|
if (e)
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* we are blazing a new trail, add a new child representing
|
|
* the whole suffix that couldn't be matched until now.
|
|
*/
|
|
|
|
e = lws_fts_entry_child_add(t, c, t->parser);
|
|
if (!e) {
|
|
lwsl_err("%s: lws_fts_entry_child_add failed\n",
|
|
__func__);
|
|
return 1;
|
|
}
|
|
|
|
/* if it's the root node, keep the root_lookup table in sync */
|
|
|
|
if (t->parser == t->root)
|
|
t->root_lookup[(int)c] = e;
|
|
|
|
/* follow the new path */
|
|
t->parser = e;
|
|
|
|
{
|
|
struct lws_fts_entry **pe = &e->child_list;
|
|
while (*pe) {
|
|
assert((*pe)->parent == e);
|
|
|
|
pe = &(*pe)->sibling;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* If there are any more symbol characters coming, just
|
|
* create a suffix string on t->parser instead of what must
|
|
* currently be single-child nodes, since we just created e
|
|
* as a child with a single character due to no existing match
|
|
* on that single character... so if no match on 'h' with this
|
|
* guy's parent, we created e that matches on the single char
|
|
* 'h'. If the symbol continues ... 'a' 'p' 'p' 'y', then
|
|
* instead of creating singleton child nodes under e,
|
|
* modify e to match on the whole string suffix "happy".
|
|
*
|
|
* If later "hoppy" appears, we will remove the suffix on e,
|
|
* so it reverts to a char match for 'h', add singleton children
|
|
* for 'a' and 'o', and attach a "ppy" suffix child to each of
|
|
* those.
|
|
*
|
|
* We want to do this so we don't have to allocate trie entries
|
|
* for every char in the string to save memory and consequently
|
|
* time.
|
|
*
|
|
* Don't try this optimization if the parent is the root node...
|
|
* it's not compatible with it's root_lookup table and it's
|
|
* highly likely children off the root entry are going to have
|
|
* to be fragmented.
|
|
*/
|
|
|
|
if (e->parent != t->root) {
|
|
t->aggregate = 1;
|
|
t->agg_pos = 0;
|
|
}
|
|
|
|
continue;
|
|
|
|
seal:
|
|
if (t->str_match_pos) {
|
|
|
|
/*
|
|
* We're partway through matching an elaborated string
|
|
* on a child, not just a character. String matches
|
|
* only exist when we met a child entry that only had
|
|
* one path until now... so we had an 'h', and the
|
|
* only child had a string "hello".
|
|
*
|
|
* We are following the right path and will not need
|
|
* to back up, but we may find as we go we have the
|
|
* first instance of a second child path, eg, "help".
|
|
*
|
|
* When we get to the 'p', we have to split what was
|
|
* the only string option "hello" into "hel" and then
|
|
* two child entries, for "lo" and 'p'.
|
|
*/
|
|
|
|
if (c == t->parser->suffix[t->str_match_pos++]) {
|
|
if (t->str_match_pos < t->parser->suffix_len)
|
|
continue;
|
|
|
|
/*
|
|
* We simply matched everything, continue
|
|
* parsing normally from this trie entry.
|
|
*/
|
|
|
|
t->str_match_pos = 0;
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* So... we hit a mismatch somewhere... it means we
|
|
* have to split this string entry.
|
|
*
|
|
* We know the first char actually matched in order to
|
|
* start down this road. So for the current trie entry,
|
|
* we need to truncate his suffix at the char before
|
|
* this mismatched one, where we diverged (if the
|
|
* second char, simply remove the suffix string from the
|
|
* current trie entry to turn it back to a 1-char match)
|
|
*
|
|
* The original entry, which becomes the lhs post-split,
|
|
* is t->parser.
|
|
*/
|
|
|
|
olen = t->parser->suffix_len;
|
|
osuff = t->parser->suffix;
|
|
|
|
if (t->str_match_pos == 2)
|
|
t->parser->suffix = NULL;
|
|
else
|
|
t->parser->suffix_len = t->str_match_pos - 1;
|
|
|
|
/*
|
|
* Then we need to create a new child trie entry that
|
|
* represents the remainder of the original string
|
|
* path that we didn't match. For the "hello" /
|
|
* "help" case, this guy will have "lo".
|
|
*
|
|
* Any instances or children (not siblings...) that were
|
|
* attached to the original trie entry must be detached
|
|
* first and then migrate to this new guy that completes
|
|
* the original string.
|
|
*/
|
|
|
|
dcl = t->parser->child_list;
|
|
m = t->parser->child_count;
|
|
|
|
t->parser->child_list = NULL;
|
|
t->parser->child_count = 0;
|
|
|
|
e = lws_fts_entry_child_add(t,
|
|
osuff[t->str_match_pos - 1], t->parser);
|
|
if (!e) {
|
|
lwsl_err("%s: lws_fts_entry_child_add fail1\n",
|
|
__func__);
|
|
return 1;
|
|
}
|
|
|
|
e->child_list = dcl;
|
|
e->child_count = m;
|
|
/*
|
|
* any children we took over must point to us as the
|
|
* parent now they appear on our child list
|
|
*/
|
|
e1 = e->child_list;
|
|
while (e1) {
|
|
e1->parent = e;
|
|
e1 = e1->sibling;
|
|
}
|
|
|
|
/*
|
|
* We detached any children, gave them to the new guy
|
|
* and replaced them with just our new guy
|
|
*/
|
|
t->parser->child_count = 1;
|
|
t->parser->child_list = e;
|
|
|
|
/*
|
|
* any instances that belonged to the original entry we
|
|
* are splitting now must be reassigned to the end
|
|
* part
|
|
*/
|
|
|
|
e->inst_file_list = t->parser->inst_file_list;
|
|
if (e->inst_file_list)
|
|
e->inst_file_list->owner = e;
|
|
t->parser->inst_file_list = NULL;
|
|
e->instance_count = t->parser->instance_count;
|
|
t->parser->instance_count = 0;
|
|
|
|
e->ofs_last_inst_file = t->parser->ofs_last_inst_file;
|
|
t->parser->ofs_last_inst_file = 0;
|
|
|
|
if (t->str_match_pos != olen) {
|
|
/* we diverged partway */
|
|
e->suffix = &osuff[t->str_match_pos - 1];
|
|
e->suffix_len = olen - (t->str_match_pos - 1);
|
|
}
|
|
|
|
/*
|
|
* if the current char is a terminal, skip creating a
|
|
* new way forward.
|
|
*/
|
|
|
|
if (classify[(int)c]) {
|
|
|
|
/*
|
|
* Lastly we need to create a new child trie
|
|
* entry that represents the new way forward
|
|
* from the point that we diverged. For the
|
|
* "hello" / "help" case, this guy will start
|
|
* as a child of "hel" with the single
|
|
* character match 'p'.
|
|
*
|
|
* Since he becomes the current parser context,
|
|
* more symbol characters may be coming to make
|
|
* him into, eg, "helping", in which case he
|
|
* will acquire a suffix eventually of "ping"
|
|
* via the aggregation stuff
|
|
*/
|
|
|
|
e = lws_fts_entry_child_add(t, c, t->parser);
|
|
if (!e) {
|
|
lwsl_err("%s: child_add fail2\n",
|
|
__func__);
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
/* go on following this path */
|
|
t->parser = e;
|
|
|
|
t->aggregate = 1;
|
|
t->agg_pos = 0;
|
|
|
|
t->str_match_pos = 0;
|
|
|
|
if (go_around)
|
|
continue;
|
|
|
|
/* this is intended to be a seal */
|
|
}
|
|
|
|
|
|
/* end of token */
|
|
|
|
if (t->aggregate && t->agg_pos) {
|
|
|
|
/* if nothing in agg[]: leave as single char match */
|
|
|
|
/* otherwise copy out the symbol aggregation */
|
|
t->parser->suffix = lwsac_use(&t->lwsac_head,
|
|
t->agg_pos + 1,
|
|
TRIE_LWSAC_BLOCK_SIZE);
|
|
if (!t->parser->suffix) {
|
|
lwsl_err("%s: lac for suffix failed\n",
|
|
__func__);
|
|
return 1;
|
|
}
|
|
|
|
/* add the first char at the beginning */
|
|
*t->parser->suffix = t->parser->c;
|
|
/* and then add the agg buffer stuff */
|
|
memcpy(t->parser->suffix + 1, t->agg, t->agg_pos);
|
|
t->parser->suffix_len = t->agg_pos + 1;
|
|
}
|
|
t->aggregate = 0;
|
|
|
|
if (t->parser == t->root) /* multiple terminal chars */
|
|
continue;
|
|
|
|
if (!t->parser->inst_file_list ||
|
|
t->parser->inst_file_list->file_index != file_index) {
|
|
tif = lwsac_use(&t->lwsac_input_head, sizeof(*tif),
|
|
TRIE_LWSAC_BLOCK_SIZE);
|
|
if (!tif) {
|
|
lwsl_err("%s: lac for tif failed\n",
|
|
__func__);
|
|
return 1;
|
|
}
|
|
|
|
tif->file_index = file_index;
|
|
tif->owner = t->parser;
|
|
tif->lines_list = NULL;
|
|
tif->lines_tail = NULL;
|
|
tif->total = 0;
|
|
tif->count = 0;
|
|
tif->inst_file_next = t->tif_list;
|
|
t->tif_list = tif;
|
|
|
|
t->parser->inst_file_list = tif;
|
|
}
|
|
|
|
/*
|
|
* A naive allocation strategy for this leads to 50% of the
|
|
* total inmem lac allocation being for line numbers...
|
|
*
|
|
* It's mainly solved by only holding the instance and line
|
|
* number tables for the duration of a file being input, as soon
|
|
* as one input file is finished it is written to disk.
|
|
*
|
|
* For the common case of 1 - ~3 matches the line number are
|
|
* stored in a small VLI array inside the filepath inst. If the
|
|
* next one won't fit, it allocates a line number struct with
|
|
* more vli space and continues chaining those if needed.
|
|
*/
|
|
|
|
n = wq32(vlibuf, t->line_number);
|
|
tif = t->parser->inst_file_list;
|
|
|
|
if (!tif->lines_list) {
|
|
/* we are still trying to use the file inst vli */
|
|
if (LWS_ARRAY_SIZE(tif->vli) - tif->count >= n) {
|
|
tif->count += wq32(tif->vli + tif->count,
|
|
t->line_number);
|
|
goto after;
|
|
}
|
|
/* we are going to have to allocate */
|
|
}
|
|
|
|
/* can we add to an existing line numbers struct? */
|
|
if (tif->lines_tail &&
|
|
LWS_ARRAY_SIZE(tif->lines_tail->vli) -
|
|
tif->lines_tail->count >= n) {
|
|
tif->lines_tail->count += wq32(tif->lines_tail->vli +
|
|
tif->lines_tail->count,
|
|
t->line_number);
|
|
goto after;
|
|
}
|
|
|
|
/* either no existing line numbers struct at tail, or full */
|
|
|
|
/* have to create a(nother) line numbers struct */
|
|
tl = lwsac_use(&t->lwsac_input_head, sizeof(*tl),
|
|
TRIE_LWSAC_BLOCK_SIZE);
|
|
if (!tl) {
|
|
lwsl_err("%s: lac for tl failed\n", __func__);
|
|
return 1;
|
|
}
|
|
tl->lines_next = NULL;
|
|
if (tif->lines_tail)
|
|
tif->lines_tail->lines_next = tl;
|
|
|
|
tif->lines_tail = tl;
|
|
if (!tif->lines_list)
|
|
tif->lines_list = tl;
|
|
|
|
tl->count = wq32(tl->vli, t->line_number);
|
|
after:
|
|
tif->total++;
|
|
#if 0
|
|
{
|
|
char s[128];
|
|
const char *ne = name_entry(t->parser, s, sizeof(s));
|
|
|
|
if (!strcmp(ne, "describ")) {
|
|
lwsl_err(" %s %d\n", ne, t->str_match_pos);
|
|
write(1, buf - 10, 20);
|
|
}
|
|
}
|
|
#endif
|
|
t->parser->instance_count++;
|
|
t->parser = t->root;
|
|
t->str_match_pos = 0;
|
|
}
|
|
|
|
/* seal off the line length table block */
|
|
|
|
if (bp) {
|
|
if (write(t->fd, linetable, bp) != bp)
|
|
return 1;
|
|
t->c += bp;
|
|
bp = 0;
|
|
}
|
|
|
|
if (lseek(t->fd, lbh, SEEK_SET) < 0) {
|
|
lwsl_err("%s: seek to 0x%llx failed\n", __func__,
|
|
(unsigned long long)lbh);
|
|
return 1;
|
|
}
|
|
|
|
g16(linetable, t->c - lbh);
|
|
g16(linetable + 2, t->line_number - sline);
|
|
g32(linetable + 4, chars);
|
|
if (write(t->fd, linetable, 8) != 8) {
|
|
lwsl_err("%s: write linetable header failed\n", __func__);
|
|
return 1;
|
|
}
|
|
|
|
assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
|
|
|
|
if (lseek(t->fd, t->c, SEEK_SET) < 0) {
|
|
lwsl_err("%s: end seek failed\n", __func__);
|
|
return 1;
|
|
}
|
|
|
|
bp = 0;
|
|
|
|
if (len) {
|
|
t->lines_in_unsealed_linetable = 0;
|
|
goto resume;
|
|
}
|
|
|
|
/* dump the collected per-input instance and line data, and free it */
|
|
|
|
t->agg_trie_creation_us += lws_now_usecs() - tf;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* refer to ./README.md */
|
|
|
|
int
|
|
lws_fts_serialize(struct lws_fts *t)
|
|
{
|
|
struct lws_fts_filepath *fp = t->filepath_list, *ofp;
|
|
unsigned long long tf = lws_now_usecs();
|
|
struct lws_fts_entry *e, *e1, *s[256];
|
|
unsigned char buf[8192], stasis;
|
|
int n, bp, sp = 0, do_parent;
|
|
|
|
(void)tf;
|
|
finalize_per_input(t);
|
|
|
|
/*
|
|
* Compute aggregated instance counts (parents should know the total
|
|
* number of instances below each child path)
|
|
*
|
|
*
|
|
* If we have
|
|
*
|
|
* (root) -> (c1) -> (c2)
|
|
* -> (c3)
|
|
*
|
|
* we need to visit the nodes in the order
|
|
*
|
|
* c2, c1, c3, root
|
|
*/
|
|
|
|
sp = 0;
|
|
s[0] = t->root;
|
|
do_parent = 0;
|
|
while (sp >= 0) {
|
|
int n;
|
|
|
|
/* aggregate in every antecedent */
|
|
|
|
for (n = 0; n <= sp; n++) {
|
|
s[n]->agg_inst_count += s[sp]->instance_count;
|
|
s[n]->agg_child_count += s[sp]->child_count;
|
|
}
|
|
|
|
/* handle any children before the parent */
|
|
|
|
if (s[sp]->child_list) {
|
|
if (sp + 1 == LWS_ARRAY_SIZE(s)) {
|
|
lwsl_err("Stack too deep\n");
|
|
|
|
goto bail;
|
|
}
|
|
|
|
s[sp + 1] = s[sp]->child_list;
|
|
sp++;
|
|
continue;
|
|
}
|
|
|
|
do {
|
|
if (s[sp]->sibling) {
|
|
s[sp] = s[sp]->sibling;
|
|
break;
|
|
} else
|
|
sp--;
|
|
} while (sp >= 0);
|
|
}
|
|
|
|
/* dump the filepaths and set prev */
|
|
|
|
fp = t->filepath_list;
|
|
ofp = NULL;
|
|
bp = 0;
|
|
while (fp) {
|
|
|
|
fp->ofs = t->c + bp;
|
|
n = (int)strlen(fp->filepath);
|
|
spill(15 + n, 0);
|
|
|
|
bp += wq32(&buf[bp], fp->line_table_ofs);
|
|
bp += wq32(&buf[bp], fp->total_lines);
|
|
bp += wq32(&buf[bp], n);
|
|
memcpy(&buf[bp], fp->filepath, n);
|
|
bp += n;
|
|
|
|
fp->prev = ofp;
|
|
ofp = fp;
|
|
fp = fp->next;
|
|
}
|
|
|
|
spill(0, 1);
|
|
|
|
/* record the fileoffset of the filepath map and filepath count */
|
|
|
|
if (lseek(t->fd, 0xc, SEEK_SET) < 0)
|
|
goto bail_seek;
|
|
|
|
g32(buf, t->c + bp);
|
|
g32(buf + 4, t->next_file_index);
|
|
if (write(t->fd, buf, 8) != 8)
|
|
goto bail;
|
|
|
|
if (lseek(t->fd, t->c + bp, SEEK_SET) < 0)
|
|
goto bail_seek;
|
|
|
|
/* dump the filepath map, starting from index 0, which is at the tail */
|
|
|
|
fp = ofp;
|
|
bp = 0;
|
|
while (fp) {
|
|
spill(5, 0);
|
|
g32(buf + bp, fp->ofs);
|
|
bp += 4;
|
|
fp = fp->prev;
|
|
}
|
|
spill(0, 1);
|
|
|
|
/*
|
|
* The trie entries in reverse order... because of the reversal, we have
|
|
* always written children first, and marked them with their file offset
|
|
* before we come to refer to them.
|
|
*/
|
|
|
|
bp = 0;
|
|
sp = 0;
|
|
s[0] = t->root;
|
|
do_parent = 0;
|
|
while (s[sp]) {
|
|
|
|
/* handle any children before the parent */
|
|
|
|
if (!do_parent && s[sp]->child_list) {
|
|
|
|
if (sp + 1 == LWS_ARRAY_SIZE(s)) {
|
|
lwsl_err("Stack too deep\n");
|
|
|
|
goto bail;
|
|
}
|
|
|
|
s[sp + 1] = s[sp]->child_list;
|
|
sp++;
|
|
continue;
|
|
}
|
|
|
|
/* leaf nodes with no children */
|
|
|
|
e = s[sp];
|
|
e->ofs = t->c + bp;
|
|
|
|
/* write the trie entry header */
|
|
|
|
spill((3 * MAX_VLI), 0);
|
|
|
|
bp += wq32(&buf[bp], e->ofs_last_inst_file);
|
|
bp += wq32(&buf[bp], e->child_count);
|
|
bp += wq32(&buf[bp], e->instance_count);
|
|
bp += wq32(&buf[bp], e->agg_inst_count);
|
|
|
|
/* sort the children in order of highest aggregate hits first */
|
|
|
|
do {
|
|
struct lws_fts_entry **pe, *te1, *te2;
|
|
|
|
stasis = 1;
|
|
|
|
/* bubble sort keeps going until nothing changed */
|
|
|
|
pe = &e->child_list;
|
|
while (*pe) {
|
|
|
|
te1 = *pe;
|
|
te2 = te1->sibling;
|
|
|
|
if (te2 && te1->agg_inst_count <
|
|
te2->agg_inst_count) {
|
|
stasis = 0;
|
|
|
|
*pe = te2;
|
|
te1->sibling = te2->sibling;
|
|
te2->sibling = te1;
|
|
}
|
|
|
|
pe = &(*pe)->sibling;
|
|
}
|
|
|
|
} while (!stasis);
|
|
|
|
/* write the children */
|
|
|
|
e1 = e->child_list;
|
|
while (e1) {
|
|
spill((5 * MAX_VLI) + e1->suffix_len + 1, 0);
|
|
|
|
bp += wq32(&buf[bp], e1->ofs);
|
|
bp += wq32(&buf[bp], e1->instance_count);
|
|
bp += wq32(&buf[bp], e1->agg_inst_count);
|
|
bp += wq32(&buf[bp], e1->agg_child_count);
|
|
|
|
if (e1->suffix) { /* string */
|
|
bp += wq32(&buf[bp], e1->suffix_len);
|
|
memmove(&buf[bp], e1->suffix, e1->suffix_len);
|
|
bp += e1->suffix_len;
|
|
} else { /* char */
|
|
bp += wq32(&buf[bp], 1);
|
|
buf[bp++] = e1->c;
|
|
}
|
|
#if 0
|
|
if (e1->suffix && e1->suffix_len == 3 &&
|
|
!memcmp(e1->suffix, "cri", 3)) {
|
|
struct lws_fts_entry *e2;
|
|
|
|
e2 = e1;
|
|
while (e2){
|
|
if (e2->suffix)
|
|
lwsl_notice("%s\n", e2->suffix);
|
|
else
|
|
lwsl_notice("%c\n", e2->c);
|
|
|
|
e2 = e2->parent;
|
|
}
|
|
|
|
lwsl_err("*** %c CRI inst %d ch %d\n", e1->parent->c,
|
|
e1->instance_count, e1->child_count);
|
|
}
|
|
#endif
|
|
e1 = e1->sibling;
|
|
}
|
|
|
|
/* if there are siblings, do those next */
|
|
|
|
if (do_parent) {
|
|
do_parent = 0;
|
|
sp--;
|
|
}
|
|
|
|
if (s[sp]->sibling)
|
|
s[sp] = s[sp]->sibling;
|
|
else {
|
|
/* if there are no siblings, do the parent */
|
|
do_parent = 1;
|
|
s[sp] = s[sp]->parent;
|
|
}
|
|
}
|
|
|
|
spill(0, 1);
|
|
|
|
assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
|
|
|
|
/* drop the correct root trie offset + file length into the header */
|
|
|
|
if (lseek(t->fd, 4, SEEK_SET) < 0) {
|
|
lwsl_err("%s: unable to seek\n", __func__);
|
|
|
|
goto bail;
|
|
}
|
|
|
|
g32(buf, t->root->ofs);
|
|
g32(buf + 4, t->c);
|
|
if (write(t->fd, buf, 0x8) != 0x8)
|
|
goto bail;
|
|
|
|
lwsl_notice("%s: index %d files (%uMiB) cpu time %dms, "
|
|
"alloc: %dKiB + %dKiB, "
|
|
"serialize: %dms, file: %dKiB\n", __func__,
|
|
t->next_file_index,
|
|
(int)(t->agg_raw_input / (1024 * 1024)),
|
|
(int)(t->agg_trie_creation_us / 1000),
|
|
(int)(lwsac_total_alloc(t->lwsac_head) / 1024),
|
|
(int)(t->worst_lwsac_input_size / 1024),
|
|
(int)((lws_now_usecs() - tf) / 1000),
|
|
(int)(t->c / 1024));
|
|
|
|
return 0;
|
|
|
|
bail_seek:
|
|
lwsl_err("%s: problem seekings\n", __func__);
|
|
|
|
bail:
|
|
return 1;
|
|
}
|
|
|
|
|