You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1005 lines
21 KiB

/*
* libwebsockets - small server side websockets and web server implementation
*
* Copyright (C) 2010 - 2019 Andy Green <andy@warmcat.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "private-lib-core.h"
#include "private-lib-misc-fts.h"
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
/*
 * Number of child entries kept per level of the autocomplete walk stack; it
 * sizes the ch[] array inside struct wac.
 */
#define AC_COUNT_STASHED_CHILDREN 8
/*
 * One child of a trie entry, stashed by the autocomplete walk in
 * lws_fts_search() so it can be revisited when the walk comes back up.
 *
 *  ofs         - offset of the child's own trie entry, as read from the
 *                parent's child list; later used as the position to read
 *  name        - the child's string fragment, truncated to fit and
 *                NUL-terminated
 *  inst        - instance count read from the child list for this child
 *  child_agg   - aggregate count read from the child list; ac_record()
 *                subtracts emitted instances from it
 *  name_length - number of bytes of name actually stored (after truncation)
 *  effpos      - needle position accounted for once this child is included:
 *                starts from the parent's effpos and is advanced by the
 *                number of needle chars this child's name matched
 *  descendents - descendant count read from the child list; handed to
 *                ac_record() as its "children" argument
 */
struct ch {
jg2_file_offset ofs;
char name[64];
int inst;
int child_agg;
int name_length;
int effpos;
int descendents;
};
/*
 * One level of the autocomplete walk stack used by lws_fts_search().
 *
 *  ch            - the stashed children for this level
 *  self          - offset of the trie entry this level's children came from
 *  tifs          - the fileofs_tif_start value read from that trie entry
 *  child_count   - number of valid entries in ch[]
 *  child         - index of the next child to visit; the child currently
 *                  being walked is ch[child - 1]
 *  agg           - zeroed when the level is created, then reduced by
 *                  ac_record() as instances are emitted
 *  desc          - not referenced by the code visible in this file section
 *  done_children - nonzero once this level's children have been pushed as the
 *                  next stack level; cleared again when the level is popped
 *  once          - not referenced by the code visible in this file section
 */
struct wac {
struct ch ch[AC_COUNT_STASHED_CHILDREN];
jg2_file_offset self;
jg2_file_offset tifs;
int child_count;
int child;
int agg;
int desc;
char done_children;
char once;
};
/*
 * In-memory copy of one line-table chunk header for an indexed file, built by
 * lws_fts_cache_chunktable() and consumed by lws_fts_getfileoffset().
 *
 *  next                    - next chunk in file order, NULL on the last one
 *  chunk_line_number_start - first line number covered (numbering starts at 1)
 *  chunk_line_number_count - how many lines this chunk covers
 *  chunk_filepos_start     - byte offset in the original file where the
 *                            chunk's first line begins (running sum of the
 *                            preceding chunk lengths)
 *  vli_ofs_in_index        - offset in the index file of this chunk's
 *                            per-line length list (just past the 8-byte
 *                            chunk header)
 */
struct linetable {
struct linetable *next;
int chunk_line_number_start;
int chunk_line_number_count;
off_t chunk_filepos_start;
off_t vli_ofs_in_index;
};
/*
 * Decode a 32-bit big-endian unsigned value from the four bytes at b.
 *
 * Each byte is widened to uint32_t before shifting: shifting the promoted
 * (signed) int left by 24 is undefined when the top byte is >= 0x80.
 */
static uint32_t
b32(unsigned char *b)
{
	return ((uint32_t)b[0] << 24) | ((uint32_t)b[1] << 16) |
	       ((uint32_t)b[2] << 8) | (uint32_t)b[3];
}
/*
 * Decode a 16-bit big-endian unsigned value from the two bytes at b.
 */
static uint16_t
b16(unsigned char *b)
{
	unsigned int msb = b[0];
	unsigned int lsb = b[1];

	return (uint16_t)((msb << 8) | lsb);
}
/*
 * Fetch the filepath record selected by filepath_index from the index file.
 *
 * The 4-byte big-endian entry at jtf->filepath_table + 4 * filepath_index
 * gives the file offset of the record.  The record is three rq32()-encoded
 * values followed by the path bytes; the third value is the path length.
 *
 *  jtf            - open index file
 *  filepath_index - which filepath table entry to read
 *  result         - receives the NUL-terminated path, truncated to fit
 *  len            - size of result in bytes, must be nonzero
 *  ofs_linetable  - if non-NULL, receives the first record value
 *  lines          - if non-NULL, receives the second record value
 *
 * Returns 0 on success, or 1 if the index is out of range or the record
 * could not be read in full.
 */
static int
lws_fts_filepath(struct lws_fts_file *jtf, int filepath_index, char *result,
		 size_t len, uint32_t *ofs_linetable, uint32_t *lines)
{
	unsigned char buf[256 + 15];
	uint32_t flen, discard;
	int ra, bp = 0;
	size_t m;
	off_t o;

	if (!result || !len)
		return 1;

	/*
	 * NOTE(review): an index equal to jtf->filepaths is accepted here, as
	 * in the original; confirm against the index writer whether the last
	 * valid index is filepaths or filepaths - 1.
	 */
	if (filepath_index < 0 || filepath_index > jtf->filepaths)
		return 1;

	if (lseek(jtf->fd, jtf->filepath_table + (4 * filepath_index),
		  SEEK_SET) < 0) {
		lwsl_err("%s: unable to seek\n", __func__);

		return 1;
	}

	/* a short read would leave b32() decoding stale stack bytes */
	ra = read(jtf->fd, buf, 4);
	if (ra != 4)
		return 1;

	o = (unsigned int)b32(buf);
	if (lseek(jtf->fd, o, SEEK_SET) < 0) {
		lwsl_err("%s: unable to seek\n", __func__);

		return 1;
	}

	/* zero first so a record cut short by EOF decodes deterministically */
	memset(buf, 0, sizeof(buf));
	ra = read(jtf->fd, buf, sizeof(buf));
	if (ra <= 0)
		return 1;

	if (ofs_linetable)
		bp += rq32(&buf[bp], ofs_linetable);
	else
		bp += rq32(&buf[bp], &discard);
	if (lines)
		bp += rq32(&buf[bp], lines);
	else
		bp += rq32(&buf[bp], &discard);
	bp += rq32(&buf[bp], &flen);

	/* the three header values must lie inside what was actually read */
	if (bp > ra)
		return 1;

	/*
	 * Copy no more than the stored length, the caller's buffer (leaving
	 * room for the NUL) and the bytes we really have.
	 */
	m = flen;
	if (m > len - 1)
		m = len - 1;
	if (m > (size_t)(ra - bp))
		m = (size_t)(ra - bp);

	memcpy(result, &buf[bp], m);
	result[m] = '\0';

	return 0;
}
/*
 * Validate the trie file header readable from jtf->fd at its current
 * position and record the header fields in jtf.
 *
 * On success jtf->root holds the position of the root trie entry, jtf->flen
 * the length of the whole file, and jtf->filepath_table / jtf->filepaths the
 * location and count taken from the header.
 *
 * Returns jtf->fd on success, or -1 if the header cannot be read, the magic
 * is wrong, or the file size does not match the size stored in the header.
 */
int
lws_fts_adopt(struct lws_fts_file *jtf)
{
	unsigned char hdr[256];
	off_t end;

	if (read(jtf->fd, hdr, TRIE_FILE_HDR_SIZE) != TRIE_FILE_HDR_SIZE) {
		lwsl_err("%s: unable to read file header\n", __func__);
		return -1;
	}

	if (!(hdr[0] == 0xca && hdr[1] == 0x7a &&
	      hdr[2] == 0x5f && hdr[3] == 0x75)) {
		lwsl_err("%s: bad magic %02X %02X %02X %02X\n", __func__,
			 hdr[0], hdr[1], hdr[2], hdr[3]);
		return -1;
	}

	jtf->root = b32(&hdr[4]);

	/* seeking to the end tells us the real size of the file */
	end = lseek(jtf->fd, 0, SEEK_END);
	if (end < 0) {
		lwsl_err("%s: unable to seek\n", __func__);
		return -1;
	}
	jtf->flen = end;

	if (jtf->flen != b32(&hdr[8])) {
		lwsl_err("%s: file size doesn't match expected\n", __func__);
		return -1;
	}

	jtf->filepath_table = b32(&hdr[12]);
	jtf->filepaths = b32(&hdr[16]);

	return jtf->fd;
}
/*
 * Open the index file at filepath read-only and validate it with
 * lws_fts_adopt().
 *
 * Returns a newly allocated handle on success, or NULL if allocation, the
 * open() or the header validation fails; nothing is left open or allocated
 * on failure.
 */
struct lws_fts_file *
lws_fts_open(const char *filepath)
{
	struct lws_fts_file *handle = lws_malloc(sizeof(*handle), "fts open");

	if (!handle)
		return NULL;

	handle->fd = open(filepath, O_RDONLY);
	if (handle->fd < 0) {
		lwsl_err("%s: unable to open %s\n", __func__, filepath);
		lws_free(handle);

		return NULL;
	}

	if (lws_fts_adopt(handle) >= 0)
		return handle;

	/* header was rejected: undo the open and the allocation */
	close(handle->fd);
	lws_free(handle);

	return NULL;
}
/*
 * Close the index file descriptor held by jtf and free the handle.
 *
 * A NULL jtf is ignored, so the result of a failed lws_fts_open() can be
 * passed straight in.
 */
void
lws_fts_close(struct lws_fts_file *jtf)
{
	if (!jtf)
		return;

	close(jtf->fd);
	lws_free(jtf);
}
/*
 * grab(_pos, _size): seek jtf->fd to _pos and read up to _size bytes into
 * buf.
 *
 * Relies on the enclosing function providing jtf, buf, bp, ra and a "bail"
 * label.  On return bp is 0 and ra holds the number of bytes read, which may
 * be fewer than _size; a seek or read error jumps to bail.
 *
 * Wrapped in do { } while (0) so that "grab(...);" is a single statement and
 * stays correct in an unbraced if / else.
 */
#define grab(_pos, _size) do { \
		bp = 0; \
		if (lseek(jtf->fd, (_pos), SEEK_SET) < 0) { \
			lwsl_err("%s: unable to seek\n", __func__); \
			\
			goto bail; \
		} \
		\
		ra = read(jtf->fd, buf, (_size)); \
		if (ra < 0) \
			goto bail; \
	} while (0)
/*
 * Read the chain of line-table chunk headers starting at ofs_linetable and
 * build an in-memory list of struct linetable from them.
 *
 * Each 8-byte header holds, big-endian: a 16-bit distance to the next header
 * (0 on the last one), a 16-bit count of lines in the chunk, and a 32-bit
 * byte length added to the running file position for the following chunk.
 *
 *  jtf            - open index file
 *  ofs_linetable  - index-file offset of the first chunk header
 *  linetable_head - lwsac the list entries are allocated from; reset to NULL
 *                   on entry and freed again on failure
 *
 * Returns the first list entry, or NULL on read or allocation failure.
 */
static struct linetable *
lws_fts_cache_chunktable(struct lws_fts_file *jtf, uint32_t ofs_linetable,
			 struct lwsac **linetable_head)
{
	struct linetable *lt, *first = NULL, **prev = NULL;
	unsigned char buf[8];
	int line = 1, bp, ra;
	off_t cfs = 0;

	*linetable_head = NULL;

	do {
		grab(ofs_linetable, sizeof(buf));

		/*
		 * A short read (eg, running off the end of a truncated index)
		 * would leave the previous header in buf, and its nonzero
		 * "next" distance would keep this loop allocating forever.
		 */
		if (ra < (int)sizeof(buf))
			goto bail;

		lt = lwsac_use(linetable_head, sizeof(*lt), 0);
		if (!lt)
			goto bail;
		if (!first)
			first = lt;

		/* append to the tail of the list */
		lt->next = NULL;
		if (prev)
			*prev = lt;
		prev = &lt->next;

		lt->chunk_line_number_start = line;
		lt->chunk_line_number_count = b16(&buf[bp + 2]);
		lt->vli_ofs_in_index = ofs_linetable + 8;
		lt->chunk_filepos_start = cfs;

		line += lt->chunk_line_number_count;

		cfs += b32(&buf[bp + 4]);
		ofs_linetable += b16(&buf[bp]);

	} while (b16(&buf[bp]));

	return first;

bail:
	lwsac_free(linetable_head);

	return NULL;
}
/*
 * Convert a line number into a byte offset in the original file.
 *
 * The chunk list is searched for the chunk covering the line; the chunk's
 * per-line length list is then read from the index and the lengths of the
 * lines preceding the wanted one are added to the chunk's start position.
 *
 *  jtf     - open index file
 *  ltstart - head of the list built by lws_fts_cache_chunktable()
 *  line    - line number to look up
 *  _ofs    - receives the byte offset on success
 *
 * Returns 0 on success, or 1 if no chunk covers the line or the length list
 * could not be read far enough.
 */
static int
lws_fts_getfileoffset(struct lws_fts_file *jtf, struct linetable *ltstart,
		      int line, off_t *_ofs)
{
	struct linetable *lt = ltstart;
	unsigned char buf[LWS_FTS_LINES_PER_CHUNK * 5];
	uint32_t ll;
	off_t ofs;
	int bp, ra;

	/* first figure out which chunk (an empty list matches nothing) */

	while (lt) {
		if (line >= lt->chunk_line_number_start &&
		    line < lt->chunk_line_number_start +
			   lt->chunk_line_number_count)
			break;

		lt = lt->next;
	}

	if (!lt)
		goto bail;

	/* we know it's in this chunk */

	ofs = lt->chunk_filepos_start;
	line -= lt->chunk_line_number_start;

	grab(lt->vli_ofs_in_index, sizeof(buf));

	while (line) {
		/*
		 * never start decoding a length beyond the bytes that were
		 * actually read, whatever line count the index claims
		 */
		if (bp >= ra)
			goto bail;

		bp += rq32(&buf[bp], &ll);
		ofs += ll;
		line--;
	}

	/* we know the offset it is at in the original file */

	*_ofs = ofs;

	return 0;

bail:
	lwsl_info("%s: bail %d\n", __func__, line);

	return 1;
}
static int
ac_record(struct lws_fts_file *jtf, struct lwsac **results_head,
const char *needle, int pos, struct wac *s, int sp,
uint32_t instances, uint32_t agg_instances, uint32_t children,
struct lws_fts_result_autocomplete ***ppac)
{
struct lws_fts_result_autocomplete *ac;
int n, m;
char *p;
if (!instances && !agg_instances)
return 1;
m = pos;
for (n = 1; n <= sp; n++)
m += s[n].ch[s[n].child - 1].name_length;
ac = lwsac_use(results_head, sizeof(*ac) + m + 1, 0);
if (!ac)
return -1;
p = (char *)(ac + 1);
**ppac = ac;
ac->next = NULL;
*ppac = &ac->next;
ac->instances = instances;
ac->agg_instances = agg_instances;
ac->ac_length = m;
ac->has_children = !!children;
ac->elided = 0;
memcpy(p, needle, pos);
p += pos;
for (n = 1; n <= sp; n++) {
int w = s[n].child - 1;
memcpy(p, s[n].ch[w].name, s[n].ch[w].name_length);
p += s[n].ch[w].name_length;
}
p = (char *)(ac + 1);
p[m] = '\0';
/*
* deduct this child's instance weight from his antecdents to track
* relative path attractiveness dynamically, after we already used its
* best results (children are sorted best-first)
*/
for (n = sp; n >= 0; n--) {
s[n].ch[s[n].child - 1].child_agg -= instances;
s[n].agg -= instances;
}
return 0;
}
/*
 * Search the index for ftsp->needle and build a result set in
 * ftsp->results_head.
 *
 * The work is done in up to three phases:
 *
 *  1) Walk the trie from jtf->root, matching the lowercased needle against
 *     each entry's children until the whole needle is consumed or no child
 *     can match.
 *
 *  2) If the needle matched an entry that has instances and
 *     LWSFTS_F_QUERY_FILES is set, walk that entry's per-file match lists.
 *     One lws_fts_result_filepath is produced per file (optionally with the
 *     matching line numbers / offsets for LWSFTS_F_QUERY_FILE_LINES and the
 *     quoted line text for LWSFTS_F_QUERY_QUOTE_LINE), and the list is then
 *     sorted by matches per line of file.
 *
 *  3) If LWSFTS_F_QUERY_AUTOCOMPLETE is set, walk the descendents of the
 *     point reached in the trie to collect up to ftsp->max_autocomplete
 *     suggestions, then sort those by instance count.
 *
 * Returns NULL if there is no needle, the needle is longer than the local
 * needle buffer allows, or the result header cannot be allocated.  Otherwise
 * returns the result, which may be empty or partial if an error was hit
 * along the way.
 */
struct lws_fts_result *
lws_fts_search(struct lws_fts_file *jtf, struct lws_fts_search_params *ftsp)
{
	uint32_t children, instances, co, sl, agg, slt, chunk,
		 fileofs_tif_start, desc, agg_instances;
	int pos = 0, n, m, nl, bp, base = 0, ra, palm, budget, sp, ofd = -1;
	unsigned long long tf = lws_now_usecs();
	struct lws_fts_result_autocomplete **pac = NULL;
	char stasis, nac = 0, credible, needle[32];
	struct lws_fts_result_filepath *fp;
	struct lws_fts_result *result;
	unsigned char buf[4096];
	off_t o, child_ofs;
	struct wac s[128];

	ftsp->results_head = NULL;

	if (!ftsp->needle)
		return NULL;

	nl = (int)strlen(ftsp->needle);
	if ((size_t)nl > sizeof(needle) - 2)
		return NULL;

	result = lwsac_use(&ftsp->results_head, sizeof(*result), 0);
	if (!result)
		return NULL;

	/* start with no results... */

	result->autocomplete_head = NULL;
	pac = &result->autocomplete_head;
	result->filepath_head = NULL;
	result->duration_ms = 0;
	result->effective_flags = ftsp->flags;

	palm = 0;

	/*
	 * tolower() is only defined for unsigned char values and EOF, so go
	 * via unsigned char in case plain char is signed and the needle has
	 * bytes >= 0x80
	 */
	for (n = 0; n < nl; n++)
		needle[n] = (char)tolower((unsigned char)ftsp->needle[n]);
	needle[nl] = '\0';

	o = jtf->root;
	do {
		bp = 0;
		base = 0;

		grab(o, sizeof(buf));

		child_ofs = o + bp;
		bp += rq32(&buf[bp], &fileofs_tif_start);
		bp += rq32(&buf[bp], &children);
		bp += rq32(&buf[bp], &instances);
		bp += rq32(&buf[bp], &agg_instances);
		palm = pos;

		/* the children follow here */

		if (pos == nl) {

			nac = 0;

			if (!fileofs_tif_start)
				/*
				 * we matched, but there are no instances of
				 * this, it's actually an intermediate
				 */
				goto autocomp;

			/* we leave with bp positioned at the instance list */

			o = fileofs_tif_start;
			grab(o, sizeof(buf));
			break;
		}

		if (ra - bp < 1024) {
			/*
			 * We don't have enough.  So reload the buffer starting
			 * at where we got to.
			 */
			base += bp;
			grab(o + base, sizeof(buf));
		}

		/* gets set if any child COULD match needle if it went on */

		credible = 0;
		for (n = 0; (uint32_t)n < children; n++) {
			uint32_t inst;

			bp += rq32(&buf[bp], &co);
			bp += rq32(&buf[bp], &inst);
			bp += rq32(&buf[bp], &agg);
			bp += rq32(&buf[bp], &desc);
			bp += rq32(&buf[bp], &sl);

			if (sl > (uint32_t)(nl - pos)) {

				/*
				 * it can't be a match because it's longer than
				 * our needle string (but that leaves it as a
				 * perfectly fine autocomplete candidate)
				 */
				size_t g = nl - pos;

				/*
				 * "credible" means at least one child matches
				 * all the chars in needle up to as many as it
				 * has.  If not "credible" this path cannot
				 * match.
				 */
				if (!strncmp((char *)&buf[bp], &needle[pos], g))
					credible = 1;
				else
					/*
					 * deflate the parent agg using the
					 * knowledge this child is not on the
					 * path shown by the remainder of needle
					 */
					agg_instances -= agg;

				nac = 0;
				bp += sl;
				slt = 0;
				pos = palm;
				goto ensure;
			}

			/* the comparison string potentially has huge length */

			slt = sl;
			while (slt) {

				/*
				 * the strategy is to compare whatever we have
				 * lying around, then bring in more if it didn't
				 * fail to match yet.  That way we don't bring
				 * in anything we could already have known was
				 * not needed due to a match fail.
				 */

				chunk = ra - bp;
				if (chunk > slt)
					chunk = slt;

				if ((chunk == 1 && needle[pos] != buf[bp]) ||
				    (chunk != 1 &&
				     memcmp(&needle[pos], &buf[bp], chunk))) {

					/*
					 * it doesn't match... so nothing can
					 * autocomplete this...
					 */
					bp += slt;
					slt = 0;
					nac = 1;
					goto ensure;
				}

				slt -= chunk;
				pos += chunk;
				bp += chunk;

				/* so far, it matches */

				if (!slt) {
					/* we matched the whole thing */
					o = co;
					if (!co)
						goto bail;
					n = (int)children;
					credible = 1;
				}

ensure:
				/*
				 * do we have at least buf more to match, or the
				 * remainder of the string, whichever is less?
				 *
				 * bp may exceed sizeof(buf) on no match path
				 */
				chunk = sizeof(buf);
				if (slt < chunk)
					chunk = slt;

				if (ra - bp >= (int)chunk)
					continue;

				/*
				 * We don't have enough.  So reload buf starting
				 * at where we got to.
				 */
				base += bp;
				grab(o + base, sizeof(buf));

			} /* while we are still comparing */

		} /* for each child */

		if ((uint32_t)n == children) {
			if (!credible)
				goto bail;

			nac = 0;
			goto autocomp;
		}
	} while(1);

	result->duration_ms = (int)((lws_now_usecs() - tf) / 1000);

	if (!instances && !children)
		return result;

	/* the match list may easily exceed one read buffer load ... */

	o += bp;

	/*
	 * Only do the file match list if it was requested in the search flags
	 */

	if (!(ftsp->flags & LWSFTS_F_QUERY_FILES))
		goto autocomp;

	do {
		uint32_t fi, tot, line, ro, ofs_linetable, lines, fplen,
			*u, _o;
		struct lwsac *lt_head = NULL;
		struct linetable *ltst;
		char path[256], *pp;
		int footprint;
		off_t fo;

		ofd = -1;
		grab(o, sizeof(buf));

		ro = o;
		bp += rq32(&buf[bp], &_o);
		o = _o;

		assert(!o || o > TRIE_FILE_HDR_SIZE);

		bp += rq32(&buf[bp], &fi);
		bp += rq32(&buf[bp], &tot);

		if (lws_fts_filepath(jtf, fi, path, sizeof(path) - 1,
				     &ofs_linetable, &lines)) {
			lwsl_err("can't get filepath index %d\n", fi);
			goto bail;
		}

		if (ftsp->only_filepath && strcmp(path, ftsp->only_filepath))
			continue;

		ltst = lws_fts_cache_chunktable(jtf, ofs_linetable, &lt_head);
		if (!ltst)
			goto bail;

		if (ftsp->flags & LWSFTS_F_QUERY_QUOTE_LINE) {
			ofd = open(path, O_RDONLY);
			if (ofd < 0) {
				lwsac_free(&lt_head);
				goto bail;
			}
		}

		fplen = (int)strlen(path);
		footprint = sizeof(*fp) + fplen + 1;
		if (ftsp->flags & LWSFTS_F_QUERY_FILE_LINES) {
			/* line number and offset in file */
			footprint += 2 * sizeof(uint32_t) * tot;

			if (ftsp->flags & LWSFTS_F_QUERY_QUOTE_LINE)
				/* pointer to quote string */
				footprint += sizeof(void *) * tot;
		}

		fp = lwsac_use(&ftsp->results_head, footprint, 0);
		if (!fp) {
			lwsac_free(&lt_head);
			goto bail;
		}

		fp->filepath_length = fplen;
		fp->lines_in_file = lines;
		fp->matches = tot;
		fp->matches_length = footprint - sizeof(*fp) - (fplen + 1);
		fp->next = result->filepath_head;
		result->filepath_head = fp;

		/* line table first so it can be aligned */

		u = (uint32_t*)(fp + 1);

		if (ftsp->flags & LWSFTS_F_QUERY_FILE_LINES) {

			/*
			 * for each line number
			 *
			 * NOTE(review): the early "continue"s below leave the
			 * rest of that match's slots unwritten and do not
			 * advance u past them, so later matches are stored
			 * at shifted positions -- confirm what consumers of
			 * the table expect on these failure paths.
			 */

			for (n = 0; (uint32_t)n < tot; n++) {

				unsigned char lbuf[256], *p;
				char ebuf[384];
				const char **v;
				int m;

				if ((ra - bp) < 8) {
					base += bp;
					grab(ro + base, sizeof(buf));
				}

				bp += rq32(&buf[bp], &line);
				*u++ = line;

				if (lws_fts_getfileoffset(jtf, ltst, line, &fo))
					continue;

				*u++ = (uint32_t)fo;

				if (!(ftsp->flags & LWSFTS_F_QUERY_QUOTE_LINE))
					continue;

				if (lseek(ofd, fo, SEEK_SET) < 0)
					continue;

				m = read(ofd, lbuf, sizeof(lbuf) - 1);
				if (m < 0)
					continue;

				/*
				 * terminate at what was actually read, so the
				 * scans below never look at stale bytes left
				 * beyond a short read
				 */
				lbuf[m] = '\0';

				p = (unsigned char *)strchr((char *)lbuf, '\n');
				if (p)
					m = lws_ptr_diff(p, lbuf);
				lbuf[m] = '\0';
				p = (unsigned char *)strchr((char *)lbuf, '\r');
				if (p)
					m = lws_ptr_diff(p, lbuf);
				lbuf[m] = '\0';

				lws_json_purify(ebuf, (const char *)lbuf,
						sizeof(ebuf) - 1, NULL);
				m = (int)strlen(ebuf);

				p = lwsac_use(&ftsp->results_head, m + 1, 0);
				if (!p) {
					lwsac_free(&lt_head);
					goto bail;
				}

				memcpy(p, ebuf, m);
				p[m] = '\0';
				v = (const char **)u;
				*v = (const char *)p;
				u += sizeof(const char *) / sizeof(uint32_t);
			}
		}

		/* the filepath is stored after the match table */

		pp = ((char *)&fp[1]) + fp->matches_length;
		memcpy(pp, path, fplen);
		pp[fplen] = '\0';

		if (ofd >= 0) {
			close(ofd);
			ofd = -1;
		}

		lwsac_free(&lt_head);

		if (ftsp->only_filepath)
			break;

	} while (o);

	/* sort the instance file list by results density */

	do {
		struct lws_fts_result_filepath **prf, *rf1, *rf2;

		stasis = 1;

		/* bubble sort keeps going until nothing changed */

		prf = &result->filepath_head;
		while (*prf) {

			rf1 = *prf;
			rf2 = rf1->next;

			if (rf2 && rf1->lines_in_file && rf2->lines_in_file &&
			    ((rf1->matches * 1000) / rf1->lines_in_file) <
			    ((rf2->matches * 1000) / rf2->lines_in_file)) {
				stasis = 0;

				*prf = rf2;
				rf1->next = rf2->next;
				rf2->next = rf1;
			}

			prf = &(*prf)->next;
		}

	} while (!stasis);

autocomp:

	if (!(ftsp->flags & LWSFTS_F_QUERY_AUTOCOMPLETE) || nac)
		return result;

	/*
	 * autocomplete (ie, the descendent paths that yield the most hits)
	 *
	 * We actually need to spider the earliest terminal descendents from
	 * the child we definitely got past, and present the first n terminal
	 * strings.  The descendents are already sorted in order of highest
	 * aggregated hits in their descendents first, so simply collecting n
	 * earliest leaf children is enough.
	 *
	 * The leaf children may be quite deep down in a stack however.  So we
	 * have to go through all the walking motions collecting and retaining
	 * child info for when we come back up the walk.
	 *
	 * We can completely ignore file instances for this, we just need the
	 * earliest children.  And we can restrict how many children we stash
	 * in each stack level to eg, 5.
	 *
	 * child_ofs comes in pointing at the start of the trie entry that is
	 * to be the starting point for making suggestions.
	 */

	budget = ftsp->max_autocomplete;
	base = 0;
	bp = 0;
	pac = &result->autocomplete_head;
	sp = 0;
	if (pos > (int)sizeof(s[sp].ch[0].name) - 1)
		pos = (int)sizeof(s[sp].ch[0].name) - 1;

	memset(&s[sp], 0, sizeof(s[sp]));

	s[sp].child = 1;
	s[sp].tifs = fileofs_tif_start;
	s[sp].self = child_ofs;
	s[sp].ch[0].effpos = pos;

	if (pos == nl) {
		n = ac_record(jtf, &ftsp->results_head, needle, pos, s, 0,
			      instances, agg_instances, children, &pac);
		/* allocation failure: give up, as the in-loop call does */
		if (n < 0)
			goto bail;
	}

	while (sp >= 0 && budget) {
		int nobump = 0;
		struct ch *tch = &s[sp].ch[s[sp].child - 1];

		grab(child_ofs, sizeof(buf));

		bp += rq32(&buf[bp], &fileofs_tif_start);
		bp += rq32(&buf[bp], &children);
		bp += rq32(&buf[bp], &instances);
		bp += rq32(&buf[bp], &agg_instances);

		if (sp > 0 && s[sp - 1].done_children &&
		    tch->effpos + tch->name_length >= nl &&
		    tch->inst && fileofs_tif_start) {
			n = ac_record(jtf, &ftsp->results_head, needle, pos, s,
				      sp, tch->inst, tch->child_agg,
				      tch->descendents, &pac);
			if (n < 0)
				goto bail;
			if (!n)
				if (--budget == 0)
					break;
		}

		/*
		 * Only descend while there is a free stack level left: a trie
		 * deeper than s[] is treated as having no further children
		 * here rather than writing past the end of the stack.
		 */
		if (!s[sp].done_children && children &&
		    sp + 1 < (int)LWS_ARRAY_SIZE(s)) {
			s[sp].done_children = 1;
			sp++;
			memset(&s[sp], 0, sizeof(s[sp]));
			s[sp].tifs = fileofs_tif_start;
			s[sp].self = child_ofs;

			for (n = 0; n < (int)children && s[sp].child_count <
					    (int)LWS_ARRAY_SIZE(s[0].ch); n++) {
				uint32_t slen, cho, agg, inst;
				int i = s[sp].child_count;
				struct ch *ch = &s[sp].ch[i];
				size_t max;

				bp += rq32(&buf[bp], &cho);
				bp += rq32(&buf[bp], &inst);
				bp += rq32(&buf[bp], &agg);
				bp += rq32(&buf[bp], &desc);
				bp += rq32(&buf[bp], &slen);

				max = slen;
				if (max > sizeof(ch->name) - 1)
					max = sizeof(ch->name) - 1;

				strncpy(ch->name, (char *)&buf[bp], max);
				bp += slen;

				ch->name_length = (int)max;
				ch->name[sizeof(ch->name) - 1] = '\0';
				ch->inst = inst;
				ch->effpos =
				       s[sp - 1].ch[s[sp - 1].child - 1].effpos;

				ch->child_agg = agg;
				ch->descendents = desc;

				/*
				 * if we have more needle chars than we matched
				 * to get this far, we can only allow potential
				 * matches that are consistent with the
				 * additional unmatched character(s)...
				 */

				m = nl - ch->effpos;
				if (m > ch->name_length)
					m = ch->name_length;

				if (m > 0 &&
				    strncmp(&needle[ch->effpos], ch->name, m))
					continue;

				ch->effpos += m;
				s[sp].ch[s[sp].child_count++].ofs = cho;
			}
		}

		/* pop every level whose stashed children are all used up */

		while (sp >= 0 && s[sp].child >= s[sp].child_count) {
			s[sp].done_children = 0;
			sp--;
		}

		/*
		 * Compare parent remaining agg vs parent's next siblings' still
		 * intact original agg... if the next sibling has more, abandon
		 * the parent path and go with the sibling... this keeps the
		 * autocomplete results related to popularity.
		 */

		nobump = 0;
		n = sp - 1;
		while (n >= 0) {
			/*
			 * pac points at the "next" member of the most recent
			 * record; the cast treats that address as the record
			 * itself.  NOTE(review): this presumably relies on
			 * "next" being the first member of
			 * struct lws_fts_result_autocomplete -- confirm
			 * against the struct definition.
			 */
			struct lws_fts_result_autocomplete *ac =
				(struct lws_fts_result_autocomplete *)pac;

			if (s[n].child < s[n].child_count &&
			    s[n].ch[s[n].child - 1].child_agg <
				    s[n].ch[s[n].child].child_agg) {

				/*
				 * Until a record has been added, pac still
				 * points at the list head inside *result, so
				 * there is no record to mark.
				 */
				if (pac != &result->autocomplete_head)
					/*
					 * mark the autocomplete result that
					 * there were more children down his
					 * path that we skipped in these results
					 */
					ac->elided = 1;

				for (m = n; m < sp + 1; m++)
					s[m].done_children = 0;
				sp = n;
				child_ofs = s[sp].ch[s[sp].child++].ofs;
				nobump = 1;
			}

			n--;
		}

		if (nobump || sp < 0)
			continue;

		child_ofs = s[sp].ch[s[sp].child++].ofs;
	}

	/* let's do a final sort into agg order */

	do {
		struct lws_fts_result_autocomplete *ac1, *ac2;

		stasis = 1;

		/* bubble sort keeps going until nothing changed */

		pac = &result->autocomplete_head;
		while (*pac) {

			ac1 = *pac;
			ac2 = ac1->next;

			if (ac2 && ac1->instances < ac2->instances) {
				stasis = 0;

				*pac = ac2;
				ac1->next = ac2->next;
				ac2->next = ac1;
			}

			pac = &(*pac)->next;
		}

	} while (!stasis);

	return result;

bail:
	if (ofd >= 0)
		close(ofd);

	lwsl_info("%s: search ended up at bail\n", __func__);

	return result;
}