/* * libwebsockets - small server side websockets and web server implementation * * Copyright (C) 2010 - 2019 Andy Green * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include "private-lib-core.h" #include "private-lib-misc-fts.h" #include #include #include #include #include #include #define AC_COUNT_STASHED_CHILDREN 8 struct ch { jg2_file_offset ofs; char name[64]; int inst; int child_agg; int name_length; int effpos; int descendents; }; struct wac { struct ch ch[AC_COUNT_STASHED_CHILDREN]; jg2_file_offset self; jg2_file_offset tifs; int child_count; int child; int agg; int desc; char done_children; char once; }; struct linetable { struct linetable *next; int chunk_line_number_start; int chunk_line_number_count; off_t chunk_filepos_start; off_t vli_ofs_in_index; }; static uint32_t b32(unsigned char *b) { return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3]; } static uint16_t b16(unsigned char *b) { return (b[0] << 8) | b[1]; } static int lws_fts_filepath(struct lws_fts_file *jtf, int filepath_index, char *result, size_t len, uint32_t *ofs_linetable, uint32_t *lines) { unsigned char buf[256 + 15]; uint32_t flen; int ra, bp = 0; size_t m; off_t o; if (filepath_index > jtf->filepaths) return 1; if (lseek(jtf->fd, jtf->filepath_table + (4 * filepath_index), SEEK_SET) < 0) { lwsl_err("%s: unable to seek\n", __func__); return 1; } ra = read(jtf->fd, buf, 4); if (ra < 0) return 1; o = (unsigned int)b32(buf); if (lseek(jtf->fd, o, SEEK_SET) < 0) { lwsl_err("%s: unable to seek\n", __func__); return 1; } ra = read(jtf->fd, buf, sizeof(buf)); if (ra < 0) return 1; if (ofs_linetable) bp += rq32(&buf[bp], ofs_linetable); else bp += rq32(&buf[bp], &flen); if (lines) bp += rq32(&buf[bp], lines); else bp += rq32(&buf[bp], &flen); bp += rq32(&buf[bp], &flen); m = flen; if (len - 1 < m) m = flen - 1; strncpy(result, (char *)&buf[bp], m); result[m] = '\0'; result[len - 1] = '\0'; return 0; } /* * returns -1 for fail or fd open on the trie file. * * *root is set to the position of the root trie entry. * *flen is set to the length of the whole file */ int lws_fts_adopt(struct lws_fts_file *jtf) { unsigned char buf[256]; off_t ot; if (read(jtf->fd, buf, TRIE_FILE_HDR_SIZE) != TRIE_FILE_HDR_SIZE) { lwsl_err("%s: unable to read file header\n", __func__); goto bail; } if (buf[0] != 0xca || buf[1] != 0x7a || buf[2] != 0x5f || buf[3] != 0x75) { lwsl_err("%s: bad magic %02X %02X %02X %02X\n", __func__, buf[0], buf[1], buf[2], buf[3]); goto bail; } jtf->root = b32(&buf[4]); ot = lseek(jtf->fd, 0, SEEK_END); if (ot < 0) { lwsl_err("%s: unable to seek\n", __func__); goto bail; } jtf->flen = ot; if (jtf->flen != b32(&buf[8])) { lwsl_err("%s: file size doesn't match expected\n", __func__); goto bail; } jtf->filepath_table = b32(&buf[12]); jtf->filepaths = b32(&buf[16]); return jtf->fd; bail: return -1; } struct lws_fts_file * lws_fts_open(const char *filepath) { struct lws_fts_file *jtf; jtf = lws_malloc(sizeof(*jtf), "fts open"); if (!jtf) goto bail1; jtf->fd = open(filepath, O_RDONLY); if (jtf->fd < 0) { lwsl_err("%s: unable to open %s\n", __func__, filepath); goto bail2; } if (lws_fts_adopt(jtf) < 0) goto bail3; return jtf; bail3: close(jtf->fd); bail2: lws_free(jtf); bail1: return NULL; } void lws_fts_close(struct lws_fts_file *jtf) { close(jtf->fd); lws_free(jtf); } #define grab(_pos, _size) { \ bp = 0; \ if (lseek(jtf->fd, _pos, SEEK_SET) < 0) { \ lwsl_err("%s: unable to seek\n", __func__); \ \ goto bail; \ } \ \ ra = read(jtf->fd, buf, _size); \ if (ra < 0) \ goto bail; \ } static struct linetable * lws_fts_cache_chunktable(struct lws_fts_file *jtf, uint32_t ofs_linetable, struct lwsac **linetable_head) { struct linetable *lt, *first = NULL, **prev = NULL; unsigned char buf[8]; int line = 1, bp, ra; off_t cfs = 0; *linetable_head = NULL; do { grab(ofs_linetable, sizeof(buf)); lt = lwsac_use(linetable_head, sizeof(*lt), 0); if (!lt) goto bail; if (!first) first = lt; lt->next = NULL; if (prev) *prev = lt; prev = <->next; lt->chunk_line_number_start = line; lt->chunk_line_number_count = b16(&buf[bp + 2]); lt->vli_ofs_in_index = ofs_linetable + 8; lt->chunk_filepos_start = cfs; line += lt->chunk_line_number_count; cfs += b32(&buf[bp + 4]); ofs_linetable += b16(&buf[bp]); } while (b16(&buf[bp])); return first; bail: lwsac_free(linetable_head); return NULL; } static int lws_fts_getfileoffset(struct lws_fts_file *jtf, struct linetable *ltstart, int line, off_t *_ofs) { struct linetable *lt = ltstart; unsigned char buf[LWS_FTS_LINES_PER_CHUNK * 5]; uint32_t ll; off_t ofs; int bp, ra; /* first figure out which chunk */ do { if (line >= lt->chunk_line_number_start && line < lt->chunk_line_number_start + lt->chunk_line_number_count) break; lt = lt->next; } while (lt); if (!lt) goto bail; /* we know it's in this chunk */ ofs = lt->chunk_filepos_start; line -= lt->chunk_line_number_start; grab(lt->vli_ofs_in_index, sizeof(buf)); bp = 0; while (line) { bp += rq32(&buf[bp], &ll); ofs += ll; line--; } /* we know the offset it is at in the original file */ *_ofs = ofs; return 0; bail: lwsl_info("%s: bail %d\n", __func__, line); return 1; } static int ac_record(struct lws_fts_file *jtf, struct lwsac **results_head, const char *needle, int pos, struct wac *s, int sp, uint32_t instances, uint32_t agg_instances, uint32_t children, struct lws_fts_result_autocomplete ***ppac) { struct lws_fts_result_autocomplete *ac; int n, m; char *p; if (!instances && !agg_instances) return 1; m = pos; for (n = 1; n <= sp; n++) m += s[n].ch[s[n].child - 1].name_length; ac = lwsac_use(results_head, sizeof(*ac) + m + 1, 0); if (!ac) return -1; p = (char *)(ac + 1); **ppac = ac; ac->next = NULL; *ppac = &ac->next; ac->instances = instances; ac->agg_instances = agg_instances; ac->ac_length = m; ac->has_children = !!children; ac->elided = 0; memcpy(p, needle, pos); p += pos; for (n = 1; n <= sp; n++) { int w = s[n].child - 1; memcpy(p, s[n].ch[w].name, s[n].ch[w].name_length); p += s[n].ch[w].name_length; } p = (char *)(ac + 1); p[m] = '\0'; /* * deduct this child's instance weight from his antecdents to track * relative path attractiveness dynamically, after we already used its * best results (children are sorted best-first) */ for (n = sp; n >= 0; n--) { s[n].ch[s[n].child - 1].child_agg -= instances; s[n].agg -= instances; } return 0; } struct lws_fts_result * lws_fts_search(struct lws_fts_file *jtf, struct lws_fts_search_params *ftsp) { uint32_t children, instances, co, sl, agg, slt, chunk, fileofs_tif_start, desc, agg_instances; int pos = 0, n, m, nl, bp, base = 0, ra, palm, budget, sp, ofd = -1; unsigned long long tf = lws_now_usecs(); struct lws_fts_result_autocomplete **pac = NULL; char stasis, nac = 0, credible, needle[32]; struct lws_fts_result_filepath *fp; struct lws_fts_result *result; unsigned char buf[4096]; off_t o, child_ofs; struct wac s[128]; ftsp->results_head = NULL; if (!ftsp->needle) return NULL; nl = (int)strlen(ftsp->needle); if ((size_t)nl > sizeof(needle) - 2) return NULL; result = lwsac_use(&ftsp->results_head, sizeof(*result), 0); if (!result) return NULL; /* start with no results... */ result->autocomplete_head = NULL; pac = &result->autocomplete_head; result->filepath_head = NULL; result->duration_ms = 0; result->effective_flags = ftsp->flags; palm = 0; for (n = 0; n < nl; n++) needle[n] = tolower(ftsp->needle[n]); needle[nl] = '\0'; o = jtf->root; do { bp = 0; base = 0; grab(o, sizeof(buf)); child_ofs = o + bp; bp += rq32(&buf[bp], &fileofs_tif_start); bp += rq32(&buf[bp], &children); bp += rq32(&buf[bp], &instances); bp += rq32(&buf[bp], &agg_instances); palm = pos; /* the children follow here */ if (pos == nl) { nac = 0; if (!fileofs_tif_start) /* * we matched, but there are no instances of * this, it's actually an intermediate */ goto autocomp; /* we leave with bp positioned at the instance list */ o = fileofs_tif_start; grab(o, sizeof(buf)); break; } if (ra - bp < 1024) { /* * We don't have enough. So reload the buffer starting * at where we got to. */ base += bp; grab(o + base, sizeof(buf)); } /* gets set if any child COULD match needle if it went on */ credible = 0; for (n = 0; (uint32_t)n < children; n++) { uint32_t inst; bp += rq32(&buf[bp], &co); bp += rq32(&buf[bp], &inst); bp += rq32(&buf[bp], &agg); bp += rq32(&buf[bp], &desc); bp += rq32(&buf[bp], &sl); if (sl > (uint32_t)(nl - pos)) { /* * it can't be a match because it's longer than * our needle string (but that leaves it as a * perfectly fine autocomplete candidate) */ size_t g = nl - pos; /* * "credible" means at least one child matches * all the chars in needle up to as many as it * has. If not "credible" this path cannot * match. */ if (!strncmp((char *)&buf[bp], &needle[pos], g)) credible = 1; else /* * deflate the parent agg using the * knowledge this child is not on the * path shown by the remainder of needle */ agg_instances -= agg; nac = 0; bp += sl; slt = 0; pos = palm; goto ensure; } /* the comparison string potentially has huge length */ slt = sl; while (slt) { /* * the strategy is to compare whatever we have * lying around, then bring in more if it didn't * fail to match yet. That way we don't bring * in anything we could already have known was * not needed due to a match fail. */ chunk = ra - bp; if (chunk > slt) chunk = slt; if ((chunk == 1 && needle[pos] != buf[bp]) || (chunk != 1 && memcmp(&needle[pos], &buf[bp], chunk))) { /* * it doesn't match... so nothing can * autocomplete this... */ bp += slt; slt = 0; nac = 1; goto ensure; } slt -= chunk; pos += chunk; bp += chunk; /* so far, it matches */ if (!slt) { /* we matched the whole thing */ o = co; if (!co) goto bail; n = (int)children; credible = 1; } ensure: /* * do we have at least buf more to match, or the * remainder of the string, whichever is less? * * bp may exceed sizeof(buf) on no match path */ chunk = sizeof(buf); if (slt < chunk) chunk = slt; if (ra - bp >= (int)chunk) continue; /* * We don't have enough. So reload buf starting * at where we got to. */ base += bp; grab(o + base, sizeof(buf)); } /* while we are still comparing */ } /* for each child */ if ((uint32_t)n == children) { if (!credible) goto bail; nac = 0; goto autocomp; } } while(1); result->duration_ms = (int)((lws_now_usecs() - tf) / 1000); if (!instances && !children) return result; /* the match list may easily exceed one read buffer load ... */ o += bp; /* * Only do the file match list if it was requested in the search flags */ if (!(ftsp->flags & LWSFTS_F_QUERY_FILES)) goto autocomp; do { uint32_t fi, tot, line, ro, ofs_linetable, lines, fplen, *u, _o; struct lwsac *lt_head = NULL; struct linetable *ltst; char path[256], *pp; int footprint; off_t fo; ofd = -1; grab(o, sizeof(buf)); ro = o; bp += rq32(&buf[bp], &_o); o = _o; assert(!o || o > TRIE_FILE_HDR_SIZE); bp += rq32(&buf[bp], &fi); bp += rq32(&buf[bp], &tot); if (lws_fts_filepath(jtf, fi, path, sizeof(path) - 1, &ofs_linetable, &lines)) { lwsl_err("can't get filepath index %d\n", fi); goto bail; } if (ftsp->only_filepath && strcmp(path, ftsp->only_filepath)) continue; ltst = lws_fts_cache_chunktable(jtf, ofs_linetable, <_head); if (!ltst) goto bail; if (ftsp->flags & LWSFTS_F_QUERY_QUOTE_LINE) { ofd = open(path, O_RDONLY); if (ofd < 0) { lwsac_free(<_head); goto bail; } } fplen = (int)strlen(path); footprint = sizeof(*fp) + fplen + 1; if (ftsp->flags & LWSFTS_F_QUERY_FILE_LINES) { /* line number and offset in file */ footprint += 2 * sizeof(uint32_t) * tot; if (ftsp->flags & LWSFTS_F_QUERY_QUOTE_LINE) /* pointer to quote string */ footprint += sizeof(void *) * tot; } fp = lwsac_use(&ftsp->results_head, footprint, 0); if (!fp) { lwsac_free(<_head); goto bail; } fp->filepath_length = fplen; fp->lines_in_file = lines; fp->matches = tot; fp->matches_length = footprint - sizeof(*fp) - (fplen + 1); fp->next = result->filepath_head; result->filepath_head = fp; /* line table first so it can be aligned */ u = (uint32_t*)(fp + 1); if (ftsp->flags & LWSFTS_F_QUERY_FILE_LINES) { /* for each line number */ for (n = 0; (uint32_t)n < tot; n++) { unsigned char lbuf[256], *p; char ebuf[384]; const char **v; int m; if ((ra - bp) < 8) { base += bp; grab(ro + base, sizeof(buf)); } bp += rq32(&buf[bp], &line); *u++ = line; if (lws_fts_getfileoffset(jtf, ltst, line, &fo)) continue; *u++ = (uint32_t)fo; if (!(ftsp->flags & LWSFTS_F_QUERY_QUOTE_LINE)) continue; if (lseek(ofd, fo, SEEK_SET) < 0) continue; m = read(ofd, lbuf, sizeof(lbuf) - 1); if (m < 0) continue; lbuf[sizeof(lbuf) - 1] = '\0'; p = (unsigned char *)strchr((char *)lbuf, '\n'); if (p) m = lws_ptr_diff(p, lbuf); lbuf[m] = '\0'; p = (unsigned char *)strchr((char *)lbuf, '\r'); if (p) m = lws_ptr_diff(p, lbuf); lbuf[m] = '\0'; lws_json_purify(ebuf, (const char *)lbuf, sizeof(ebuf) - 1, NULL); m = (int)strlen(ebuf); p = lwsac_use(&ftsp->results_head, m + 1, 0); if (!p) { lwsac_free(<_head); goto bail; } memcpy(p, ebuf, m); p[m] = '\0'; v = (const char **)u; *v = (const char *)p; u += sizeof(const char *) / sizeof(uint32_t); } } pp = ((char *)&fp[1]) + fp->matches_length; memcpy(pp, path, fplen); pp[fplen] = '\0'; if (ofd >= 0) { close(ofd); ofd = -1; } lwsac_free(<_head); if (ftsp->only_filepath) break; } while (o); /* sort the instance file list by results density */ do { struct lws_fts_result_filepath **prf, *rf1, *rf2; stasis = 1; /* bubble sort keeps going until nothing changed */ prf = &result->filepath_head; while (*prf) { rf1 = *prf; rf2 = rf1->next; if (rf2 && rf1->lines_in_file && rf2->lines_in_file && ((rf1->matches * 1000) / rf1->lines_in_file) < ((rf2->matches * 1000) / rf2->lines_in_file)) { stasis = 0; *prf = rf2; rf1->next = rf2->next; rf2->next = rf1; } prf = &(*prf)->next; } } while (!stasis); autocomp: if (!(ftsp->flags & LWSFTS_F_QUERY_AUTOCOMPLETE) || nac) return result; /* * autocomplete (ie, the descendent paths that yield the most hits) * * We actually need to spider the earliest terminal descendents from * the child we definitely got past, and present the first n terminal * strings. The descendents are already sorted in order of highest * aggregated hits in their descendents first, so simply collecting n * earliest leaf children is enough. * * The leaf children may be quite deep down in a stack however. So we * have to go through all the walking motions collecting and retaining * child into for when we come back up the walk. * * We can completely ignore file instances for this, we just need the * earliest children. And we can restrict how many children we stash * in each stack level to eg, 5. * * child_ofs comes in pointing at the start of the trie entry that is * to be the starting point for making suggestions. */ budget = ftsp->max_autocomplete; base = 0; bp = 0; pac = &result->autocomplete_head; sp = 0; if (pos > (int)sizeof(s[sp].ch[0].name) - 1) pos = (int)sizeof(s[sp].ch[0].name) - 1; memset(&s[sp], 0, sizeof(s[sp])); s[sp].child = 1; s[sp].tifs = fileofs_tif_start; s[sp].self = child_ofs; s[sp].ch[0].effpos = pos; if (pos == nl) n = ac_record(jtf, &ftsp->results_head, needle, pos, s, 0, instances, agg_instances, children, &pac); while (sp >= 0 && budget) { int nobump = 0; struct ch *tch = &s[sp].ch[s[sp].child - 1]; grab(child_ofs, sizeof(buf)); bp += rq32(&buf[bp], &fileofs_tif_start); bp += rq32(&buf[bp], &children); bp += rq32(&buf[bp], &instances); bp += rq32(&buf[bp], &agg_instances); if (sp > 0 && s[sp - 1].done_children && tch->effpos + tch->name_length >= nl && tch->inst && fileofs_tif_start) { n = ac_record(jtf, &ftsp->results_head, needle, pos, s, sp, tch->inst, tch->child_agg, tch->descendents, &pac); if (n < 0) goto bail; if (!n) if (--budget == 0) break; } if (!s[sp].done_children && children) { s[sp].done_children = 1; sp++; memset(&s[sp], 0, sizeof(s[sp])); s[sp].tifs = fileofs_tif_start; s[sp].self = child_ofs; for (n = 0; n < (int)children && s[sp].child_count < (int)LWS_ARRAY_SIZE(s[0].ch); n++) { uint32_t slen, cho, agg, inst; int i = s[sp].child_count; struct ch *ch = &s[sp].ch[i]; size_t max; bp += rq32(&buf[bp], &cho); bp += rq32(&buf[bp], &inst); bp += rq32(&buf[bp], &agg); bp += rq32(&buf[bp], &desc); bp += rq32(&buf[bp], &slen); max = slen; if (max > sizeof(ch->name) - 1) max = sizeof(ch->name) - 1; strncpy(ch->name, (char *)&buf[bp], max); bp += slen; ch->name_length = (int)max; ch->name[sizeof(ch->name) - 1] = '\0'; ch->inst = inst; ch->effpos = s[sp - 1].ch[s[sp - 1].child - 1].effpos; ch->child_agg = agg; ch->descendents = desc; /* * if we have more needle chars than we matched * to get this far, we can only allow potential * matches that are consistent with the * additional unmatched character(s)... */ m = nl - ch->effpos; if (m > ch->name_length) m = ch->name_length; if (m > 0 && strncmp(&needle[ch->effpos], ch->name, m)) continue; ch->effpos += m; s[sp].ch[s[sp].child_count++].ofs = cho; } } while (sp >= 0 && s[sp].child >= s[sp].child_count) { s[sp].done_children = 0; sp--; } /* * Compare parent remaining agg vs parent's next siblings' still * intact original agg... if the next sibling has more, abandon * the parent path and go with the sibling... this keeps the * autocomplete results related to popularity. */ nobump = 0; n = sp - 1; while (n >= 0) { struct lws_fts_result_autocomplete *ac = (struct lws_fts_result_autocomplete *)pac; if (s[n].child < s[n].child_count && s[n].ch[s[n].child - 1].child_agg < s[n].ch[s[n].child].child_agg) { if (pac) /* * mark the autocomplete result that * there were more children down his * path that we skipped in these results */ ac->elided = 1; for (m = n; m < sp + 1; m++) s[m].done_children = 0; sp = n; child_ofs = s[sp].ch[s[sp].child++].ofs; nobump = 1; } n--; } if (nobump || sp < 0) continue; child_ofs = s[sp].ch[s[sp].child++].ofs; } /* let's do a final sort into agg order */ do { struct lws_fts_result_autocomplete *ac1, *ac2; stasis = 1; /* bubble sort keeps going until nothing changed */ pac = &result->autocomplete_head; while (*pac) { ac1 = *pac; ac2 = ac1->next; if (ac2 && ac1->instances < ac2->instances) { stasis = 0; *pac = ac2; ac1->next = ac2->next; ac2->next = ac1; } pac = &(*pac)->next; } } while (!stasis); return result; bail: if (ofd >= 0) close(ofd); lwsl_info("%s: search ended up at bail\n", __func__); return result; }