You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
358 lines
14 KiB
358 lines
14 KiB
//===--- Dex.cpp - Dex Symbol Index Implementation --------------*- C++ -*-===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "Dex.h"
|
|
#include "FileDistance.h"
|
|
#include "FuzzyMatch.h"
|
|
#include "Quality.h"
|
|
#include "index/Index.h"
|
|
#include "index/dex/Iterator.h"
|
|
#include "support/Logger.h"
|
|
#include "support/Trace.h"
|
|
#include "llvm/ADT/StringSet.h"
|
|
#include "llvm/Support/ScopedPrinter.h"
|
|
#include <algorithm>
|
|
#include <queue>
|
|
|
|
namespace clang {
|
|
namespace clangd {
|
|
namespace dex {
|
|
|
|
std::unique_ptr<SymbolIndex> Dex::build(SymbolSlab Symbols, RefSlab Refs,
|
|
RelationSlab Rels) {
|
|
auto Size = Symbols.bytes() + Refs.bytes();
|
|
// There is no need to include "Rels" in Data because the relations are self-
|
|
// contained, without references into a backing store.
|
|
auto Data = std::make_pair(std::move(Symbols), std::move(Refs));
|
|
return std::make_unique<Dex>(Data.first, Data.second, Rels, std::move(Data),
|
|
Size);
|
|
}
|
|
|
|
namespace {
|
|
|
|
// Mark symbols which are can be used for code completion.
|
|
const Token RestrictedForCodeCompletion =
|
|
Token(Token::Kind::Sentinel, "Restricted For Code Completion");
|
|
|
|
// Helper to efficiently assemble the inverse index (token -> matching docs).
|
|
// The output is a nice uniform structure keyed on Token, but constructing
|
|
// the Token object every time we want to insert into the map is wasteful.
|
|
// Instead we have various maps keyed on things that are cheap to compute,
|
|
// and produce the Token keys once at the end.
|
|
class IndexBuilder {
|
|
llvm::DenseMap<Trigram, std::vector<DocID>> TrigramDocs;
|
|
std::vector<DocID> RestrictedCCDocs;
|
|
llvm::StringMap<std::vector<DocID>> TypeDocs;
|
|
llvm::StringMap<std::vector<DocID>> ScopeDocs;
|
|
llvm::StringMap<std::vector<DocID>> ProximityDocs;
|
|
std::vector<Trigram> TrigramScratch;
|
|
|
|
public:
|
|
// Add the tokens which are given symbol's characteristics.
|
|
// This includes fuzzy matching trigrams, symbol's scope, etc.
|
|
// FIXME(kbobyrev): Support more token types:
|
|
// * Namespace proximity
|
|
void add(const Symbol &Sym, DocID D) {
|
|
generateIdentifierTrigrams(Sym.Name, TrigramScratch);
|
|
for (Trigram T : TrigramScratch)
|
|
TrigramDocs[T].push_back(D);
|
|
ScopeDocs[Sym.Scope].push_back(D);
|
|
if (!llvm::StringRef(Sym.CanonicalDeclaration.FileURI).empty())
|
|
for (const auto &ProximityURI :
|
|
generateProximityURIs(Sym.CanonicalDeclaration.FileURI))
|
|
ProximityDocs[ProximityURI].push_back(D);
|
|
if (Sym.Flags & Symbol::IndexedForCodeCompletion)
|
|
RestrictedCCDocs.push_back(D);
|
|
if (!Sym.Type.empty())
|
|
TypeDocs[Sym.Type].push_back(D);
|
|
}
|
|
|
|
// Assemble the final compressed posting lists for the added symbols.
|
|
llvm::DenseMap<Token, PostingList> build() {
|
|
llvm::DenseMap<Token, PostingList> Result(/*InitialReserve=*/
|
|
TrigramDocs.size() +
|
|
RestrictedCCDocs.size() +
|
|
TypeDocs.size() +
|
|
ScopeDocs.size() +
|
|
ProximityDocs.size());
|
|
for (const auto &E : TrigramDocs)
|
|
Result.try_emplace(Token(Token::Kind::Trigram, E.first.str()), E.second);
|
|
for (const auto &E : TypeDocs)
|
|
Result.try_emplace(Token(Token::Kind::Type, E.first()), E.second);
|
|
for (const auto &E : ScopeDocs)
|
|
Result.try_emplace(Token(Token::Kind::Scope, E.first()), E.second);
|
|
for (const auto &E : ProximityDocs)
|
|
Result.try_emplace(Token(Token::Kind::ProximityURI, E.first()), E.second);
|
|
if (!RestrictedCCDocs.empty())
|
|
Result.try_emplace(RestrictedForCodeCompletion, RestrictedCCDocs);
|
|
return Result;
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
void Dex::buildIndex() {
|
|
this->Corpus = dex::Corpus(Symbols.size());
|
|
std::vector<std::pair<float, const Symbol *>> ScoredSymbols(Symbols.size());
|
|
|
|
for (size_t I = 0; I < Symbols.size(); ++I) {
|
|
const Symbol *Sym = Symbols[I];
|
|
LookupTable[Sym->ID] = Sym;
|
|
ScoredSymbols[I] = {quality(*Sym), Sym};
|
|
}
|
|
|
|
// Symbols are sorted by symbol qualities so that items in the posting lists
|
|
// are stored in the descending order of symbol quality.
|
|
llvm::sort(ScoredSymbols, std::greater<std::pair<float, const Symbol *>>());
|
|
|
|
// SymbolQuality was empty up until now.
|
|
SymbolQuality.resize(Symbols.size());
|
|
// Populate internal storage using Symbol + Score pairs.
|
|
for (size_t I = 0; I < ScoredSymbols.size(); ++I) {
|
|
SymbolQuality[I] = ScoredSymbols[I].first;
|
|
Symbols[I] = ScoredSymbols[I].second;
|
|
}
|
|
|
|
// Build posting lists for symbols.
|
|
IndexBuilder Builder;
|
|
for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank)
|
|
Builder.add(*Symbols[SymbolRank], SymbolRank);
|
|
InvertedIndex = Builder.build();
|
|
}
|
|
|
|
std::unique_ptr<Iterator> Dex::iterator(const Token &Tok) const {
|
|
auto It = InvertedIndex.find(Tok);
|
|
return It == InvertedIndex.end() ? Corpus.none()
|
|
: It->second.iterator(&It->first);
|
|
}
|
|
|
|
// Constructs BOOST iterators for Path Proximities.
|
|
std::unique_ptr<Iterator> Dex::createFileProximityIterator(
|
|
llvm::ArrayRef<std::string> ProximityPaths) const {
|
|
std::vector<std::unique_ptr<Iterator>> BoostingIterators;
|
|
// Deduplicate parent URIs extracted from the ProximityPaths.
|
|
llvm::StringSet<> ParentURIs;
|
|
llvm::StringMap<SourceParams> Sources;
|
|
for (const auto &Path : ProximityPaths) {
|
|
Sources[Path] = SourceParams();
|
|
auto PathURI = URI::create(Path);
|
|
const auto PathProximityURIs = generateProximityURIs(PathURI.toString());
|
|
for (const auto &ProximityURI : PathProximityURIs)
|
|
ParentURIs.insert(ProximityURI);
|
|
}
|
|
// Use SymbolRelevanceSignals for symbol relevance evaluation: use defaults
|
|
// for all parameters except for Proximity Path distance signal.
|
|
SymbolRelevanceSignals PathProximitySignals;
|
|
// DistanceCalculator will find the shortest distance from ProximityPaths to
|
|
// any URI extracted from the ProximityPaths.
|
|
URIDistance DistanceCalculator(Sources);
|
|
PathProximitySignals.FileProximityMatch = &DistanceCalculator;
|
|
// Try to build BOOST iterator for each Proximity Path provided by
|
|
// ProximityPaths. Boosting factor should depend on the distance to the
|
|
// Proximity Path: the closer processed path is, the higher boosting factor.
|
|
for (const auto &ParentURI : ParentURIs.keys()) {
|
|
// FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator.
|
|
auto It = iterator(Token(Token::Kind::ProximityURI, ParentURI));
|
|
if (It->kind() != Iterator::Kind::False) {
|
|
PathProximitySignals.SymbolURI = ParentURI;
|
|
BoostingIterators.push_back(Corpus.boost(
|
|
std::move(It), PathProximitySignals.evaluateHeuristics()));
|
|
}
|
|
}
|
|
BoostingIterators.push_back(Corpus.all());
|
|
return Corpus.unionOf(std::move(BoostingIterators));
|
|
}
|
|
|
|
// Constructs BOOST iterators for preferred types.
|
|
std::unique_ptr<Iterator>
|
|
Dex::createTypeBoostingIterator(llvm::ArrayRef<std::string> Types) const {
|
|
std::vector<std::unique_ptr<Iterator>> BoostingIterators;
|
|
SymbolRelevanceSignals PreferredTypeSignals;
|
|
PreferredTypeSignals.TypeMatchesPreferred = true;
|
|
auto Boost = PreferredTypeSignals.evaluateHeuristics();
|
|
for (const auto &T : Types)
|
|
BoostingIterators.push_back(
|
|
Corpus.boost(iterator(Token(Token::Kind::Type, T)), Boost));
|
|
BoostingIterators.push_back(Corpus.all());
|
|
return Corpus.unionOf(std::move(BoostingIterators));
|
|
}
|
|
|
|
/// Constructs iterators over tokens extracted from the query and exhausts it
|
|
/// while applying Callback to each symbol in the order of decreasing quality
|
|
/// of the matched symbols.
|
|
bool Dex::fuzzyFind(const FuzzyFindRequest &Req,
|
|
llvm::function_ref<void(const Symbol &)> Callback) const {
|
|
assert(!StringRef(Req.Query).contains("::") &&
|
|
"There must be no :: in query.");
|
|
trace::Span Tracer("Dex fuzzyFind");
|
|
FuzzyMatcher Filter(Req.Query);
|
|
// For short queries we use specialized trigrams that don't yield all results.
|
|
// Prevent clients from postfiltering them for longer queries.
|
|
bool More = !Req.Query.empty() && Req.Query.size() < 3;
|
|
|
|
std::vector<std::unique_ptr<Iterator>> Criteria;
|
|
const auto TrigramTokens = generateQueryTrigrams(Req.Query);
|
|
|
|
// Generate query trigrams and construct AND iterator over all query
|
|
// trigrams.
|
|
std::vector<std::unique_ptr<Iterator>> TrigramIterators;
|
|
for (const auto &Trigram : TrigramTokens)
|
|
TrigramIterators.push_back(iterator(Trigram));
|
|
Criteria.push_back(Corpus.intersect(move(TrigramIterators)));
|
|
|
|
// Generate scope tokens for search query.
|
|
std::vector<std::unique_ptr<Iterator>> ScopeIterators;
|
|
for (const auto &Scope : Req.Scopes)
|
|
ScopeIterators.push_back(iterator(Token(Token::Kind::Scope, Scope)));
|
|
if (Req.AnyScope)
|
|
ScopeIterators.push_back(
|
|
Corpus.boost(Corpus.all(), ScopeIterators.empty() ? 1.0 : 0.2));
|
|
Criteria.push_back(Corpus.unionOf(move(ScopeIterators)));
|
|
|
|
// Add proximity paths boosting (all symbols, some boosted).
|
|
Criteria.push_back(createFileProximityIterator(Req.ProximityPaths));
|
|
// Add boosting for preferred types.
|
|
Criteria.push_back(createTypeBoostingIterator(Req.PreferredTypes));
|
|
|
|
if (Req.RestrictForCodeCompletion)
|
|
Criteria.push_back(iterator(RestrictedForCodeCompletion));
|
|
|
|
// Use TRUE iterator if both trigrams and scopes from the query are not
|
|
// present in the symbol index.
|
|
auto Root = Corpus.intersect(move(Criteria));
|
|
// Retrieve more items than it was requested: some of the items with high
|
|
// final score might not be retrieved otherwise.
|
|
// FIXME(kbobyrev): Tune this ratio.
|
|
if (Req.Limit)
|
|
Root = Corpus.limit(move(Root), *Req.Limit * 100);
|
|
SPAN_ATTACH(Tracer, "query", llvm::to_string(*Root));
|
|
vlog("Dex query tree: {0}", *Root);
|
|
|
|
using IDAndScore = std::pair<DocID, float>;
|
|
std::vector<IDAndScore> IDAndScores = consume(*Root);
|
|
|
|
auto Compare = [](const IDAndScore &LHS, const IDAndScore &RHS) {
|
|
return LHS.second > RHS.second;
|
|
};
|
|
TopN<IDAndScore, decltype(Compare)> Top(
|
|
Req.Limit ? *Req.Limit : std::numeric_limits<size_t>::max(), Compare);
|
|
for (const auto &IDAndScore : IDAndScores) {
|
|
const DocID SymbolDocID = IDAndScore.first;
|
|
const auto *Sym = Symbols[SymbolDocID];
|
|
const llvm::Optional<float> Score = Filter.match(Sym->Name);
|
|
if (!Score)
|
|
continue;
|
|
// Combine Fuzzy Matching score, precomputed symbol quality and boosting
|
|
// score for a cumulative final symbol score.
|
|
const float FinalScore =
|
|
(*Score) * SymbolQuality[SymbolDocID] * IDAndScore.second;
|
|
// If Top.push(...) returns true, it means that it had to pop an item. In
|
|
// this case, it is possible to retrieve more symbols.
|
|
if (Top.push({SymbolDocID, FinalScore}))
|
|
More = true;
|
|
}
|
|
|
|
// Apply callback to the top Req.Limit items in the descending
|
|
// order of cumulative score.
|
|
for (const auto &Item : std::move(Top).items())
|
|
Callback(*Symbols[Item.first]);
|
|
return More;
|
|
}
|
|
|
|
void Dex::lookup(const LookupRequest &Req,
|
|
llvm::function_ref<void(const Symbol &)> Callback) const {
|
|
trace::Span Tracer("Dex lookup");
|
|
for (const auto &ID : Req.IDs) {
|
|
auto I = LookupTable.find(ID);
|
|
if (I != LookupTable.end())
|
|
Callback(*I->second);
|
|
}
|
|
}
|
|
|
|
bool Dex::refs(const RefsRequest &Req,
|
|
llvm::function_ref<void(const Ref &)> Callback) const {
|
|
trace::Span Tracer("Dex refs");
|
|
uint32_t Remaining =
|
|
Req.Limit.getValueOr(std::numeric_limits<uint32_t>::max());
|
|
for (const auto &ID : Req.IDs)
|
|
for (const auto &Ref : Refs.lookup(ID)) {
|
|
if (!static_cast<int>(Req.Filter & Ref.Kind))
|
|
continue;
|
|
if (Remaining == 0)
|
|
return true; // More refs were available.
|
|
--Remaining;
|
|
Callback(Ref);
|
|
}
|
|
return false; // We reported all refs.
|
|
}
|
|
|
|
void Dex::relations(
|
|
const RelationsRequest &Req,
|
|
llvm::function_ref<void(const SymbolID &, const Symbol &)> Callback) const {
|
|
trace::Span Tracer("Dex relations");
|
|
uint32_t Remaining =
|
|
Req.Limit.getValueOr(std::numeric_limits<uint32_t>::max());
|
|
for (const SymbolID &Subject : Req.Subjects) {
|
|
LookupRequest LookupReq;
|
|
auto It = Relations.find(
|
|
std::make_pair(Subject, static_cast<uint8_t>(Req.Predicate)));
|
|
if (It != Relations.end()) {
|
|
for (const auto &Object : It->second) {
|
|
if (Remaining > 0) {
|
|
--Remaining;
|
|
LookupReq.IDs.insert(Object);
|
|
}
|
|
}
|
|
}
|
|
lookup(LookupReq, [&](const Symbol &Object) { Callback(Subject, Object); });
|
|
}
|
|
}
|
|
|
|
size_t Dex::estimateMemoryUsage() const {
|
|
size_t Bytes = Symbols.size() * sizeof(const Symbol *);
|
|
Bytes += SymbolQuality.size() * sizeof(float);
|
|
Bytes += LookupTable.getMemorySize();
|
|
Bytes += InvertedIndex.getMemorySize();
|
|
for (const auto &TokenToPostingList : InvertedIndex)
|
|
Bytes += TokenToPostingList.second.bytes();
|
|
Bytes += Refs.getMemorySize();
|
|
Bytes += Relations.getMemorySize();
|
|
return Bytes + BackingDataSize;
|
|
}
|
|
|
|
std::vector<std::string> generateProximityURIs(llvm::StringRef URIPath) {
|
|
std::vector<std::string> Result;
|
|
auto ParsedURI = URI::parse(URIPath);
|
|
assert(ParsedURI &&
|
|
"Non-empty argument of generateProximityURIs() should be a valid "
|
|
"URI.");
|
|
llvm::StringRef Body = ParsedURI->body();
|
|
// FIXME(kbobyrev): Currently, this is a heuristic which defines the maximum
|
|
// size of resulting vector. Some projects might want to have higher limit if
|
|
// the file hierarchy is deeper. For the generic case, it would be useful to
|
|
// calculate Limit in the index build stage by calculating the maximum depth
|
|
// of the project source tree at runtime.
|
|
size_t Limit = 5;
|
|
// Insert original URI before the loop: this would save a redundant iteration
|
|
// with a URI parse.
|
|
Result.emplace_back(ParsedURI->toString());
|
|
while (!Body.empty() && --Limit > 0) {
|
|
// FIXME(kbobyrev): Parsing and encoding path to URIs is not necessary and
|
|
// could be optimized.
|
|
Body = llvm::sys::path::parent_path(Body, llvm::sys::path::Style::posix);
|
|
if (!Body.empty())
|
|
Result.emplace_back(
|
|
URI(ParsedURI->scheme(), ParsedURI->authority(), Body).toString());
|
|
}
|
|
return Result;
|
|
}
|
|
|
|
} // namespace dex
|
|
} // namespace clangd
|
|
} // namespace clang
|