/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// A token based context-free grammar matcher.
//
// A parser passes tokens to the matcher: literal terminal strings and token
// types.
// The parser passes each token along with the [begin, end) position range
// in which it occurs.  So for an input string "Groundhog February 2, 2007", the
// parser would tell the matcher that:
//
//     "Groundhog" occurs at [0, 9)
//     "February" occurs at [9, 18)
//     <digits> occurs at [18, 20)
//     "," occurs at [20, 21)
//     <digits> occurs at [21, 26)
//
// Multiple overlapping symbols can be passed.
// The only constraint on symbol order is that they have to be passed in
// left-to-right order; strictly speaking, their "end" positions must be
// nondecreasing.  This constraint allows a more efficient matching algorithm.
// The "begin" positions can be in any order.
#ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_MATCHER_H_ #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_MATCHER_H_ #include #include #include #include "annotator/types.h" #include "utils/base/arena.h" #include "utils/grammar/parsing/chart.h" #include "utils/grammar/parsing/derivation.h" #include "utils/grammar/parsing/parse-tree.h" #include "utils/grammar/rules_generated.h" #include "utils/strings/stringpiece.h" #include "utils/utf8/unilib.h" namespace libtextclassifier3::grammar { class Matcher { public: explicit Matcher(const UniLib* unilib, const RulesSet* rules, const std::vector rules_shards, UnsafeArena* arena) : unilib_(*unilib), arena_(arena), last_end_(std::numeric_limits().lowest()), rules_(rules), rules_shards_(rules_shards), pending_items_(nullptr), pending_exclusion_items_(nullptr) { TC3_CHECK_NE(rules, nullptr); } explicit Matcher(const UniLib* unilib, const RulesSet* rules, UnsafeArena* arena) : Matcher(unilib, rules, {}, arena) { rules_shards_.reserve(rules->rules()->size()); rules_shards_.insert(rules_shards_.end(), rules->rules()->begin(), rules->rules()->end()); } // Finish the matching. void Finish(); // Tells the matcher that the given terminal was found occupying position // range [begin, end) in the input. // The matcher may invoke callback functions before returning, if this // terminal triggers any new matches for rules in the grammar. // Calls to AddTerminal() and AddParseTree() must be in left-to-right order, // that is, the sequence of `end` values must be non-decreasing. void AddTerminal(const CodepointSpan codepoint_span, const int match_offset, StringPiece terminal); void AddTerminal(const CodepointIndex begin, const CodepointIndex end, StringPiece terminal) { AddTerminal(CodepointSpan{begin, end}, begin, terminal); } // Adds predefined parse tree. void AddParseTree(ParseTree* parse_tree); const Chart<> chart() const { return chart_; } private: // Process matches from lhs set. 
void ExecuteLhsSet(const CodepointSpan codepoint_span, const int match_offset, const int whitespace_gap, const std::function& initializer_fn, const RulesSet_::LhsSet* lhs_set); // Queues a newly created match item. void QueueForProcessing(ParseTree* item); // Queues a match item for later post checking of the exclusion condition. // For exclusions we need to check that the `item->excluded_nonterminal` // doesn't match the same span. As we cannot know which matches have already // been added, we queue the item for later post checking - once all matches // up to `item->codepoint_span.second` have been added. void QueueForPostCheck(ExclusionNode* item); // Adds pending items to the chart, possibly generating new matches as a // result. void ProcessPendingSet(); // Checks all pending exclusion matches that their exclusion condition is // fulfilled. void ProcessPendingExclusionMatches(); UniLib unilib_; // Memory arena for match allocation. UnsafeArena* arena_; // The end position of the most recent match or terminal, for sanity // checking. int last_end_; // Rules. const RulesSet* rules_; // The active rule shards. std::vector rules_shards_; // The set of items pending to be added to the chart as a singly-linked list. ParseTree* pending_items_; // The set of items pending to be post-checked as a singly-linked list. ExclusionNode* pending_exclusion_items_; // The chart data structure: a hashtable containing all matches, indexed by // their end positions. Chart<> chart_; }; } // namespace libtextclassifier3::grammar #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_MATCHER_H_