/* * Copyright (C) 2018 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ // A lexer that (splits) and classifies tokens. // // Any whitespace gets absorbed into the token that follows them in the text. // For example, if the text contains: // // ...hello there world... // | | | // offset=16 39 52 // // then the output will be: // // "hello" [?, 16) // "there" [16, 44) <-- note "16" NOT "39" // "world" [44, ?) <-- note "44" NOT "52" // // This makes it appear to the Matcher as if the tokens are adjacent. #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_LEXER_H_ #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_LEXER_H_ #include #include "annotator/types.h" #include "utils/grammar/parsing/parse-tree.h" #include "utils/grammar/types.h" #include "utils/strings/stringpiece.h" #include "utils/utf8/unicodetext.h" #include "utils/utf8/unilib.h" namespace libtextclassifier3::grammar { // A lexical symbol with an identified meaning that represents raw tokens, // token categories or predefined text matches. // It is the unit fed to the grammar matcher. struct Symbol { // The type of the lexical symbol. enum class Type { // A raw token. TYPE_TERM, // A symbol representing a string of digits. TYPE_DIGITS, // Punctuation characters. TYPE_PUNCTUATION, // A predefined parse tree. TYPE_PARSE_TREE }; explicit Symbol() = default; // Constructs a symbol of a given type with an anchor in the text. Symbol(const Type type, const CodepointSpan codepoint_span, const int match_offset, StringPiece lexeme) : type(type), codepoint_span(codepoint_span), match_offset(match_offset), lexeme(lexeme) {} // Constructs a symbol from a pre-defined parse tree. explicit Symbol(ParseTree* parse_tree) : type(Type::TYPE_PARSE_TREE), codepoint_span(parse_tree->codepoint_span), match_offset(parse_tree->match_offset), parse_tree(parse_tree) {} // The type of the symbol. Type type; // The span in the text as codepoint offsets. CodepointSpan codepoint_span; // The match start offset (including preceding whitespace) as codepoint // offset. int match_offset; // The symbol text value. StringPiece lexeme; // The predefined parse tree. ParseTree* parse_tree; }; class Lexer { public: explicit Lexer(const UniLib* unilib) : unilib_(*unilib) {} // Processes a single token. // Splits a token into classified symbols. void AppendTokenSymbols(const StringPiece value, int match_offset, const CodepointSpan codepoint_span, std::vector* symbols) const; private: // Gets the type of a character. Symbol::Type GetSymbolType(const UnicodeText::const_iterator& it) const; const UniLib& unilib_; }; } // namespace libtextclassifier3::grammar #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_LEXER_H_