You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
210 lines
6.2 KiB
210 lines
6.2 KiB
//
|
|
// Copyright (C) 2018 The Android Open Source Project
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
|
|
include "utils/grammar/semantics/expression.fbs";
|
|
include "utils/i18n/language-tag.fbs";
|
|
include "utils/zlib/buffer.fbs";
|
|
|
|
// The terminal rules map, stored as a sorted strings table.
// The sorted terminal strings are represented as offsets into the global
// strings pool, which saves memory across localized rule sets.
namespace libtextclassifier3.grammar.RulesSet_.Rules_;
table TerminalRulesMap {
  // The offsets into the terminals pool.
  terminal_offsets:[uint];

  // The lhs set associated with a terminal rule.
  // This is an offset into the (deduplicated) global `lhs_set` vector.
  // NOTE(review): presumably parallel to `terminal_offsets` (one entry per
  // terminal) -- confirm against the reader.
  lhs_set_index:[uint];

  // Bounds on the lengths of the terminal strings, used to abort a lookup
  // early.
  min_terminal_length:int;

  max_terminal_length:int;
}
|
|
|
|
// One key, value pair entry in the unary rules map (`Rules.unary_rules`).
// That map goes from a nonterminal to an lhs set index into the
// (deduplicated) global `lhs_set` vector.
namespace libtextclassifier3.grammar.RulesSet_.Rules_;
struct UnaryRulesEntry {
  // The nonterminal. Declared as the FlatBuffers `key` field of this entry.
  key:uint (key);

  // The lhs set index.
  value:uint;
}
|
|
|
|
// One key, value pair entry in the binary rules hash map.
// The key is a pair of nonterminals and the value is the index of the lhs
// set.
namespace libtextclassifier3.grammar.RulesSet_.Rules_;
struct BinaryRule {
  // The two rhs nonterminals.
  rhs_first:uint;

  rhs_second:uint;

  // The lhs set associated with this binary rule.
  // This is an offset into the (deduplicated) global `lhs_set` vector.
  lhs_set_index:uint;
}
|
|
|
|
// One bucket in the binary rule hash map. It contains all entries for a
// given hash value.
namespace libtextclassifier3.grammar.RulesSet_.Rules_;
table BinaryRuleTableBucket {
  // The binary rules that fall into this bucket.
  rules:[BinaryRule];
}
|
|
|
|
// The terminal, unary and binary rules of one rule set, together with the
// locales the rule set applies to.
namespace libtextclassifier3.grammar.RulesSet_;
table Rules {
  // The locales this rule set applies to.
  locale:[LanguageTag];

  // The terminal rules maps.
  // NOTE(review): `lowercase_terminal_rules` presumably holds terminals that
  // are matched against lowercased input -- confirm against the matcher.
  terminal_rules:Rules_.TerminalRulesMap;
  lowercase_terminal_rules:Rules_.TerminalRulesMap;

  // The unary rules map.
  // This is a map from a nonterminal to an lhs set index into the
  // (deduplicated) global `lhs_set` vector.
  unary_rules:[Rules_.UnaryRulesEntry];

  // The binary rules (hash) map.
  // This is a map from a nonterminal pair to an lhs set index into the
  // (deduplicated) global `lhs_set` vector.
  binary_rules:[Rules_.BinaryRuleTableBucket];
}
|
|
|
|
// A set of lhs nonterminals associated with a rule match.
// Most commonly, an entry is just the id of the lhs nonterminal of the rule
// that is triggered; in this case the `lhs` entry is set to the id of the
// nonterminal.
// If a callback needs to be triggered, the `lhs` entry is the (negated) index
// into the `RulesSet.lhs` vector, whose element specifies, in addition to the
// nonterminal, the callback and the parameter to call it with.
namespace libtextclassifier3.grammar.RulesSet_;
table LhsSet {
  lhs:[int];
}
|
|
|
|
// An lhs nonterminal together with the callback to trigger for it and the
// parameter to pass to that callback.
namespace libtextclassifier3.grammar.RulesSet_;
struct Lhs {
  // The lhs nonterminal.
  nonterminal:uint;

  // The id of the callback to trigger.
  callback_id:uint;

  // A parameter to pass when invoking the callback.
  callback_param:ulong;

  // The maximum amount of whitespace allowed between the two parts.
  // A value of -1 allows for unbounded whitespace.
  // NOTE(review): "the two parts" presumably means the two rhs components of
  // a binary rule -- confirm.
  max_whitespace_gap:byte;
}
|
|
|
|
// One key, value pair entry in the `Nonterminals.annotation_nt` map, which
// maps annotation/collection names to nonterminal ids.
namespace libtextclassifier3.grammar.RulesSet_.Nonterminals_;
table AnnotationNtEntry {
  // The annotation/collection name. Declared as the FlatBuffers `key` field
  // of this entry.
  key:string (key, shared);

  // The nonterminal id.
  value:int;
}
|
|
|
|
// Usage of pre-defined nonterminals that the lexer can generate if they are
// used by the grammar.
namespace libtextclassifier3.grammar.RulesSet_;
table Nonterminals {
  // Id of the nonterminal indicating the start of input.
  start_nt:int;

  // Id of the nonterminal indicating the end of input.
  end_nt:int;

  // Id of the nonterminal indicating a token.
  token_nt:int;

  // Id of the nonterminal indicating a string of digits.
  digits_nt:int;

  // `n_digits_nt[k]` is the id of the nonterminal indicating a string of
  // `k` digits.
  n_digits_nt:[int];

  // Id of the nonterminal indicating a word or token boundary.
  wordbreak_nt:int;

  // Id of the nonterminal indicating an uppercase token.
  uppercase_token_nt:int;

  // Predefined nonterminals for annotations.
  // Maps annotation/collection names to nonterminal ids.
  annotation_nt:[Nonterminals_.AnnotationNtEntry];
}
|
|
|
|
// One key, value pair entry in `DebugInformation.nonterminal_names`.
// NOTE(review): presumably the key is a nonterminal id and the value its
// human-readable name -- confirm against the code that writes this table.
namespace libtextclassifier3.grammar.RulesSet_.DebugInformation_;
table NonterminalNamesEntry {
  // Declared as the FlatBuffers `key` field of this entry.
  key:int (key);

  value:string (shared);
}
|
|
|
|
// Debug information, e.g. for printing parse trees and showing match
// information.
namespace libtextclassifier3.grammar.RulesSet_;
table DebugInformation {
  nonterminal_names:[DebugInformation_.NonterminalNamesEntry];
}
|
|
|
|
// A regex annotator: a pattern to run and the nonterminal to trigger for it.
namespace libtextclassifier3.grammar.RulesSet_;
table RegexAnnotator {
  // The pattern to run.
  pattern:string (shared);

  // NOTE(review): presumably a compressed form of `pattern` (the
  // `CompressedBuffer` type is not declared in this file) -- confirm.
  compressed_pattern:CompressedBuffer;

  // The nonterminal to trigger.
  nonterminal:uint;
}
|
|
|
|
// Context free grammar rules representation.
// Rules are represented in (mostly) Chomsky Normal Form, where all rules are
// of one of the following forms:
// * <nonterm> ::= term
// * <nonterm> ::= <nonterm>
// * <nonterm> ::= <nonterm> <nonterm>
// The `terminal_rules`, `unary_rules` and `binary_rules` maps of each
// `RulesSet_.Rules` entry in `rules` represent these sets of rules.
namespace libtextclassifier3.grammar;
table RulesSet {
  rules:[RulesSet_.Rules];

  // The (deduplicated) global vector of lhs sets that the `lhs_set_index`
  // fields and the unary rules map values index into.
  lhs_set:[RulesSet_.LhsSet];

  // The vector that negated `LhsSet.lhs` entries index into.
  lhs:[RulesSet_.Lhs];

  // Terminals string pool.
  // The strings are zero-byte delimited and offset indexed by
  // `terminal_offsets` in the terminals rules map.
  terminals:string (shared);

  nonterminals:RulesSet_.Nonterminals;

  // Deprecated placeholder field; it must stay in this position so that the
  // fields declared after it keep their slots.
  reserved_6:int16 (deprecated);

  debug_information:RulesSet_.DebugInformation;
  regex_annotator:[RulesSet_.RegexAnnotator];

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool;

  // The semantic expressions associated with rule matches.
  semantic_expression:[SemanticExpression];

  // The schema defining the semantic results.
  semantic_values_schema:[ubyte];
}
|
|
|