You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1049 lines
33 KiB
1049 lines
33 KiB
4 months ago
|
//
// Copyright (C) 2018 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
|
||
|
include "annotator/entity-data.fbs";
include "annotator/experimental/experimental.fbs";
include "utils/codepoint-range.fbs";
include "utils/container/bit-vector.fbs";
include "utils/flatbuffers/flatbuffers.fbs";
include "utils/grammar/rules.fbs";
include "utils/intents/intent-config.fbs";
include "utils/normalization.fbs";
include "utils/resources.fbs";
include "utils/tokenizer.fbs";
include "utils/zlib/buffer.fbs";

file_identifier "TC2 ";
|
||
|
|
||
|
// The possible model modes, represents a bit field.
namespace libtextclassifier3;
enum ModeFlag : int {
  NONE = 0,
  ANNOTATION = 1,
  CLASSIFICATION = 2,
  ANNOTATION_AND_CLASSIFICATION = 3,
  SELECTION = 4,
  ANNOTATION_AND_SELECTION = 5,
  CLASSIFICATION_AND_SELECTION = 6,
  ALL = 7,
}
|
||
|
|
||
|
// Enum for specifying the annotation usecase.
namespace libtextclassifier3;
enum AnnotationUsecase : int {
  // Results are optimized for Smart{Select,Share,Linkify}.
  ANNOTATION_USECASE_SMART = 0,
  // Smart{Select,Share,Linkify}

  // Results are optimized for using TextClassifier as an infrastructure that
  // annotates as much as possible.
  ANNOTATION_USECASE_RAW = 1,
}
|
||
|
|
||
|
namespace libtextclassifier3;
enum DatetimeExtractorType : int {
  UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
  AM = 1,
  PM = 2,
  JANUARY = 3,
  FEBRUARY = 4,
  MARCH = 5,
  APRIL = 6,
  MAY = 7,
  JUNE = 8,
  JULY = 9,
  AUGUST = 10,
  SEPTEMBER = 11,
  OCTOBER = 12,
  NOVEMBER = 13,
  DECEMBER = 14,
  NEXT = 15,
  NEXT_OR_SAME = 16,
  LAST = 17,
  NOW = 18,
  TOMORROW = 19,
  YESTERDAY = 20,
  PAST = 21,
  FUTURE = 22,
  DAY = 23,
  WEEK = 24,
  MONTH = 25,
  YEAR = 26,
  MONDAY = 27,
  TUESDAY = 28,
  WEDNESDAY = 29,
  THURSDAY = 30,
  FRIDAY = 31,
  SATURDAY = 32,
  SUNDAY = 33,
  DAYS = 34,
  WEEKS = 35,
  MONTHS = 36,

  // TODO(zilka): Make the following 3 values singular for consistency.
  HOURS = 37,

  MINUTES = 38,
  SECONDS = 39,
  YEARS = 40,
  DIGITS = 41,
  SIGNEDDIGITS = 42,
  ZERO = 43,
  ONE = 44,
  TWO = 45,
  THREE = 46,
  FOUR = 47,
  FIVE = 48,
  SIX = 49,
  SEVEN = 50,
  EIGHT = 51,
  NINE = 52,
  TEN = 53,
  ELEVEN = 54,
  TWELVE = 55,
  THIRTEEN = 56,
  FOURTEEN = 57,
  FIFTEEN = 58,
  SIXTEEN = 59,
  SEVENTEEN = 60,
  EIGHTEEN = 61,
  NINETEEN = 62,
  TWENTY = 63,
  THIRTY = 64,
  FORTY = 65,
  FIFTY = 66,
  SIXTY = 67,
  SEVENTY = 68,
  EIGHTY = 69,
  NINETY = 70,
  HUNDRED = 71,
  THOUSAND = 72,
  NOON = 73,
  MIDNIGHT = 74,
}
|
||
|
|
||
|
namespace libtextclassifier3;
enum DatetimeGroupType : int {
  GROUP_UNKNOWN = 0,
  GROUP_UNUSED = 1,
  GROUP_YEAR = 2,
  GROUP_MONTH = 3,
  GROUP_DAY = 4,
  GROUP_HOUR = 5,
  GROUP_MINUTE = 6,
  GROUP_SECOND = 7,
  GROUP_AMPM = 8,
  GROUP_RELATIONDISTANCE = 9,
  GROUP_RELATION = 10,
  GROUP_RELATIONTYPE = 11,

  // Dummy groups serve just as an inflator of the selection. E.g. we might want
  // to select more text than was contained in an envelope of all extractor
  // spans.
  GROUP_DUMMY1 = 12,

  GROUP_DUMMY2 = 13,
  GROUP_ABSOLUTETIME = 14,
}
|
||
|
|
||
|
// Options for the model that predicts text selection.
namespace libtextclassifier3;
table SelectionModelOptions {
  // If true, before the selection is returned, the unpaired brackets contained
  // in the predicted selection are stripped from the both selection ends.
  // The bracket codepoints are defined in the Unicode standard:
  // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
  strip_unpaired_brackets:bool = true;

  // Number of hypothetical click positions on either side of the actual click
  // to consider in order to enforce symmetry.
  symmetry_context_size:int;

  // Number of examples to bundle in one batch for inference.
  batch_size:int = 1024;

  // Whether to always classify a suggested selection or only on demand.
  always_classify_suggested_selection:bool = false;
}
|
||
|
|
||
|
// Options for the model that classifies a text selection.
namespace libtextclassifier3;
table ClassificationModelOptions {
  // Limits for phone numbers.
  phone_min_num_digits:int = 7;

  phone_max_num_digits:int = 15;

  // Limits for addresses.
  address_min_num_tokens:int;

  // Maximum number of tokens to attempt a classification (-1 is unlimited).
  max_num_tokens:int = -1;
}
|
||
|
|
||
|
// Options for post-checks, checksums and verification to apply on a match.
namespace libtextclassifier3;
table VerificationOptions {
  verify_luhn_checksum:bool = false;

  // Lua verifier to use.
  // Index of the lua verifier in the model.
  lua_verifier:int = -1;
}
|
||
|
|
||
|
// Behaviour of rule capturing groups.
// This specifies how the text and span of a capturing group, in a regular
// expression or from a capturing match in a grammar rule, should be handled.
namespace libtextclassifier3;
table CapturingGroup {
  // If true, the span of the capturing group will be used to
  // extend the selection.
  extend_selection:bool = true;

  // If set, the text of the capturing group will be used to set a field in
  // the classification result entity data.
  entity_field_path:FlatbufferFieldPath;

  // If set, the flatbuffer entity data will be merged with the
  // classification result entity data.
  serialized_entity_data:string (shared);

  // If set, normalization to apply before text is used in entity data.
  normalization_options:NormalizationOptions;

  entity_data:EntityData;
}
|
||
|
|
||
|
// List of regular expression matchers to check.
namespace libtextclassifier3.RegexModel_;
table Pattern {
  // The name of the collection of a match.
  collection_name:string (shared);

  // The pattern to check.
  pattern:string (shared);

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // If true, will use an approximate matching implementation implemented
  // using Find() instead of the true Match(). This approximate matching will
  // use the first Find() result and then check that it spans the whole input.
  use_approximate_matching:bool = false;

  compressed_pattern:CompressedBuffer;

  // Verification to apply on a match.
  verification_options:VerificationOptions;

  capturing_group:[CapturingGroup];

  // Entity data to set for a match.
  serialized_entity_data:string (shared);

  entity_data:EntityData;
}
|
||
|
|
||
|
namespace libtextclassifier3;
table RegexModel {
  patterns:[RegexModel_.Pattern];

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // Lua scripts for match verification.
  // The verifier can access:
  // * `context`: The context as a string.
  // * `match`: The groups of the regex match as an array, each group gives
  // * `begin`: span start
  // * `end`: span end
  // * `text`: the text
  // The verifier is expected to return a boolean, indicating whether the
  // verification succeeded or not.
  lua_verifier:[string];
}
|
||
|
|
||
|
// List of regex patterns.
namespace libtextclassifier3.DatetimeModelPattern_;
table Regex {
  pattern:string (shared);

  // The ith entry specifies the type of the ith capturing group.
  // This is used to decide how the matched content has to be parsed.
  groups:[DatetimeGroupType];

  compressed_pattern:CompressedBuffer;
}
|
||
|
|
||
|
namespace libtextclassifier3;
table DatetimeModelPattern {
  regexes:[DatetimeModelPattern_.Regex];

  // List of locale indices in DatetimeModel that represent the locales that
  // these patterns should be used for. If empty, can be used for all locales.
  locales:[int];

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to apply the patterns.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;
}
|
||
|
|
||
|
namespace libtextclassifier3;
table DatetimeModelExtractor {
  extractor:DatetimeExtractorType;
  pattern:string (shared);
  locales:[int];
  compressed_pattern:CompressedBuffer;
}
|
||
|
|
||
|
namespace libtextclassifier3;
table DatetimeModel {
  // List of BCP 47 locale strings representing all locales supported by the
  // model. The individual patterns refer back to them using an index.
  locales:[string];

  patterns:[DatetimeModelPattern];
  extractors:[DatetimeModelExtractor];

  // If true, will use the extractors for determining the match location as
  // opposed to using the location where the global pattern matched.
  use_extractors_for_locating:bool = true;

  // List of locale ids whose rules are always run, after the requested
  // ones.
  default_locales:[int];

  // If true, will generate the alternative interpretations for ambiguous
  // datetime expressions.
  generate_alternative_interpretations_when_ambiguous:bool = false;

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // If true, will give only future dates (when the day is not specified).
  prefer_future_for_unspecified_date:bool = false;
}
|
||
|
|
||
|
// Configuration for the tokenizer.
namespace libtextclassifier3;
table GrammarTokenizerOptions {
  tokenization_type:TokenizationType = ICU;

  // If true, white space tokens will be kept when using the icu tokenizer.
  icu_preserve_whitespace_tokens:bool = false;

  // Codepoint ranges that determine what role the different codepoints play
  // during tokenization. The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to identify
  // stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;
}
|
||
|
|
||
|
namespace libtextclassifier3.DatetimeModelLibrary_;
table Item {
  key:string (shared);
  value:DatetimeModel;
}
|
||
|
|
||
|
// A set of named DateTime models.
namespace libtextclassifier3;
table DatetimeModelLibrary {
  models:[DatetimeModelLibrary_.Item];
}
|
||
|
|
||
|
// Classification result to instantiate for a rule match.
namespace libtextclassifier3.GrammarModel_;
table RuleClassificationResult {
  // The name of the collection.
  collection_name:string (shared);

  // The score.
  target_classification_score:float = 1;

  // The priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // Behaviour of capturing matches.
  capturing_group:[CapturingGroup];

  // Entity data to set for a match.
  serialized_entity_data:string (shared);

  // Enabled modes.
  enabled_modes:ModeFlag = ALL;

  entity_data:EntityData;
}
|
||
|
|
||
|
// Configuration for grammar based annotators.
namespace libtextclassifier3;
table GrammarModel {
  // The grammar rules.
  rules:grammar.RulesSet;

  rule_classification_result:[GrammarModel_.RuleClassificationResult];

  // Number of tokens in the context to use for classification and text
  // selection suggestion.
  // A value -1 uses the full context.
  context_left_num_tokens:int;

  context_right_num_tokens:int;

  // Grammar specific tokenizer options.
  tokenizer_options:GrammarTokenizerOptions;

  // The score.
  target_classification_score:float = 1;

  // The priority score used for conflict resolution with the other models.
  priority_score:float = 1;
}
|
||
|
|
||
|
namespace libtextclassifier3.MoneyParsingOptions_;
table QuantitiesNameToExponentEntry {
  key:string (key, shared);
  value:int;
}
|
||
|
|
||
|
namespace libtextclassifier3;
table MoneyParsingOptions {
  // Separators (codepoints) marking decimal or thousand in the money amount.
  separators:[int];

  // Mapping between a quantity string (e.g. "million") and the power of 10
  // it multiplies the amount with (e.g. 6 in case of "million").
  // NOTE: The entries need to be sorted by key since we use LookupByKey.
  quantities_name_to_exponent:[MoneyParsingOptions_.QuantitiesNameToExponentEntry];
}
|
||
|
|
||
|
namespace libtextclassifier3.ModelTriggeringOptions_;
table CollectionToPriorityEntry {
  key:string (key, shared);
  value:float;
}
|
||
|
|
||
|
// Options controlling the output of the Tensorflow Lite models.
namespace libtextclassifier3;
table ModelTriggeringOptions {
  // Lower bound threshold for filtering annotation model outputs.
  min_annotate_confidence:float = 0;

  // The modes for which to enable the models.
  enabled_modes:ModeFlag = ALL;

  // Comma-separated list of locales (BCP 47 tags) that dictionary
  // classification supports.
  dictionary_locales:string (shared);

  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
  // are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  locales:string (shared);

  // Priority score assigned to the "other" class from ML model.
  other_collection_priority_score:float = -1000;

  // Priority score assigned to knowledge engine annotations.
  knowledge_priority_score:float = 0;
  reserved_7:int16 (deprecated);

  // Apply a factor to the priority score for entities that are added to this
  // map. Key: collection type e.g. "address", "phone"..., Value: float number.
  // NOTE: The entries here need to be sorted since we use LookupByKey.
  collection_to_priority:[ModelTriggeringOptions_.CollectionToPriorityEntry];
}
|
||
|
|
||
|
// Options controlling the output of the classifier.
namespace libtextclassifier3;
table OutputOptions {
  // Lists of collection names that will be filtered out at the output:
  // - For annotation, the spans of given collection are simply dropped.
  // - For classification, the result is mapped to the class "other".
  // - For selection, the spans of given class are returned as
  // single-selection.
  filtered_collections_annotation:[string];

  filtered_collections_classification:[string];
  filtered_collections_selection:[string];
}
|
||
|
|
||
|
namespace libtextclassifier3.Model_;
table EmbeddingPruningMask {
  // If true, use pruning mask. In this case, we use mask
  // pruning_mask to determine the mapping of hashed-charactergrams.
  enabled:bool;

  // Packing of the binary pruning mask into uint64 values.
  pruning_mask:[ulong] (force_align: 16);

  // Number of buckets before pruning.
  full_num_buckets:int;

  // Index of row of compressed embedding matrix to which all pruned buckets
  // are mapped.
  pruned_row_bucket_id:int;
}
|
||
|
|
||
|
namespace libtextclassifier3.Model_;
table ConflictResolutionOptions {
  // If true, will prioritize the longest annotation during conflict
  // resolution.
  prioritize_longest_annotation:bool = false;

  // If true, the annotator will perform conflict resolution between the
  // different sub-annotators also in the RAW mode. If false, no conflict
  // resolution will be performed in RAW mode.
  do_conflict_resolution_in_raw_mode:bool = true;
}
|
||
|
|
||
|
namespace libtextclassifier3;
table Model {
  // Comma-separated list of locales supported by the model as BCP 47 tags.
  locales:string (shared);

  version:int;

  // A name for the model that can be used for e.g. logging.
  name:string (shared);

  selection_feature_options:FeatureProcessorOptions;
  classification_feature_options:FeatureProcessorOptions;

  // Tensorflow Lite models.
  selection_model:[ubyte] (force_align: 16);

  classification_model:[ubyte] (force_align: 16);
  embedding_model:[ubyte] (force_align: 16);

  // Options for the different models.
  selection_options:SelectionModelOptions;

  classification_options:ClassificationModelOptions;
  regex_model:RegexModel;
  datetime_model:DatetimeModel;

  // Options controlling the output of the models.
  triggering_options:ModelTriggeringOptions;

  // Global switch that controls if SuggestSelection(), ClassifyText() and
  // Annotate() will run. If a mode is disabled it returns empty/no-op results.
  enabled_modes:ModeFlag = ALL;

  // If true, will snap the selections that consist only of whitespaces to the
  // containing suggested span. Otherwise, no suggestion is proposed, since the
  // selections are not part of any token.
  snap_whitespace_selections:bool = true;

  // Global configuration for the output of SuggestSelection(), ClassifyText()
  // and Annotate().
  output_options:OutputOptions;

  // Configures how Intents should be generated on Android.
  android_intent_options:AndroidIntentFactoryOptions;

  intent_options:IntentFactoryModel;

  // Model resources.
  resources:ResourcePool;

  // Schema data for handling entity data.
  entity_data_schema:[ubyte];

  number_annotator_options:NumberAnnotatorOptions;
  duration_annotator_options:DurationAnnotatorOptions;

  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
  // are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  triggering_locales:string (shared);

  embedding_pruning_mask:Model_.EmbeddingPruningMask;
  reserved_25:int16 (deprecated);
  contact_annotator_options:ContactAnnotatorOptions;
  money_parsing_options:MoneyParsingOptions;
  translate_annotator_options:TranslateAnnotatorOptions;
  grammar_model:GrammarModel;
  conflict_resolution_options:Model_.ConflictResolutionOptions;
  experimental_model:ExperimentalModel;
  pod_ner_model:PodNerModel;
  vocab_model:VocabModel;
  datetime_grammar_model:GrammarModel;
}
|
||
|
|
||
|
// Method for selecting the center token.
namespace libtextclassifier3.FeatureProcessorOptions_;
enum CenterTokenSelectionMethod : int {
  DEFAULT_CENTER_TOKEN_METHOD = 0,
  // Invalid option.

  // Use click indices to determine the center token.
  CENTER_TOKEN_FROM_CLICK = 1,

  // Use selection indices to get a token range, and select the middle of it
  // as the center token.
  CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
}
|
||
|
|
||
|
// Bounds-sensitive feature extraction configuration.
namespace libtextclassifier3.FeatureProcessorOptions_;
table BoundsSensitiveFeatures {
  // Enables the extraction of bounds-sensitive features, instead of the click
  // context features.
  enabled:bool;

  // The numbers of tokens to extract in specific locations relative to the
  // bounds.
  // Immediately before the span.
  num_tokens_before:int;

  // Inside the span, aligned with the beginning.
  num_tokens_inside_left:int;

  // Inside the span, aligned with the end.
  num_tokens_inside_right:int;

  // Immediately after the span.
  num_tokens_after:int;

  // If true, also extracts the tokens of the entire span and adds up their
  // features forming one "token" to include in the extracted features.
  include_inside_bag:bool;

  // If true, includes the selection length (in the number of tokens) as a
  // feature.
  include_inside_length:bool;

  // If true, for selection, single token spans are not run through the model
  // and their score is assumed to be zero.
  score_single_token_spans_as_zero:bool;
}
|
||
|
|
||
|
namespace libtextclassifier3;
table FeatureProcessorOptions {
  // Number of buckets used for hashing charactergrams.
  num_buckets:int = -1;

  // Size of the embedding.
  embedding_size:int = -1;

  // Number of bits for quantization for embeddings.
  embedding_quantization_bits:int = 8;

  // Context size defines the number of words to the left and to the right of
  // the selected word to be used as context. For example, if context size is
  // N, then we take N words to the left and N words to the right of the
  // selected word as its context.
  context_size:int = -1;

  // Maximum number of words of the context to select in total.
  max_selection_span:int = -1;

  // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
  // character trigrams etc.
  chargram_orders:[int];

  // Maximum length of a word, in codepoints.
  max_word_length:int = 20;

  // If true, will use the unicode-aware functionality for extracting features.
  unicode_aware_features:bool = false;

  // Whether to extract the token case feature.
  extract_case_feature:bool = false;

  // Whether to extract the selection mask feature.
  extract_selection_mask_feature:bool = false;

  // List of regexps to run over each token. For each regexp, if there is a
  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
  regexp_feature:[string];

  // Whether to remap all digits to a single number.
  remap_digits:bool = false;

  // Whether to lower-case each token before generating hashgrams.
  lowercase_tokens:bool;

  // If true, the selection classifier output will contain only the selections
  // that are feasible (e.g., those that are shorter than max_selection_span),
  // if false, the output will be a complete cross-product of possible
  // selections to the left and possible selections to the right, including the
  // infeasible ones.
  // NOTE: Exists mainly for compatibility with older models that were trained
  // with the non-reduced output space.
  selection_reduced_output_space:bool = true;

  // Collection names.
  collections:[string];

  // An index of collection in collections to be used if a collection name can't
  // be mapped to an id.
  default_collection:int = -1;

  // If true, will split the input by lines, and only use the line that contains
  // the clicked token.
  only_use_line_with_click:bool = false;

  // If true, will split tokens that contain the selection boundary, at the
  // position of the boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  split_tokens_on_selection_boundaries:bool = false;

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  center_token_selection_method:FeatureProcessorOptions_.CenterTokenSelectionMethod;

  // If true, span boundaries will be snapped to containing tokens and not
  // required to exactly match token boundaries.
  snap_label_span_boundaries_to_containing_tokens:bool;

  // A set of codepoint ranges supported by the model.
  supported_codepoint_ranges:[CodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to identify
  // stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // Minimum ratio of supported codepoints in the input context. If the ratio
  // is lower than this, the feature computation will fail.
  min_supported_codepoint_ratio:float = 0;

  // Used for versioning the format of features the model expects.
  // - feature_version == 0:
  // For each token the features consist of:
  // - chargram embeddings
  // - dense features
  // Chargram embeddings for tokens are concatenated first together,
  // and at the end, the dense features for the tokens are concatenated
  // to it. So the resulting feature vector has two regions.
  feature_version:int = 0;

  tokenization_type:TokenizationType = INTERNAL_TOKENIZER;
  icu_preserve_whitespace_tokens:bool = false;

  // List of codepoints that will be stripped from beginning and end of
  // predicted spans.
  ignored_span_boundary_codepoints:[int];

  bounds_sensitive_features:FeatureProcessorOptions_.BoundsSensitiveFeatures;

  // List of allowed charactergrams. The extracted charactergrams are filtered
  // using this list, and charactergrams that are not present are interpreted as
  // out-of-vocabulary.
  // If no allowed_chargrams are specified, all charactergrams are allowed.
  // The field is typed as bytes type to allow non-UTF8 chargrams.
  allowed_chargrams:[string];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;

  // If true, the pipe character '|' will be used as a newline character when
  // splitting lines.
  use_pipe_character_for_newline:bool = true;
}
|
||
|
|
||
|
namespace libtextclassifier3;
table NumberAnnotatorOptions {
  // If true, number and percentage annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated numbers and percentages in the annotator.
  score:float = 1;

  // Number priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable number and percentage annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce number annotations.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;

  // [Deprecated] A list of codepoints that can form a prefix of a valid number.
  allowed_prefix_codepoints:[int];

  // [Deprecated] A list of codepoints that can form a suffix of a valid number.
  allowed_suffix_codepoints:[int];

  // [Deprecated] List of codepoints that will be stripped from beginning of
  // predicted spans.
  ignored_prefix_span_boundary_codepoints:[int];

  // [Deprecated] List of codepoints that will be stripped from end of predicted
  // spans.
  ignored_suffix_span_boundary_codepoints:[int];

  // [Deprecated] If true, percent annotations will be produced.
  enable_percentage:bool = false;

  // Zero separated and ordered list of suffixes that mark a percent.
  percentage_pieces_string:string (shared);

  // [Deprecated] List of suffixes offsets in the percent_pieces_string string.
  percentage_pieces_offsets:[int];

  // Priority score for the percentage annotation.
  percentage_priority_score:float = 1;

  // Float number priority score used for conflict resolution with the other
  // models.
  float_number_priority_score:float = 0;

  // The maximum number of digits an annotated number can have. Requirement:
  // the value should be less or equal to 20.
  max_number_of_digits:int = 20;

  // The annotation usecases for which to produce percentage annotations.
  // This is a flag field for values of AnnotationUsecase.
  percentage_annotation_usecases:uint = 2;
}
|
||
|
|
||
|
// DurationAnnotator is so far tailored for English and Japanese only.
namespace libtextclassifier3;
table DurationAnnotatorOptions {
  // If true, duration annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated durations from the annotator.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable duration annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce duration annotations.
  enabled_annotation_usecases:uint = 4294967295;

  // Durations typically look like XX hours and XX minutes etc... The list of
  // strings below enumerate variants of "hours", "minutes", etc. in these
  // expressions. These are verbatim strings that are matched against tokens in
  // the input.
  week_expressions:[string];

  day_expressions:[string];
  hour_expressions:[string];
  minute_expressions:[string];
  second_expressions:[string];

  // List of expressions that doesn't break a duration expression (can become
  // a part of it) but has no semantic meaning.
  filler_expressions:[string];

  // List of expressions that mean half of a unit of duration (e.g. "half an
  // hour").
  half_expressions:[string];

  // Set of codepoints that can split the Annotator tokens to sub-tokens for
  // sub-token matching.
  sub_token_separator_codepoints:[int];

  // If this is true, unit must be associated with quantity. For example, a
  // phrase "minute" is not parsed as one minute duration if this is true.
  require_quantity:bool;

  // If this is true, dangling quantity is included in the annotation. For
  // example, "10 minutes 20" is interpreted as 10 minutes and 20 seconds.
  enable_dangling_quantity_interpretation:bool = true;
}
|
||
|
|
||
|
// Options for annotating contact names in text.
namespace libtextclassifier3;
table ContactAnnotatorOptions {
  // Whether to also match declined forms of contact names.
  // Supported for English genitives only so far.
  enable_declension:bool;

  // For each language there is a customized list of supported declensions.
  language:string (shared);
}

// Language-detection algorithm used by the translate annotator.
namespace libtextclassifier3.TranslateAnnotatorOptions_;
enum Algorithm : int {
  DEFAULT_ALGORITHM = 0,
  BACKOFF = 1,
}

// Backoff is the algorithm shipped with Android Q.
namespace libtextclassifier3.TranslateAnnotatorOptions_;
table BackoffOptions {
  // The minimum size of text to prefer for detection (in codepoints).
  min_text_size:int = 20;

  // For reducing the score when text is less than the preferred size.
  penalize_ratio:float = 1;

  // Original detection score to surrounding text detection score ratios.
  subject_text_score_ratio:float = 0.4;
}

// Options for the translate annotator, which detects the language of a text
// span and can suggest translation.
namespace libtextclassifier3;
table TranslateAnnotatorOptions {
  // If true, translate annotations will be produced.
  enabled:bool = false;

  // Score to assign to the classification results.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float;

  // Which language-detection algorithm to use.
  algorithm:TranslateAnnotatorOptions_.Algorithm;

  // Options for the BACKOFF algorithm.
  backoff_options:TranslateAnnotatorOptions_.BackoffOptions;
}

// A named entity collection recognized by the POD NER model.
namespace libtextclassifier3.PodNerModel_;
table Collection {
  // Collection's name (e.g., "location", "person").
  name:string (shared);

  // Priority scores used for conflict resolution with the other annotators
  // when the annotation is made over a single/multi token text.
  single_token_priority_score:float;

  multi_token_priority_score:float;
}

// BOISE tagging-scheme label types (Begin / O / Intermediate / Single / End)
// emitted by the POD NER model for each token.
namespace libtextclassifier3.PodNerModel_.Label_;
enum BoiseType : int {
  NONE = 0,
  BEGIN = 1,
  O = 2,
  // No label.
  INTERMEDIATE = 3,
  SINGLE = 4,
  END = 5,
}

// Mention type of a labeled entity.
// NOTE(review): NAM/NOM presumably follow the standard NER distinction of
// named vs. nominal mentions — confirm against the model's training setup.
namespace libtextclassifier3.PodNerModel_.Label_;
enum MentionType : int {
  UNDEFINED = 0,
  NAM = 1,
  NOM = 2,
}

// A single output label of the POD NER model: BOISE position, mention type,
// and the entity collection it belongs to.
namespace libtextclassifier3.PodNerModel_;
table Label {
  boise_type:Label_.BoiseType;
  mention_type:Label_.MentionType;
  collection_id:int;
  // collection_id points into the PodNerModel.collections array.
}

// Configuration and weights for the TFLite-based POD NER annotator.
namespace libtextclassifier3;
table PodNerModel {
  // Serialized TFLite model.
  tflite_model:[ubyte];

  // WordPiece vocabulary used to tokenize the input for the model.
  word_piece_vocab:[ubyte];

  // Whether to lowercase the input before wordpiece tokenization.
  lowercase_input:bool = true;

  // Index of mention_logits tensor in the output of the tflite model. Can
  // be found in the textproto output after model is converted to tflite.
  logits_index_in_output_tensor:int = 0;

  // Whether to append a period at the end of an input that doesn't already
  // end in punctuation.
  append_final_period:bool = false;

  // Priority score used for conflict resolution with the other models. Used
  // only if collections_array is empty.
  priority_score:float = 0;

  // Maximum number of wordpieces supported by the model.
  max_num_wordpieces:int = 128;

  // In case of long text (number of wordpieces greater than the max) we use
  // sliding window approach, this determines the number of overlapping
  // wordpieces between two consecutive windows. This overlap enables context
  // for each word NER annotates.
  sliding_window_num_wordpieces_overlap:int = 20;

  // Retired field; kept to preserve the field-id layout of the table.
  reserved_9:int16 (deprecated);

  // The possible labels the ner model can output. If empty the default labels
  // will be used.
  labels:[PodNerModel_.Label];

  // If the ratio of unknown wordpieces in the input text is greater than this
  // maximum, the text won't be annotated.
  max_ratio_unknown_wordpieces:float = 0.1;

  // Possible collections for labeled entities.
  collections:[PodNerModel_.Collection];

  // Minimum word-length and wordpieces-length required for the text to be
  // annotated.
  min_number_of_tokens:int = 1;

  min_number_of_wordpieces:int = 1;
}

// Model data for the vocabulary ("Define") annotator.
namespace libtextclassifier3;
table VocabModel {
  // A trie that stores a list of vocabs that triggers "Define". An id is
  // returned when looking up a vocab from the trie and the id can be used
  // to access more information about that vocab. The marisa trie library
  // requires 8-byte alignment because the first thing in a marisa trie is a
  // 64-bit integer.
  vocab_trie:[ubyte] (force_align: 8);

  // A bit vector that tells if the vocab should trigger "Define" for users of
  // beginner proficiency only. To look up the bit vector, use the id returned
  // by the trie.
  beginner_level:BitVectorData;

  // Vocabs that should not trigger "Define" if their leading character is in
  // upper case, keyed by the id returned by the trie.
  // NOTE(review): the field type is BitVectorData, so this is looked up like
  // beginner_level above rather than binary-searched as a sorted index list —
  // confirm against the lookup code.
  do_not_trigger_in_upper_case:BitVectorData;

  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
  // are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  triggering_locales:string (shared);

  // The final score to assign to the results of the vocab model.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;
}

// The root table of this schema's serialized buffers (file_identifier "TC2 ").
root_type libtextclassifier3.Model;