//
// Copyright (C) 2018 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

include "annotator/entity-data.fbs";
include "annotator/experimental/experimental.fbs";
include "utils/codepoint-range.fbs";
include "utils/container/bit-vector.fbs";
include "utils/flatbuffers/flatbuffers.fbs";
include "utils/grammar/rules.fbs";
include "utils/intents/intent-config.fbs";
include "utils/normalization.fbs";
include "utils/resources.fbs";
include "utils/tokenizer.fbs";
include "utils/zlib/buffer.fbs";

file_identifier "TC2 ";

// The possible model modes; represents a bit field.
namespace libtextclassifier3;
enum ModeFlag : int {
  NONE = 0,
  ANNOTATION = 1,
  CLASSIFICATION = 2,
  ANNOTATION_AND_CLASSIFICATION = 3,
  SELECTION = 4,
  ANNOTATION_AND_SELECTION = 5,
  CLASSIFICATION_AND_SELECTION = 6,
  ALL = 7,
}

// Enum for specifying the annotation usecase.
namespace libtextclassifier3;
enum AnnotationUsecase : int {
  // Results are optimized for Smart{Select,Share,Linkify}.
  ANNOTATION_USECASE_SMART = 0,
  // Smart{Select,Share,Linkify}

  // Results are optimized for using TextClassifier as an infrastructure that
  // annotates as much as possible.
  ANNOTATION_USECASE_RAW = 1,
}

namespace libtextclassifier3;
enum DatetimeExtractorType : int {
  UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
  AM = 1,
  PM = 2,
  JANUARY = 3,
  FEBRUARY = 4,
  MARCH = 5,
  APRIL = 6,
  MAY = 7,
  JUNE = 8,
  JULY = 9,
  AUGUST = 10,
  SEPTEMBER = 11,
  OCTOBER = 12,
  NOVEMBER = 13,
  DECEMBER = 14,
  NEXT = 15,
  NEXT_OR_SAME = 16,
  LAST = 17,
  NOW = 18,
  TOMORROW = 19,
  YESTERDAY = 20,
  PAST = 21,
  FUTURE = 22,
  DAY = 23,
  WEEK = 24,
  MONTH = 25,
  YEAR = 26,
  MONDAY = 27,
  TUESDAY = 28,
  WEDNESDAY = 29,
  THURSDAY = 30,
  FRIDAY = 31,
  SATURDAY = 32,
  SUNDAY = 33,
  DAYS = 34,
  WEEKS = 35,
  MONTHS = 36,

  // TODO(zilka): Make the following 3 values singular for consistency.
  HOURS = 37,

  MINUTES = 38,
  SECONDS = 39,
  YEARS = 40,
  DIGITS = 41,
  SIGNEDDIGITS = 42,
  ZERO = 43,
  ONE = 44,
  TWO = 45,
  THREE = 46,
  FOUR = 47,
  FIVE = 48,
  SIX = 49,
  SEVEN = 50,
  EIGHT = 51,
  NINE = 52,
  TEN = 53,
  ELEVEN = 54,
  TWELVE = 55,
  THIRTEEN = 56,
  FOURTEEN = 57,
  FIFTEEN = 58,
  SIXTEEN = 59,
  SEVENTEEN = 60,
  EIGHTEEN = 61,
  NINETEEN = 62,
  TWENTY = 63,
  THIRTY = 64,
  FORTY = 65,
  FIFTY = 66,
  SIXTY = 67,
  SEVENTY = 68,
  EIGHTY = 69,
  NINETY = 70,
  HUNDRED = 71,
  THOUSAND = 72,
  NOON = 73,
  MIDNIGHT = 74,
}

namespace libtextclassifier3;
enum DatetimeGroupType : int {
  GROUP_UNKNOWN = 0,
  GROUP_UNUSED = 1,
  GROUP_YEAR = 2,
  GROUP_MONTH = 3,
  GROUP_DAY = 4,
  GROUP_HOUR = 5,
  GROUP_MINUTE = 6,
  GROUP_SECOND = 7,
  GROUP_AMPM = 8,
  GROUP_RELATIONDISTANCE = 9,
  GROUP_RELATION = 10,
  GROUP_RELATIONTYPE = 11,

  // Dummy groups serve just as an inflator of the selection. E.g. we might want
  // to select more text than was contained in an envelope of all extractor
  // spans.
  GROUP_DUMMY1 = 12,

  GROUP_DUMMY2 = 13,
  GROUP_ABSOLUTETIME = 14,
}

// Options for the model that predicts text selection.
namespace libtextclassifier3;
table SelectionModelOptions {
  // If true, before the selection is returned, the unpaired brackets contained
  // in the predicted selection are stripped from both selection ends.
  // The bracket codepoints are defined in the Unicode standard:
  // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
  strip_unpaired_brackets:bool = true;

  // Number of hypothetical click positions on either side of the actual click
  // to consider in order to enforce symmetry.
  symmetry_context_size:int;

  // Number of examples to bundle in one batch for inference.
  batch_size:int = 1024;

  // Whether to always classify a suggested selection or only on demand.
  always_classify_suggested_selection:bool = false;
}

// Options for the model that classifies a text selection.
namespace libtextclassifier3;
table ClassificationModelOptions {
  // Limits for phone numbers.
  phone_min_num_digits:int = 7;

  phone_max_num_digits:int = 15;

  // Limits for addresses.
  address_min_num_tokens:int;

  // Maximum number of tokens to attempt a classification (-1 is unlimited).
  max_num_tokens:int = -1;
}

// Options for post-checks, checksums and verification to apply on a match.
namespace libtextclassifier3;
table VerificationOptions {
  verify_luhn_checksum:bool = false;

  // Lua verifier to use.
  // Index of the lua verifier in the model.
  lua_verifier:int = -1;
}

// Behaviour of rule capturing groups.
// This specifies how the text and span of a capturing group, in a regular
// expression or from a capturing match in a grammar rule, should be handled.
namespace libtextclassifier3;
table CapturingGroup {
  // If true, the span of the capturing group will be used to
  // extend the selection.
  extend_selection:bool = true;

  // If set, the text of the capturing group will be used to set a field in
  // the classification result entity data.
  entity_field_path:FlatbufferFieldPath;

  // If set, the flatbuffer entity data will be merged with the
  // classification result entity data.
  serialized_entity_data:string (shared);

  // If set, normalization to apply before text is used in entity data.
  normalization_options:NormalizationOptions;

  entity_data:EntityData;
}

// List of regular expression matchers to check.
namespace libtextclassifier3.RegexModel_;
table Pattern {
  // The name of the collection of a match.
  collection_name:string (shared);

  // The pattern to check.
  pattern:string (shared);

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // If true, will use an approximate matching implementation implemented
  // using Find() instead of the true Match(). This approximate matching will
  // use the first Find() result and then check that it spans the whole input.
  use_approximate_matching:bool = false;

  compressed_pattern:CompressedBuffer;

  // Verification to apply on a match.
  verification_options:VerificationOptions;

  capturing_group:[CapturingGroup];

  // Entity data to set for a match.
  serialized_entity_data:string (shared);

  entity_data:EntityData;
}

namespace libtextclassifier3;
table RegexModel {
  patterns:[RegexModel_.Pattern];

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // Lua scripts for match verification.
  // The verifier can access:
  // * `context`: The context as a string.
  // * `match`: The groups of the regex match as an array, each group gives
  //   * `begin`: span start
  //   * `end`: span end
  //   * `text`: the text
  // The verifier is expected to return a boolean, indicating whether the
  // verification succeeded or not.
  lua_verifier:[string];
}

// List of regex patterns.
namespace libtextclassifier3.DatetimeModelPattern_;
table Regex {
  pattern:string (shared);

  // The ith entry specifies the type of the ith capturing group.
  // This is used to decide how the matched content has to be parsed.
  groups:[DatetimeGroupType];

  compressed_pattern:CompressedBuffer;
}

namespace libtextclassifier3;
table DatetimeModelPattern {
  regexes:[DatetimeModelPattern_.Regex];

  // List of locale indices in DatetimeModel that represent the locales that
  // these patterns should be used for. If empty, can be used for all locales.
  locales:[int];

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to apply the patterns.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;
}

namespace libtextclassifier3;
table DatetimeModelExtractor {
  extractor:DatetimeExtractorType;
  pattern:string (shared);
  locales:[int];
  compressed_pattern:CompressedBuffer;
}

namespace libtextclassifier3;
table DatetimeModel {
  // List of BCP 47 locale strings representing all locales supported by the
  // model. The individual patterns refer back to them using an index.
  locales:[string];

  patterns:[DatetimeModelPattern];
  extractors:[DatetimeModelExtractor];

  // If true, will use the extractors for determining the match location as
  // opposed to using the location where the global pattern matched.
  use_extractors_for_locating:bool = true;

  // List of locale ids whose rules are always run, after the requested
  // ones.
  default_locales:[int];

  // If true, will generate the alternative interpretations for ambiguous
  // datetime expressions.
  generate_alternative_interpretations_when_ambiguous:bool = false;

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // If true, will give only future dates (when the day is not specified).
  prefer_future_for_unspecified_date:bool = false;
}

// Configuration for the tokenizer.
namespace libtextclassifier3;
table GrammarTokenizerOptions {
  tokenization_type:TokenizationType = ICU;

  // If true, white space tokens will be kept when using the icu tokenizer.
  icu_preserve_whitespace_tokens:bool = false;

  // Codepoint ranges that determine what role the different codepoints play
  // during tokenization. The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to identify
  // stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;
}

namespace libtextclassifier3.DatetimeModelLibrary_;
table Item {
  key:string (shared);
  value:DatetimeModel;
}

// A set of named DateTime models.
namespace libtextclassifier3;
table DatetimeModelLibrary {
  models:[DatetimeModelLibrary_.Item];
}

// Classification result to instantiate for a rule match.
namespace libtextclassifier3.GrammarModel_;
table RuleClassificationResult {
  // The name of the collection.
  collection_name:string (shared);

  // The score.
  target_classification_score:float = 1;

  // The priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // Behaviour of capturing matches.
  capturing_group:[CapturingGroup];

  // Entity data to set for a match.
  serialized_entity_data:string (shared);

  // Enabled modes.
  enabled_modes:ModeFlag = ALL;

  entity_data:EntityData;
}

// Configuration for grammar based annotators.
namespace libtextclassifier3;
table GrammarModel {
  // The grammar rules.
  rules:grammar.RulesSet;

  rule_classification_result:[GrammarModel_.RuleClassificationResult];

  // Number of tokens in the context to use for classification and text
  // selection suggestion.
  // A value -1 uses the full context.
  context_left_num_tokens:int;

  context_right_num_tokens:int;

  // Grammar specific tokenizer options.
  tokenizer_options:GrammarTokenizerOptions;

  // The score.
  target_classification_score:float = 1;

  // The priority score used for conflict resolution with the other models.
  priority_score:float = 1;
}

namespace libtextclassifier3.MoneyParsingOptions_;
table QuantitiesNameToExponentEntry {
  key:string (key, shared);
  value:int;
}

namespace libtextclassifier3;
table MoneyParsingOptions {
  // Separators (codepoints) marking decimal or thousand in the money amount.
  separators:[int];

  // Mapping between a quantity string (e.g. "million") and the power of 10
  // it multiplies the amount with (e.g. 6 in case of "million").
  // NOTE: The entries need to be sorted by key since we use LookupByKey.
  quantities_name_to_exponent:[MoneyParsingOptions_.QuantitiesNameToExponentEntry];
}

namespace libtextclassifier3.ModelTriggeringOptions_;
table CollectionToPriorityEntry {
  key:string (key, shared);
  value:float;
}

// Options controlling the output of the Tensorflow Lite models.
namespace libtextclassifier3;
table ModelTriggeringOptions {
  // Lower bound threshold for filtering annotation model outputs.
  min_annotate_confidence:float = 0;

  // The modes for which to enable the models.
  enabled_modes:ModeFlag = ALL;

  // Comma-separated list of locales (BCP 47 tags) that dictionary
  // classification supports.
  dictionary_locales:string (shared);

  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
  // are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  locales:string (shared);

  // Priority score assigned to the "other" class from ML model.
  other_collection_priority_score:float = -1000;

  // Priority score assigned to knowledge engine annotations.
  knowledge_priority_score:float = 0;

  reserved_7:int16 (deprecated);

  // Apply a factor to the priority score for entities that are added to this
  // map. Key: collection type e.g. "address", "phone"..., Value: float number.
  // NOTE: The entries here need to be sorted since we use LookupByKey.
  collection_to_priority:[ModelTriggeringOptions_.CollectionToPriorityEntry];
}

// Options controlling the output of the classifier.
namespace libtextclassifier3;
table OutputOptions {
  // Lists of collection names that will be filtered out at the output:
  // - For annotation, the spans of given collection are simply dropped.
  // - For classification, the result is mapped to the class "other".
  // - For selection, the spans of given class are returned as
  //   single-selection.
  filtered_collections_annotation:[string];

  filtered_collections_classification:[string];
  filtered_collections_selection:[string];
}

namespace libtextclassifier3.Model_;
table EmbeddingPruningMask {
  // If true, use pruning mask. In this case, we use mask
  // pruning_mask to determine the mapping of hashed-charactergrams.
  enabled:bool;

  // Packing of the binary pruning mask into uint64 values.
  pruning_mask:[ulong] (force_align: 16);

  // Number of buckets before pruning.
  full_num_buckets:int;

  // Index of row of compressed embedding matrix to which all pruned buckets
  // are mapped.
  pruned_row_bucket_id:int;
}

namespace libtextclassifier3.Model_;
table ConflictResolutionOptions {
  // If true, will prioritize the longest annotation during conflict
  // resolution.
  prioritize_longest_annotation:bool = false;

  // If true, the annotator will perform conflict resolution between the
  // different sub-annotators also in the RAW mode. If false, no conflict
  // resolution will be performed in RAW mode.
  do_conflict_resolution_in_raw_mode:bool = true;
}

namespace libtextclassifier3;
table Model {
  // Comma-separated list of locales supported by the model as BCP 47 tags.
  locales:string (shared);

  version:int;

  // A name for the model that can be used for e.g. logging.
  name:string (shared);

  selection_feature_options:FeatureProcessorOptions;
  classification_feature_options:FeatureProcessorOptions;

  // Tensorflow Lite models.
  selection_model:[ubyte] (force_align: 16);

  classification_model:[ubyte] (force_align: 16);
  embedding_model:[ubyte] (force_align: 16);

  // Options for the different models.
  selection_options:SelectionModelOptions;

  classification_options:ClassificationModelOptions;
  regex_model:RegexModel;
  datetime_model:DatetimeModel;

  // Options controlling the output of the models.
  triggering_options:ModelTriggeringOptions;

  // Global switch that controls if SuggestSelection(), ClassifyText() and
  // Annotate() will run. If a mode is disabled it returns empty/no-op results.
  enabled_modes:ModeFlag = ALL;

  // If true, will snap the selections that consist only of whitespaces to the
  // containing suggested span. Otherwise, no suggestion is proposed, since the
  // selections are not part of any token.
  snap_whitespace_selections:bool = true;

  // Global configuration for the output of SuggestSelection(), ClassifyText()
  // and Annotate().
  output_options:OutputOptions;

  // Configures how Intents should be generated on Android.
  android_intent_options:AndroidIntentFactoryOptions;

  intent_options:IntentFactoryModel;

  // Model resources.
  resources:ResourcePool;

  // Schema data for handling entity data.
  entity_data_schema:[ubyte];

  number_annotator_options:NumberAnnotatorOptions;
  duration_annotator_options:DurationAnnotatorOptions;

  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
  // are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  triggering_locales:string (shared);

  embedding_pruning_mask:Model_.EmbeddingPruningMask;
  reserved_25:int16 (deprecated);
  contact_annotator_options:ContactAnnotatorOptions;
  money_parsing_options:MoneyParsingOptions;
  translate_annotator_options:TranslateAnnotatorOptions;
  grammar_model:GrammarModel;
  conflict_resolution_options:Model_.ConflictResolutionOptions;
  experimental_model:ExperimentalModel;
  pod_ner_model:PodNerModel;
  vocab_model:VocabModel;
  datetime_grammar_model:GrammarModel;
}

// Method for selecting the center token.
namespace libtextclassifier3.FeatureProcessorOptions_;
enum CenterTokenSelectionMethod : int {
  DEFAULT_CENTER_TOKEN_METHOD = 0,
  // Invalid option.

  // Use click indices to determine the center token.
  CENTER_TOKEN_FROM_CLICK = 1,

  // Use selection indices to get a token range, and select the middle of it
  // as the center token.
  CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
}

// Bounds-sensitive feature extraction configuration.
namespace libtextclassifier3.FeatureProcessorOptions_;
table BoundsSensitiveFeatures {
  // Enables the extraction of bounds-sensitive features, instead of the click
  // context features.
  enabled:bool;

  // The numbers of tokens to extract in specific locations relative to the
  // bounds.
  // Immediately before the span.
  num_tokens_before:int;

  // Inside the span, aligned with the beginning.
  num_tokens_inside_left:int;

  // Inside the span, aligned with the end.
  num_tokens_inside_right:int;

  // Immediately after the span.
  num_tokens_after:int;

  // If true, also extracts the tokens of the entire span and adds up their
  // features forming one "token" to include in the extracted features.
  include_inside_bag:bool;

  // If true, includes the selection length (in the number of tokens) as a
  // feature.
  include_inside_length:bool;

  // If true, for selection, single token spans are not run through the model
  // and their score is assumed to be zero.
  score_single_token_spans_as_zero:bool;
}

namespace libtextclassifier3;
table FeatureProcessorOptions {
  // Number of buckets used for hashing charactergrams.
  num_buckets:int = -1;

  // Size of the embedding.
  embedding_size:int = -1;

  // Number of bits for quantization for embeddings.
  embedding_quantization_bits:int = 8;

  // Context size defines the number of words to the left and to the right of
  // the selected word to be used as context. For example, if context size is
  // N, then we take N words to the left and N words to the right of the
  // selected word as its context.
  context_size:int = -1;

  // Maximum number of words of the context to select in total.
  max_selection_span:int = -1;

  // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
  // character trigrams etc.
  chargram_orders:[int];

  // Maximum length of a word, in codepoints.
  max_word_length:int = 20;

  // If true, will use the unicode-aware functionality for extracting features.
  unicode_aware_features:bool = false;

  // Whether to extract the token case feature.
  extract_case_feature:bool = false;

  // Whether to extract the selection mask feature.
  extract_selection_mask_feature:bool = false;

  // List of regexps to run over each token. For each regexp, if there is a
  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
  regexp_feature:[string];

  // Whether to remap all digits to a single number.
  remap_digits:bool = false;

  // Whether to lower-case each token before generating hashgrams.
  lowercase_tokens:bool;

  // If true, the selection classifier output will contain only the selections
  // that are feasible (e.g., those that are shorter than max_selection_span),
  // if false, the output will be a complete cross-product of possible
  // selections to the left and possible selections to the right, including the
  // infeasible ones.
  // NOTE: Exists mainly for compatibility with older models that were trained
  // with the non-reduced output space.
  selection_reduced_output_space:bool = true;

  // Collection names.
  collections:[string];

  // An index of collection in collections to be used if a collection name can't
  // be mapped to an id.
  default_collection:int = -1;

  // If true, will split the input by lines, and only use the line that contains
  // the clicked token.
  only_use_line_with_click:bool = false;

  // If true, will split tokens that contain the selection boundary, at the
  // position of the boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  split_tokens_on_selection_boundaries:bool = false;

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  center_token_selection_method:FeatureProcessorOptions_.CenterTokenSelectionMethod;

  // If true, span boundaries will be snapped to containing tokens and not
  // required to exactly match token boundaries.
  snap_label_span_boundaries_to_containing_tokens:bool;

  // A set of codepoint ranges supported by the model.
  supported_codepoint_ranges:[CodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to identify
  // stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // Minimum ratio of supported codepoints in the input context. If the ratio
  // is lower than this, the feature computation will fail.
  min_supported_codepoint_ratio:float = 0;

  // Used for versioning the format of features the model expects.
  // - feature_version == 0:
  //   For each token the features consist of:
  //    - chargram embeddings
  //    - dense features
  //   Chargram embeddings for tokens are concatenated first together,
  //   and at the end, the dense features for the tokens are concatenated
  //   to it. So the resulting feature vector has two regions.
  feature_version:int = 0;

  tokenization_type:TokenizationType = INTERNAL_TOKENIZER;
  icu_preserve_whitespace_tokens:bool = false;

  // List of codepoints that will be stripped from beginning and end of
  // predicted spans.
  ignored_span_boundary_codepoints:[int];

  bounds_sensitive_features:FeatureProcessorOptions_.BoundsSensitiveFeatures;

  // List of allowed charactergrams. The extracted charactergrams are filtered
  // using this list, and charactergrams that are not present are interpreted as
  // out-of-vocabulary.
  // If no allowed_chargrams are specified, all charactergrams are allowed.
  // The field is typed as bytes type to allow non-UTF8 chargrams.
  allowed_chargrams:[string];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;

  // If true, the pipe character '|' will be used as a newline character when
  // splitting lines.
  use_pipe_character_for_newline:bool = true;
}

namespace libtextclassifier3;
table NumberAnnotatorOptions {
  // If true, number and percentage annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated numbers and percentages in the annotator.
  score:float = 1;

  // Number priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable number and percentage annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce number annotations.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;

  // [Deprecated] A list of codepoints that can form a prefix of a valid number.
  allowed_prefix_codepoints:[int];

  // [Deprecated] A list of codepoints that can form a suffix of a valid number.
  allowed_suffix_codepoints:[int];

  // [Deprecated] List of codepoints that will be stripped from beginning of
  // predicted spans.
  ignored_prefix_span_boundary_codepoints:[int];

  // [Deprecated] List of codepoints that will be stripped from end of predicted
  // spans.
  ignored_suffix_span_boundary_codepoints:[int];

  // [Deprecated] If true, percent annotations will be produced.
  enable_percentage:bool = false;

  // Zero separated and ordered list of suffixes that mark a percent.
  percentage_pieces_string:string (shared);

  // [Deprecated] List of suffixes offsets in the percent_pieces_string string.
  percentage_pieces_offsets:[int];

  // Priority score for the percentage annotation.
  percentage_priority_score:float = 1;

  // Float number priority score used for conflict resolution with the other
  // models.
  float_number_priority_score:float = 0;

  // The maximum number of digits an annotated number can have. Requirement:
  // the value should be less or equal to 20.
  max_number_of_digits:int = 20;

  // The annotation usecases for which to produce percentage annotations.
  // This is a flag field for values of AnnotationUsecase.
  percentage_annotation_usecases:uint = 2;
}

// DurationAnnotator is so far tailored for English and Japanese only.
namespace libtextclassifier3;
table DurationAnnotatorOptions {
  // If true, duration annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated durations from the annotator.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable duration annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce duration annotations.
  enabled_annotation_usecases:uint = 4294967295;

  // Durations typically look like XX hours and XX minutes etc... The list of
  // strings below enumerate variants of "hours", "minutes", etc. in these
  // expressions. These are verbatim strings that are matched against tokens in
  // the input.
  week_expressions:[string];

  day_expressions:[string];
  hour_expressions:[string];
  minute_expressions:[string];
  second_expressions:[string];

  // List of expressions that don't break a duration expression (can become
  // a part of it) but have no semantic meaning.
  filler_expressions:[string];

  // List of expressions that mean half of a unit of duration (e.g. "half an
  // hour").
  half_expressions:[string];

  // Set of codepoints that can split the Annotator tokens to sub-tokens for
  // sub-token matching.
  sub_token_separator_codepoints:[int];

  // If this is true, unit must be associated with quantity. For example, a
  // phrase "minute" is not parsed as one minute duration if this is true.
  require_quantity:bool;

  // If this is true, dangling quantity is included in the annotation. For
  // example, "10 minutes 20" is interpreted as 10 minutes and 20 seconds.
  enable_dangling_quantity_interpretation:bool = true;
}

namespace libtextclassifier3;
table ContactAnnotatorOptions {
  // Supported for English genitives only so far.
  enable_declension:bool;

  // For each language there is a customized list of supported declensions.
  language:string (shared);
}

namespace libtextclassifier3.TranslateAnnotatorOptions_;
enum Algorithm : int {
  DEFAULT_ALGORITHM = 0,
  BACKOFF = 1,
}

// Backoff is the algorithm shipped with Android Q.
namespace libtextclassifier3.TranslateAnnotatorOptions_;
table BackoffOptions {
  // The minimum size of text to prefer for detection (in codepoints).
  min_text_size:int = 20;

  // For reducing the score when text is less than the preferred size.
  penalize_ratio:float = 1;

  // Original detection score to surrounding text detection score ratios.
  subject_text_score_ratio:float = 0.4;
}

namespace libtextclassifier3;
table TranslateAnnotatorOptions {
  enabled:bool = false;

  // Score to assign to the classification results.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float;

  algorithm:TranslateAnnotatorOptions_.Algorithm;
  backoff_options:TranslateAnnotatorOptions_.BackoffOptions;
}

namespace libtextclassifier3.PodNerModel_;
table Collection {
  // Collection's name (e.g., "location", "person").
  name:string (shared);

  // Priority scores used for conflict resolution with the other annotators
  // when the annotation is made over a single/multi token text.
  single_token_priority_score:float;

  multi_token_priority_score:float;
}

namespace libtextclassifier3.PodNerModel_.Label_;
enum BoiseType : int {
  NONE = 0,
  BEGIN = 1,
  O = 2,
  // No label.

  INTERMEDIATE = 3,
  SINGLE = 4,
  END = 5,
}

namespace libtextclassifier3.PodNerModel_.Label_;
enum MentionType : int {
  UNDEFINED = 0,
  NAM = 1,
  NOM = 2,
}

namespace libtextclassifier3.PodNerModel_;
table Label {
  boise_type:Label_.BoiseType;
  mention_type:Label_.MentionType;
  collection_id:int;
  // points to the collections array above.
}

namespace libtextclassifier3;
table PodNerModel {
  tflite_model:[ubyte];
  word_piece_vocab:[ubyte];
  lowercase_input:bool = true;

  // Index of mention_logits tensor in the output of the tflite model. Can
  // be found in the textproto output after model is converted to tflite.
  logits_index_in_output_tensor:int = 0;

  // Whether to append a period at the end of an input that doesn't already
  // end in punctuation.
  append_final_period:bool = false;

  // Priority score used for conflict resolution with the other models. Used
  // only if collections_array is empty.
  priority_score:float = 0;

  // Maximum number of wordpieces supported by the model.
  max_num_wordpieces:int = 128;

  // In case of long text (number of wordpieces greater than the max) we use
  // sliding window approach, this determines the number of overlapping
  // wordpieces between two consecutive windows. This overlap enables context
  // for each word NER annotates.
  sliding_window_num_wordpieces_overlap:int = 20;

  reserved_9:int16 (deprecated);

  // The possible labels the ner model can output. If empty the default labels
  // will be used.
  labels:[PodNerModel_.Label];

  // If the ratio of unknown wordpieces in the input text is greater than this
  // maximum, the text won't be annotated.
  max_ratio_unknown_wordpieces:float = 0.1;

  // Possible collections for labeled entities.
  collections:[PodNerModel_.Collection];

  // Minimum word-length and wordpieces-length required for the text to be
  // annotated.
  min_number_of_tokens:int = 1;

  min_number_of_wordpieces:int = 1;
}

namespace libtextclassifier3;
table VocabModel {
  // A trie that stores a list of vocabs that triggers "Define". An id is
  // returned when looking up a vocab from the trie and the id can be used
  // to access more information about that vocab. The marisa trie library
  // requires 8-byte alignment because the first thing in a marisa trie is a
  // 64-bit integer.
  vocab_trie:[ubyte] (force_align: 8);

  // A bit vector that tells if the vocab should trigger "Define" for users of
  // beginner proficiency only. To look up the bit vector, use the id returned
  // by the trie.
  beginner_level:BitVectorData;

  // A sorted list of indices of vocabs that should not trigger "Define" if
  // its leading character is in upper case. The indices are those returned by
  // trie. You may perform binary search to look up an index.
  do_not_trigger_in_upper_case:BitVectorData;

  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
  // are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  triggering_locales:string (shared);

  // The final score to assign to the results of the vocab model
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;
}

root_type libtextclassifier3.Model;