v811_spc009/external/libtextclassifier/native/annotator/feature-processor.h

/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Feature processing for FFModel (feed-forward SmartSelection model).

#ifndef LIBTEXTCLASSIFIER_ANNOTATOR_FEATURE_PROCESSOR_H_
#define LIBTEXTCLASSIFIER_ANNOTATOR_FEATURE_PROCESSOR_H_

#include <map>
#include <memory>
#include <set>
#include <string>
#include <vector>

#include "annotator/cached-features.h"
#include "annotator/model_generated.h"
#include "annotator/types.h"
#include "utils/base/integral_types.h"
#include "utils/base/logging.h"
#include "utils/token-feature-extractor.h"
#include "utils/tokenizer.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib.h"

namespace libtextclassifier3 {

constexpr int kInvalidLabel = -1;

namespace internal {

Tokenizer BuildTokenizer(const FeatureProcessorOptions* options,
                         const UniLib* unilib);

TokenFeatureExtractorOptions BuildTokenFeatureExtractorOptions(
    const FeatureProcessorOptions* options);

// Splits tokens that contain the selection boundary inside them.
// E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
void SplitTokensOnSelectionBoundaries(const CodepointSpan& selection,
                                      std::vector<Token>* tokens);

// Returns the index of token that corresponds to the codepoint span.
int CenterTokenFromClick(const CodepointSpan& span,
                         const std::vector<Token>& tokens);

// Returns the index of token that corresponds to the middle of the  codepoint
// span.
int CenterTokenFromMiddleOfSelection(
    const CodepointSpan& span, const std::vector<Token>& selectable_tokens);

// Strips the tokens from the tokens vector that are not used for feature
// extraction because they are out of scope, or pads them so that there is
// enough tokens in the required context_size for all inferences with a click
// in relative_click_span.
void StripOrPadTokens(const TokenSpan& relative_click_span, int context_size,
                      std::vector<Token>* tokens, int* click_pos);

}  // namespace internal

// Converts a codepoint span to a token span in the given list of tokens.
// If snap_boundaries_to_containing_tokens is set to true, it is enough for a
// token to overlap with the codepoint range to be considered part of it.
// Otherwise it must be fully included in the range.
TokenSpan CodepointSpanToTokenSpan(
    const std::vector<Token>& selectable_tokens,
    const CodepointSpan& codepoint_span,
    bool snap_boundaries_to_containing_tokens = false);

// Converts a token span to a codepoint span in the given list of tokens.
CodepointSpan TokenSpanToCodepointSpan(
    const std::vector<Token>& selectable_tokens, const TokenSpan& token_span);

// Converts a codepoint span to a unicode text range, within the given unicode
// text.
// For an invalid span (with a negative index), returns (begin, begin). This
// means that it is safe to call this function before checking the validity of
// the span.
// The indices must fit within the unicode text.
// Note that the execution time is linear with respect to the codepoint indices.
// Calling this function repeatedly for spans on the same text might lead to
// inefficient code.
UnicodeTextRange CodepointSpanToUnicodeTextRange(
    const UnicodeText& unicode_text, const CodepointSpan& span);

// Takes care of preparing features for the span prediction model.
class FeatureProcessor {
 public:
  // A cache mapping codepoint spans to embedded tokens features. An instance
  // can be provided to multiple calls to ExtractFeatures() operating on the
  // same context (the same codepoint spans corresponding to the same tokens),
  // as an optimization. Note that the tokenizations do not have to be
  // identical.
  typedef std::map<CodepointSpan, std::vector<float>> EmbeddingCache;

  explicit FeatureProcessor(const FeatureProcessorOptions* options,
                            const UniLib* unilib)
      : feature_extractor_(internal::BuildTokenFeatureExtractorOptions(options),
                           unilib),
        options_(options),
        tokenizer_(internal::BuildTokenizer(options, unilib)) {
    MakeLabelMaps();
    if (options->supported_codepoint_ranges() != nullptr) {
      SortCodepointRanges({options->supported_codepoint_ranges()->begin(),
                           options->supported_codepoint_ranges()->end()},
                          &supported_codepoint_ranges_);
    }
    PrepareIgnoredSpanBoundaryCodepoints();
  }

  // Tokenizes the input string using the selected tokenization method.
  std::vector<Token> Tokenize(const std::string& text) const;

  // Same as above but takes UnicodeText.
  std::vector<Token> Tokenize(const UnicodeText& text_unicode) const;

  // Converts a label into a token span.
  bool LabelToTokenSpan(int label, TokenSpan* token_span) const;

  // Gets the total number of selection labels.
  int GetSelectionLabelCount() const { return label_to_selection_.size(); }

  // Gets the string value for given collection label.
  std::string LabelToCollection(int label) const;

  // Gets the total number of collections of the model.
  int NumCollections() const { return collection_to_label_.size(); }

  // Gets the name of the default collection.
  std::string GetDefaultCollection() const;

  const FeatureProcessorOptions* GetOptions() const { return options_; }

  // Retokenizes the context and input span, and finds the click position.
  // Depending on the options, might modify tokens (split them or remove them).
  void RetokenizeAndFindClick(const std::string& context,
                              const CodepointSpan& input_span,
                              bool only_use_line_with_click,
                              std::vector<Token>* tokens, int* click_pos) const;

  // Same as above, but takes UnicodeText and iterators within it corresponding
  // to input_span.
  void RetokenizeAndFindClick(const UnicodeText& context_unicode,
                              const UnicodeText::const_iterator& span_begin,
                              const UnicodeText::const_iterator& span_end,
                              const CodepointSpan& input_span,
                              bool only_use_line_with_click,
                              std::vector<Token>* tokens, int* click_pos) const;

  // Returns true if the token span has enough supported codepoints (as defined
  // in the model config) or not and model should not run.
  bool HasEnoughSupportedCodepoints(const std::vector<Token>& tokens,
                                    const TokenSpan& token_span) const;

  // Extracts features as a CachedFeatures object that can be used for repeated
  // inference over token spans in the given context.
  bool ExtractFeatures(const std::vector<Token>& tokens,
                       const TokenSpan& token_span,
                       const CodepointSpan& selection_span_for_feature,
                       const EmbeddingExecutor* embedding_executor,
                       EmbeddingCache* embedding_cache, int feature_vector_size,
                       std::unique_ptr<CachedFeatures>* cached_features) const;

  // Fills selection_label_spans with CodepointSpans that correspond to the
  // selection labels. The CodepointSpans are based on the codepoint ranges of
  // given tokens.
  bool SelectionLabelSpans(
      VectorSpan<Token> tokens,
      std::vector<CodepointSpan>* selection_label_spans) const;

  // Fills selection_label_relative_token_spans with number of tokens left and
  // right from the click.
  bool SelectionLabelRelativeTokenSpans(
      std::vector<TokenSpan>* selection_label_relative_token_spans) const;

  int DenseFeaturesCount() const {
    return feature_extractor_.DenseFeaturesCount();
  }

  int EmbeddingSize() const { return options_->embedding_size(); }

  // Splits context to several segments.
  std::vector<UnicodeTextRange> SplitContext(
      const UnicodeText& context_unicode,
      const bool use_pipe_character_for_newline) const;

  // Strips boundary codepoints from the span in context and returns the new
  // start and end indices. If the span comprises entirely of boundary
  // codepoints, the first index of span is returned for both indices.
  CodepointSpan StripBoundaryCodepoints(const std::string& context,
                                        const CodepointSpan& span) const;

  // Same as above but takes UnicodeText.
  CodepointSpan StripBoundaryCodepoints(const UnicodeText& context_unicode,
                                        const CodepointSpan& span) const;

  // Same as above but takes a pair of iterators for the span, for efficiency.
  CodepointSpan StripBoundaryCodepoints(
      const UnicodeText::const_iterator& span_begin,
      const UnicodeText::const_iterator& span_end,
      const CodepointSpan& span) const;

  // Same as above, but takes an optional buffer for saving the modified value.
  // As an optimization, returns pointer to 'value' if nothing was stripped, or
  // pointer to 'buffer' if something was stripped.
  const std::string& StripBoundaryCodepoints(const std::string& value,
                                             std::string* buffer) const;

 protected:
  // Returns the class id corresponding to the given string collection
  // identifier. There is a catch-all class id that the function returns for
  // unknown collections.
  int CollectionToLabel(const std::string& collection) const;

  // Prepares mapping from collection names to labels.
  void MakeLabelMaps();

  // Gets the number of spannable tokens for the model.
  //
  // Spannable tokens are those tokens of context, which the model predicts
  // selection spans over (i.e., there is 1:1 correspondence between the output
  // classes of the model and each of the spannable tokens).
  int GetNumContextTokens() const { return options_->context_size() * 2 + 1; }

  // Converts a label into a span of codepoint indices corresponding to it
  // given output_tokens.
  bool LabelToSpan(int label, const VectorSpan<Token>& output_tokens,
                   CodepointSpan* span) const;

  // Converts a span to the corresponding label given output_tokens.
  bool SpanToLabel(const CodepointSpan& span,
                   const std::vector<Token>& output_tokens, int* label) const;

  // Converts a token span to the corresponding label.
  int TokenSpanToLabel(const TokenSpan& token_span) const;

  // Returns the ratio of supported codepoints to total number of codepoints in
  // the given token span.
  float SupportedCodepointsRatio(const TokenSpan& token_span,
                                 const std::vector<Token>& tokens) const;

  void PrepareIgnoredSpanBoundaryCodepoints();

  // Counts the number of span boundary codepoints. If count_from_beginning is
  // True, the counting will start at the span_start iterator (inclusive) and at
  // maximum end at span_end (exclusive). If count_from_beginning is True, the
  // counting will start from span_end (exclusive) and end at span_start
  // (inclusive).
  int CountIgnoredSpanBoundaryCodepoints(
      const UnicodeText::const_iterator& span_start,
      const UnicodeText::const_iterator& span_end,
      bool count_from_beginning) const;

  // Finds the center token index in tokens vector, using the method defined
  // in options_.
  int FindCenterToken(const CodepointSpan& span,
                      const std::vector<Token>& tokens) const;

  // Removes all tokens from tokens that are not on a line (defined by calling
  // SplitContext on the context) to which span points.
  void StripTokensFromOtherLines(const std::string& context,
                                 const CodepointSpan& span,
                                 std::vector<Token>* tokens) const;

  // Same as above but takes UnicodeText.
  void StripTokensFromOtherLines(const UnicodeText& context_unicode,
                                 const UnicodeText::const_iterator& span_begin,
                                 const UnicodeText::const_iterator& span_end,
                                 const CodepointSpan& span,
                                 std::vector<Token>* tokens) const;

  // Extracts the features of a token and appends them to the output vector.
  // Uses the embedding cache to to avoid re-extracting the re-embedding the
  // sparse features for the same token.
  bool AppendTokenFeaturesWithCache(
      const Token& token, const CodepointSpan& selection_span_for_feature,
      const EmbeddingExecutor* embedding_executor,
      EmbeddingCache* embedding_cache,
      std::vector<float>* output_features) const;

 protected:
  const TokenFeatureExtractor feature_extractor_;

  // Codepoint ranges that define what codepoints are supported by the model.
  // NOTE: Must be sorted.
  std::vector<CodepointRangeStruct> supported_codepoint_ranges_;

 private:
  // Set of codepoints that will be stripped from beginning and end of
  // predicted spans.
  std::unordered_set<int32> ignored_span_boundary_codepoints_;

  const FeatureProcessorOptions* const options_;

  // Mapping between token selection spans and labels ids.
  std::map<TokenSpan, int> selection_to_label_;
  std::vector<TokenSpan> label_to_selection_;

  // Mapping between collections and labels.
  std::map<std::string, int> collection_to_label_;

  Tokenizer tokenizer_;
};

}  // namespace libtextclassifier3

#endif  // LIBTEXTCLASSIFIER_ANNOTATOR_FEATURE_PROCESSOR_H_