/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Utilities for tests.

#ifndef LIBTEXTCLASSIFIER_UTILS_TOKENIZER_UTILS_H_
#define LIBTEXTCLASSIFIER_UTILS_TOKENIZER_UTILS_H_

#include <iterator>
#include <string>
#include <vector>

#include "annotator/types.h"
#include "utils/codepoint-range.h"
#include "utils/strings/utf8.h"
#include "utils/utf8/unicodetext.h"
#include "absl/container/flat_hash_set.h"
#include "absl/strings/string_view.h"

namespace libtextclassifier3 {

struct FilterResult {
  // Whether to split on this codepoint.
  bool to_split;
  // If the codepoint is used to split the text, whether to output it as a
  // token.
  bool to_keep;
};

// Returns a list of Tokens for a given input string, by tokenizing on space.
std::vector<Token> TokenizeOnSpace(const std::string& text);
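
// Illustrative usage (a sketch; Token stores {value, start, end} codepoint
// offsets, matching the Token{...} construction in TokenizeWithFilter below):
//   std::vector<Token> tokens = TokenizeOnSpace("Hello world");
//   // tokens: {Token{"Hello", 0, 5}, Token{"world", 6, 11}}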

// Returns a list of Tokens for a given input string, by tokenizing on the
// given set of delimiter codepoints.
// If create_tokens_for_non_space_delimiters is true, creates tokens for
// delimiters which are not whitespace. For example, "This, is" ->
// {"This", ",", "is"}.
std::vector<Token> TokenizeOnDelimiters(
    const std::string& text, const absl::flat_hash_set<char32>& delimiters,
    bool create_tokens_for_non_space_delimiters = false);
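
// Illustrative usage (a sketch, splitting on comma and space):
//   std::vector<Token> tokens = TokenizeOnDelimiters(
//       "This, is", /*delimiters=*/{',', ' '},
//       /*create_tokens_for_non_space_delimiters=*/true);
//   // tokens: {"This", ",", "is"}  (space delimiters are never kept)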

// This replicates how the original bert_tokenizer from the tflite-support
// library pretokenizes text, using regex_split with its default regexes.
// It splits the text on spaces, punctuation, and Chinese characters, and
// outputs all the tokens except spaces.
// So far, the only difference from the original implementation that we are
// aware of is that the original regexes have 8 ranges of Chinese Unicode
// codepoints; we include all 8 of those ranges plus two extra ranges.
std::vector<Token> TokenizeOnWhiteSpacePunctuationAndChineseLetter(
    const absl::string_view text);
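
// Illustrative usage (a sketch; per the comment above, spaces are dropped
// while punctuation becomes its own token):
//   std::vector<Token> tokens =
//       TokenizeOnWhiteSpacePunctuationAndChineseLetter("Hi, there");
//   // tokens: {"Hi", ",", "there"}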

// Returns a list of Tokens for a given input string, by tokenizing on the
// given filter function. The caller controls which codepoints to split on and
// whether a delimiter should be output as a token.
template <typename FilterFn>
std::vector<Token> TokenizeWithFilter(const absl::string_view input,
                                      FilterFn filter) {
  const UnicodeText input_unicode = UTF8ToUnicodeText(input, /*do_copy=*/false);
  std::vector<Token> tokens;
  UnicodeText::const_iterator start_it = input_unicode.begin();
  int token_start_codepoint = 0;
  int codepoint_idx = 0;

  for (auto it = input_unicode.begin(); it != input_unicode.end(); ++it) {
    const char32 code_point = *it;
    FilterResult filter_result = filter(code_point);
    if (filter_result.to_split) {
      // Emit the token accumulated since the last split, if non-empty.
      const std::string token_text = UnicodeText::UTF8Substring(start_it, it);
      if (!token_text.empty()) {
        tokens.push_back(
            Token{token_text, token_start_codepoint, codepoint_idx});
      }
      // Optionally emit the delimiter itself as a single-codepoint token.
      if (filter_result.to_keep) {
        const std::string delimiter =
            UnicodeText::UTF8Substring(it, std::next(it));
        tokens.push_back(Token{delimiter, codepoint_idx, codepoint_idx + 1});
      }
      start_it = std::next(it);
      token_start_codepoint = codepoint_idx + 1;
    }
    codepoint_idx++;
  }
  // Flush the last token if any.
  if (start_it != input_unicode.end()) {
    const std::string token_text =
        UnicodeText::UTF8Substring(start_it, input_unicode.end());
    tokens.push_back(Token{token_text, token_start_codepoint, codepoint_idx});
  }
  return tokens;
}
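
// Illustrative usage: split on commas and keep them as tokens (behavior
// traced from the implementation above):
//   std::vector<Token> tokens = TokenizeWithFilter(
//       "a,b", [](char32 cp) {
//         return FilterResult{/*to_split=*/cp == ',', /*to_keep=*/true};
//       });
//   // tokens: {Token{"a", 0, 1}, Token{",", 1, 2}, Token{"b", 2, 3}}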

}  // namespace libtextclassifier3

#endif  // LIBTEXTCLASSIFIER_UTILS_TOKENIZER_UTILS_H_