/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Utilities for tests.

#ifndef LIBTEXTCLASSIFIER_UTILS_TOKENIZER_UTILS_H_
#define LIBTEXTCLASSIFIER_UTILS_TOKENIZER_UTILS_H_

#include <string>
#include <vector>

#include "annotator/types.h"
#include "utils/codepoint-range.h"
#include "utils/strings/utf8.h"
#include "utils/utf8/unicodetext.h"
#include "absl/container/flat_hash_set.h"
#include "absl/strings/string_view.h"

namespace libtextclassifier3 {

struct FilterResult {
  // Whether to split on this codepoint.
  bool to_split;
  // If the codepoint is used to split the text, whether to output it as a
  // token.
  bool to_keep;
};

// Returns a list of Tokens for a given input string, by tokenizing on space.
std::vector<Token> TokenizeOnSpace(const std::string& text);

// Returns a list of Tokens for a given input string, by tokenizing on the
// given set of delimiter codepoints.
// If create_tokens_for_non_space_delimiters is true, creates tokens for
// delimiters which are not white spaces. For example, "This, is" ->
// {"This", ",", "is"}.
std::vector<Token> TokenizeOnDelimiters(
    const std::string& text, const absl::flat_hash_set<char32>& delimiters,
    bool create_tokens_for_non_space_delimiters = false);

// This replicates how the original bert_tokenizer from the tflite-support
// library pretokenizes text by using regex_split with its default regexes.
// It splits the text on spaces, punctuation and Chinese characters, and
// outputs all the tokens except spaces.
// So far, the only difference from the original implementation we are aware
// of is that the original regexes have 8 ranges of Chinese codepoints; we
// cover those 8 ranges plus two extra ranges.
std::vector<Token> TokenizeOnWhiteSpacePunctuationAndChineseLetter(
    const absl::string_view text);
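
// Example usage (illustrative only): the expected outputs below assume that
// Token, as declared in annotator/types.h, carries a UTF-8 value plus start
// and end codepoint offsets, written here as Token(value, start, end).
//
//   // Plain whitespace tokenization.
//   std::vector<Token> tokens = TokenizeOnSpace("Hello there");
//   // -> {Token("Hello", 0, 5), Token("there", 6, 11)}
//
//   // Split on ',' and ' ', emitting the comma as its own token.
//   std::vector<Token> with_commas = TokenizeOnDelimiters(
//       "This, is", /*delimiters=*/{',', ' '},
//       /*create_tokens_for_non_space_delimiters=*/true);
//   // -> {Token("This", 0, 4), Token(",", 4, 5), Token("is", 6, 8)}
//
//   // Custom splitting with TokenizeWithFilter (declared below): split on
//   // '-' and drop the delimiter from the output.
//   std::vector<Token> filtered = TokenizeWithFilter("a-b", [](char32 cp) {
//     return FilterResult{/*to_split=*/cp == '-', /*to_keep=*/false};
//   });
//   // -> {Token("a", 0, 1), Token("b", 2, 3)}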

// Returns a list of Tokens for a given input string, by tokenizing on the
// given filter function. The caller controls which codepoints to split on
// and whether a delimiter should be output as a token.
template <typename FilterFn>
std::vector<Token> TokenizeWithFilter(const absl::string_view input,
                                      FilterFn filter) {
  const UnicodeText input_unicode =
      UTF8ToUnicodeText(input, /*do_copy=*/false);

  std::vector<Token> tokens;
  UnicodeText::const_iterator start_it = input_unicode.begin();
  int token_start_codepoint = 0;
  int codepoint_idx = 0;

  for (auto it = input_unicode.begin(); it != input_unicode.end(); ++it) {
    const char32 code_point = *it;
    FilterResult filter_result = filter(code_point);
    if (filter_result.to_split) {
      // Emit the token accumulated since the last split point, if non-empty.
      const std::string token_text = UnicodeText::UTF8Substring(start_it, it);
      if (!token_text.empty()) {
        tokens.push_back(
            Token{token_text, token_start_codepoint, codepoint_idx});
      }
      // Optionally emit the delimiter itself as a single-codepoint token.
      if (filter_result.to_keep) {
        const std::string delimiter =
            UnicodeText::UTF8Substring(it, std::next(it));
        tokens.push_back(Token{delimiter, codepoint_idx, codepoint_idx + 1});
      }
      start_it = std::next(it);
      token_start_codepoint = codepoint_idx + 1;
    }
    codepoint_idx++;
  }
  // Flush the last token if any.
  if (start_it != input_unicode.end()) {
    const std::string token_text =
        UnicodeText::UTF8Substring(start_it, input_unicode.end());
    tokens.push_back(Token{token_text, token_start_codepoint, codepoint_idx});
  }
  return tokens;
}

}  // namespace libtextclassifier3

#endif  // LIBTEXTCLASSIFIER_UTILS_TOKENIZER_UTILS_H_