/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Utilities for tests.

#ifndef LIBTEXTCLASSIFIER_UTILS_TOKENIZER_UTILS_H_
#define LIBTEXTCLASSIFIER_UTILS_TOKENIZER_UTILS_H_

#include <iterator>
#include <string>
#include <vector>

#include "annotator/types.h"
#include "utils/codepoint-range.h"
#include "utils/strings/utf8.h"
#include "utils/utf8/unicodetext.h"
#include "absl/container/flat_hash_set.h"
#include "absl/strings/string_view.h"

namespace libtextclassifier3 {

struct FilterResult {
  // Whether to split on this codepoint.
  bool to_split;
  // If the codepoint is used to split the text, whether to output it as a
  // token.
  bool to_keep;
};

// Returns a list of Tokens for a given input string, by tokenizing on space.
std::vector<Token> TokenizeOnSpace(const std::string& text);
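
// Illustrative usage (a sketch; Token stores {value, start, end} codepoint
// offsets, matching the Token{...} construction in TokenizeWithFilter below):
//   std::vector<Token> tokens = TokenizeOnSpace("Hello world");
//   // tokens: {Token{"Hello", 0, 5}, Token{"world", 6, 11}}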

// Returns a list of Tokens for a given input string, by tokenizing on the
// given set of delimiter codepoints.
// If create_tokens_for_non_space_delimiters is true, creates tokens for
// delimiters which are not whitespace. For example, "This, is" ->
// {"This", ",", "is"}.
std::vector<Token> TokenizeOnDelimiters(
    const std::string& text, const absl::flat_hash_set<char32>& delimiters,
    bool create_tokens_for_non_space_delimiters = false);
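
// Illustrative usage (a sketch, splitting on comma and space):
//   std::vector<Token> tokens = TokenizeOnDelimiters(
//       "This, is", /*delimiters=*/{',', ' '},
//       /*create_tokens_for_non_space_delimiters=*/true);
//   // tokens: {"This", ",", "is"}  (space delimiters are never kept)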

// This replicates how the original bert_tokenizer from the tflite-support
// library pretokenizes text, using regex_split with its default regexes.
// It splits the text on spaces, punctuation, and Chinese characters, and
// outputs all the tokens except spaces.
// So far, the only difference from the original implementation that we are
// aware of is that the original regexes have 8 ranges of Chinese Unicode
// codepoints; we include all 8 of those ranges plus two extra ranges.
std::vector<Token> TokenizeOnWhiteSpacePunctuationAndChineseLetter(
    const absl::string_view text);
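
// Illustrative usage (a sketch; per the comment above, spaces are dropped
// while punctuation becomes its own token):
//   std::vector<Token> tokens =
//       TokenizeOnWhiteSpacePunctuationAndChineseLetter("Hi, there");
//   // tokens: {"Hi", ",", "there"}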

// Returns a list of Tokens for a given input string, by tokenizing on the
// given filter function. The caller controls which codepoints to split on and
// whether a delimiter should be output as a token.
template <typename FilterFn>
std::vector<Token> TokenizeWithFilter(const absl::string_view input,
                                      FilterFn filter) {
  const UnicodeText input_unicode = UTF8ToUnicodeText(input, /*do_copy=*/false);
  std::vector<Token> tokens;
  UnicodeText::const_iterator start_it = input_unicode.begin();
  int token_start_codepoint = 0;
  int codepoint_idx = 0;

  for (auto it = input_unicode.begin(); it != input_unicode.end(); ++it) {
    const char32 code_point = *it;
    FilterResult filter_result = filter(code_point);
    if (filter_result.to_split) {
      // Emit the token accumulated since the last split, if non-empty.
      const std::string token_text = UnicodeText::UTF8Substring(start_it, it);
      if (!token_text.empty()) {
        tokens.push_back(
            Token{token_text, token_start_codepoint, codepoint_idx});
      }
      // Optionally emit the delimiter itself as a single-codepoint token.
      if (filter_result.to_keep) {
        const std::string delimiter =
            UnicodeText::UTF8Substring(it, std::next(it));
        tokens.push_back(Token{delimiter, codepoint_idx, codepoint_idx + 1});
      }
      start_it = std::next(it);
      token_start_codepoint = codepoint_idx + 1;
    }
    codepoint_idx++;
  }
  // Flush the last token if any.
  if (start_it != input_unicode.end()) {
    const std::string token_text =
        UnicodeText::UTF8Substring(start_it, input_unicode.end());
    tokens.push_back(Token{token_text, token_start_codepoint, codepoint_idx});
  }
  return tokens;
}
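
// Illustrative usage: split on commas and keep them as tokens (behavior
// traced from the implementation above):
//   std::vector<Token> tokens = TokenizeWithFilter(
//       "a,b", [](char32 cp) {
//         return FilterResult{/*to_split=*/cp == ',', /*to_keep=*/true};
//       });
//   // tokens: {Token{"a", 0, 1}, Token{",", 1, 2}, Token{"b", 2, 3}}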

}  // namespace libtextclassifier3

#endif  // LIBTEXTCLASSIFIER_UTILS_TOKENIZER_UTILS_H_