/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "utils/tokenizer-utils.h"

// NOTE(review): this file previously carried a system #include whose header
// name was missing. <string> and <vector> are the standard headers the code
// below uses directly; re-add any other header if it turns out to be needed.
#include <string>
#include <vector>

#include "utils/codepoint-range.h"
#include "utils/strings/utf8.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib-common.h"
#include "absl/container/flat_hash_set.h"

namespace libtextclassifier3 {

using libtextclassifier3::Token;

// Tokenizes `text` using the ASCII space character as the only delimiter.
//
// Thin wrapper around TokenizeOnDelimiters(); the third argument of that
// function is left to whatever default its declaration provides.
std::vector<Token> TokenizeOnSpace(const std::string& text) {
  return TokenizeOnDelimiters(text, {' '});
}

// Tokenizes `text`, splitting at every codepoint contained in `delimiters`.
//
// For each codepoint the filter handed to TokenizeWithFilter() reports:
//   * to_split - true iff the codepoint is a member of `delimiters`;
//   * to_keep  - true iff `create_tokens_for_non_space_delimiters` is set and
//                the codepoint is not ' '.
//
// NOTE(review): presumably TokenizeWithFilter() emits a kept split codepoint
// as a token of its own and drops the others - confirm against its
// implementation. Also note that to_keep is computed independently of
// to_split, so it can be true for codepoints that are not delimiters.
std::vector<Token> TokenizeOnDelimiters(
    const std::string& text, const absl::flat_hash_set<char32>& delimiters,
    bool create_tokens_for_non_space_delimiters) {
  // The lambda is only handed to TokenizeWithFilter() for the duration of this
  // call, so capturing `delimiters` by reference is safe here.
  return TokenizeWithFilter(
      text,
      [&delimiters, create_tokens_for_non_space_delimiters](char32 codepoint) {
        const bool to_split = delimiters.contains(codepoint);
        const bool to_keep =
            create_tokens_for_non_space_delimiters && codepoint != ' ';
        return FilterResult{to_split, to_keep};
      });
}

// Tokenizes `text`, splitting at every codepoint for which IsWhitespace(),
// IsPunctuation() or IsChineseLetter() returns true.
//
// Whitespace codepoints are reported with to_keep == false; every other
// codepoint is reported with to_keep == true.
//
// NOTE(review): presumably this means punctuation and Chinese letters each
// become their own token while whitespace is discarded - confirm against
// TokenizeWithFilter().
std::vector<Token> TokenizeOnWhiteSpacePunctuationAndChineseLetter(
    const absl::string_view text) {
  return TokenizeWithFilter(text, [](char32 codepoint) {
    const bool is_whitespace = IsWhitespace(codepoint);
    const bool to_split = is_whitespace || IsPunctuation(codepoint) ||
                          IsChineseLetter(codepoint);
    const bool to_keep = !is_whitespace;
    return FilterResult{to_split, to_keep};
  });
}

}  // namespace libtextclassifier3