/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "utils/normalization.h"

#include "utils/base/logging.h"
#include "utils/normalization_generated.h"

namespace libtextclassifier3 {

// Normalizes `text` using the codepoint-wise operations stored in
// `normalization_options`.
//
// A null `normalization_options` is treated as "no operations requested"
// (an empty bitmask), in which case every codepoint of `text` is copied to the
// result unchanged. Previously a null pointer was dereferenced
// unconditionally.
UnicodeText NormalizeText(const UniLib& unilib,
                          const NormalizationOptions* normalization_options,
                          const UnicodeText& text) {
  const uint32 codepointwise_ops =
      normalization_options != nullptr
          ? normalization_options->codepointwise_normalization()
          : 0;
  return NormalizeTextCodepointWise(unilib, codepointwise_ops, text);
}

// Builds a new UnicodeText from `text` by applying the operations selected in
// the `codepointwise_ops` bitmask (CodepointwiseNormalizationOp_* flags) to
// each codepoint in order:
//   * DROP_WHITESPACE:  codepoints for which unilib.IsWhitespace() is true are
//                       omitted from the result.
//   * DROP_PUNCTUATION: codepoints for which unilib.IsPunctuation() is true
//                       are omitted from the result.
//   * LOWERCASE:        remaining codepoints are mapped via unilib.ToLower().
//   * UPPERCASE:        remaining codepoints are mapped via unilib.ToUpper().
//
// LOWERCASE and UPPERCASE are mutually exclusive; requesting both fails a
// TC3_CHECK.
UnicodeText NormalizeTextCodepointWise(const UniLib& unilib,
                                       const uint32 codepointwise_ops,
                                       const UnicodeText& text) {
  // The flags do not change while iterating, so decode the bitmask once.
  const bool to_lowercase =
      (codepointwise_ops &
       NormalizationOptions_::CodepointwiseNormalizationOp_LOWERCASE) != 0;
  const bool to_uppercase =
      (codepointwise_ops &
       NormalizationOptions_::CodepointwiseNormalizationOp_UPPERCASE) != 0;
  const bool drop_whitespace =
      (codepointwise_ops &
       NormalizationOptions_::CodepointwiseNormalizationOp_DROP_WHITESPACE) !=
      0;
  const bool drop_punctuation =
      (codepointwise_ops &
       NormalizationOptions_::CodepointwiseNormalizationOp_DROP_PUNCTUATION) !=
      0;

  // Sanity check: lowercasing and uppercasing cannot both be requested.
  TC3_CHECK(!(to_lowercase && to_uppercase));

  UnicodeText result;
  for (const char32 codepoint : text) {
    // Skip whitespace.
    if (drop_whitespace && unilib.IsWhitespace(codepoint)) {
      continue;
    }

    // Skip punctuation.
    if (drop_punctuation && unilib.IsPunctuation(codepoint)) {
      continue;
    }

    // Kept as char32 (same type as the loop variable) so the value is not
    // round-tripped through a signed integer.
    char32 normalized_codepoint = codepoint;

    if (to_lowercase) {
      // Lower case.
      normalized_codepoint = unilib.ToLower(normalized_codepoint);
    } else if (to_uppercase) {
      // Upper case.
      normalized_codepoint = unilib.ToUpper(normalized_codepoint);
    }

    result.push_back(normalized_codepoint);
  }
  return result;
}

}  // namespace libtextclassifier3