You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
202 lines
7.3 KiB
202 lines
7.3 KiB
/*
|
|
* Copyright (C) 2018 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include "annotator/translate/translate.h"

#include <algorithm>
#include <iterator>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "annotator/collections.h"
#include "annotator/entity-data_generated.h"
#include "annotator/types.h"
#include "lang_id/lang-id-wrapper.h"
#include "lang_id/lang-id.h"
#include "utils/base/logging.h"
#include "utils/i18n/locale.h"
#include "utils/utf8/unicodetext.h"
|
|
|
|
namespace libtextclassifier3 {
|
|
|
|
bool TranslateAnnotator::ClassifyText(
|
|
const UnicodeText& context, CodepointSpan selection_indices,
|
|
const std::string& user_familiar_language_tags,
|
|
ClassificationResult* classification_result) const {
|
|
std::vector<TranslateAnnotator::LanguageConfidence> confidences;
|
|
if (options_->algorithm() ==
|
|
TranslateAnnotatorOptions_::Algorithm::Algorithm_BACKOFF) {
|
|
if (options_->backoff_options() == nullptr) {
|
|
TC3_LOG(WARNING) << "No backoff options specified. Returning.";
|
|
return false;
|
|
}
|
|
confidences = BackoffDetectLanguages(context, selection_indices);
|
|
}
|
|
|
|
if (confidences.empty()) {
|
|
return false;
|
|
}
|
|
|
|
std::vector<Locale> user_familiar_languages;
|
|
if (!ParseLocales(user_familiar_language_tags, &user_familiar_languages)) {
|
|
TC3_LOG(WARNING) << "Couldn't parse the user-understood languages.";
|
|
return false;
|
|
}
|
|
if (user_familiar_languages.empty()) {
|
|
TC3_VLOG(INFO) << "user_familiar_languages is not set, not suggesting "
|
|
"translate action.";
|
|
return false;
|
|
}
|
|
bool user_can_understand_language_of_text = false;
|
|
for (const Locale& locale : user_familiar_languages) {
|
|
if (locale.Language() == confidences[0].language) {
|
|
user_can_understand_language_of_text = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!user_can_understand_language_of_text) {
|
|
classification_result->collection = Collections::Translate();
|
|
classification_result->score = options_->score();
|
|
classification_result->priority_score = options_->priority_score();
|
|
classification_result->serialized_entity_data =
|
|
CreateSerializedEntityData(confidences);
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
std::string TranslateAnnotator::CreateSerializedEntityData(
|
|
const std::vector<TranslateAnnotator::LanguageConfidence>& confidences)
|
|
const {
|
|
EntityDataT entity_data;
|
|
entity_data.translate.reset(new EntityData_::TranslateT());
|
|
|
|
for (const LanguageConfidence& confidence : confidences) {
|
|
EntityData_::Translate_::LanguagePredictionResultT*
|
|
language_prediction_result =
|
|
new EntityData_::Translate_::LanguagePredictionResultT();
|
|
language_prediction_result->language_tag = confidence.language;
|
|
language_prediction_result->confidence_score = confidence.confidence;
|
|
entity_data.translate->language_prediction_results.emplace_back(
|
|
language_prediction_result);
|
|
}
|
|
flatbuffers::FlatBufferBuilder builder;
|
|
FinishEntityDataBuffer(builder, EntityData::Pack(builder, &entity_data));
|
|
return std::string(reinterpret_cast<const char*>(builder.GetBufferPointer()),
|
|
builder.GetSize());
|
|
}
|
|
|
|
std::vector<TranslateAnnotator::LanguageConfidence>
|
|
TranslateAnnotator::BackoffDetectLanguages(
|
|
const UnicodeText& context, CodepointSpan selection_indices) const {
|
|
const float penalize_ratio = options_->backoff_options()->penalize_ratio();
|
|
const int min_text_size = options_->backoff_options()->min_text_size();
|
|
if (selection_indices.second - selection_indices.first < min_text_size &&
|
|
penalize_ratio <= 0) {
|
|
return {};
|
|
}
|
|
|
|
const UnicodeText entity =
|
|
UnicodeText::Substring(context, selection_indices.first,
|
|
selection_indices.second, /*do_copy=*/false);
|
|
const std::vector<std::pair<std::string, float>> lang_id_result =
|
|
langid::GetPredictions(langid_model_, entity.data(), entity.size_bytes());
|
|
|
|
const float more_text_score_ratio =
|
|
1.0f - options_->backoff_options()->subject_text_score_ratio();
|
|
std::vector<std::pair<std::string, float>> more_lang_id_results;
|
|
if (more_text_score_ratio >= 0) {
|
|
const UnicodeText entity_with_context = TokenAlignedSubstringAroundSpan(
|
|
context, selection_indices, min_text_size);
|
|
more_lang_id_results =
|
|
langid::GetPredictions(langid_model_, entity_with_context.data(),
|
|
entity_with_context.size_bytes());
|
|
}
|
|
|
|
const float subject_text_score_ratio =
|
|
options_->backoff_options()->subject_text_score_ratio();
|
|
|
|
std::map<std::string, float> result_map;
|
|
for (const auto& [language, score] : lang_id_result) {
|
|
result_map[language] = subject_text_score_ratio * score;
|
|
}
|
|
for (const auto& [language, score] : more_lang_id_results) {
|
|
result_map[language] += more_text_score_ratio * score * penalize_ratio;
|
|
}
|
|
|
|
std::vector<TranslateAnnotator::LanguageConfidence> result;
|
|
result.reserve(result_map.size());
|
|
for (const auto& [key, value] : result_map) {
|
|
result.push_back({key, value});
|
|
}
|
|
|
|
std::sort(result.begin(), result.end(),
|
|
[](TranslateAnnotator::LanguageConfidence& a,
|
|
TranslateAnnotator::LanguageConfidence& b) {
|
|
return a.confidence > b.confidence;
|
|
});
|
|
return result;
|
|
}
|
|
|
|
// Walks `text` one codepoint at a time from codepoint offset `start_index`,
// forward when `direction` is 1 and backward when it is -1 (any other value
// fails the check), until a whitespace or punctuation codepoint is found.
//
// Returns an iterator to that codepoint. If none is found, returns
// `text.end()` for a forward walk and `text.begin()` for a backward walk. A
// backward walk that reaches `text.begin()` returns it without inspecting the
// codepoint there.
//
// `start_index` is not range-checked; presumably callers pass a value in
// [0, number of codepoints] -- TODO confirm.
UnicodeText::const_iterator
TranslateAnnotator::FindIndexOfNextWhitespaceOrPunctuation(
    const UnicodeText& text, int start_index, int direction) const {
  TC3_CHECK(direction == 1 || direction == -1);
  const auto first = text.begin();
  const auto last = text.end();
  auto it = first;
  std::advance(it, start_index);
  while (it < last) {
    // Only a backward walk has to stop at the first codepoint. Applying this
    // guard to forward walks as well would make a forward search starting at
    // index 0 return immediately without looking at any character.
    if (direction == -1 && !(it > first)) {
      break;
    }
    if (unilib_->IsWhitespace(*it) || unilib_->IsPunctuation(*it)) {
      break;
    }
    std::advance(it, direction);
  }
  return it;
}
|
|
|
|
// Returns a non-owning (do_copy=false) piece of `text` around the codepoint
// span `indices`, at least roughly `minimum_length` codepoints long and
// extended outwards to whitespace/punctuation boundaries.
//
//   - If the whole text is shorter than `minimum_length`, the whole text is
//     returned.
//   - If the span is already at least `minimum_length` long, exactly the span
//     is returned.
//   - Otherwise a window of `minimum_length` codepoints is placed around the
//     span (clamped to the text), and each edge is pushed outwards to the next
//     whitespace or punctuation codepoint, or to the edge of the text.
UnicodeText TranslateAnnotator::TokenAlignedSubstringAroundSpan(
    const UnicodeText& text, CodepointSpan indices, int minimum_length) const {
  const int total_codepoints = text.size_codepoints();
  if (total_codepoints < minimum_length) {
    // Not enough text to build a window; use all of it.
    return UnicodeText(text, /*do_copy=*/false);
  }

  const int span_begin = indices.first;
  const int span_end = indices.second;
  const int span_length = span_end - span_begin;
  if (span_length >= minimum_length) {
    // The span is long enough on its own.
    return UnicodeText::Substring(text, span_begin, span_end,
                                  /*do_copy=*/false);
  }

  // Split the missing length evenly between both sides of the span, then
  // clamp the window so that it stays inside the text.
  const int padding = (minimum_length - span_length) / 2;
  const int window_begin = std::max(
      0, std::min(span_begin - padding, total_codepoints - minimum_length));
  const int window_end =
      std::min(total_codepoints, window_begin + minimum_length);

  // Push both window edges outwards to the nearest token boundary.
  auto left = FindIndexOfNextWhitespaceOrPunctuation(text, window_begin, -1);
  const auto right =
      FindIndexOfNextWhitespaceOrPunctuation(text, window_end, 1);

  // `left` now sits on whitespace/punctuation unless the search ran into the
  // start of the text. When it sits on whitespace, step over it so that the
  // result begins with actual text.
  if (left != right && unilib_->IsWhitespace(*left)) {
    ++left;
  }

  return UnicodeText::Substring(left, right, /*do_copy=*/false);
}
|
|
|
|
} // namespace libtextclassifier3
|