You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
82 lines
3.2 KiB
82 lines
3.2 KiB
/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
#ifndef LIBTEXTCLASSIFIER_ANNOTATOR_TRANSLATE_TRANSLATE_H_
|
|
#define LIBTEXTCLASSIFIER_ANNOTATOR_TRANSLATE_TRANSLATE_H_
|
|
|
|
#include "annotator/model_generated.h"
|
|
#include "annotator/types.h"
|
|
#include "utils/utf8/unicodetext.h"
|
|
#include "utils/utf8/unilib.h"
|
|
#include "lang_id/lang-id.h"
|
|
|
|
namespace libtextclassifier3 {
|
|
|
|
// Returns classification with "translate" when the input text is in a language
|
|
// not understood by the user.
|
|
class TranslateAnnotator {
|
|
public:
|
|
TranslateAnnotator(const TranslateAnnotatorOptions* options,
|
|
const libtextclassifier3::mobile::lang_id::LangId* langid_model,
|
|
const UniLib* unilib)
|
|
: options_(options), langid_model_(langid_model), unilib_(unilib) {}
|
|
|
|
// Returns true if a classification_result was filled with "translate"
|
|
// classification.
|
|
bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices,
|
|
const std::string& user_familiar_language_tags,
|
|
ClassificationResult* classification_result) const;
|
|
|
|
protected:
|
|
struct LanguageConfidence {
|
|
std::string language;
|
|
float confidence = -1.0;
|
|
};
|
|
|
|
// Detects language of the selection in given context using the "Backoff
|
|
// algorithm", sorted by the score descendingly. It is based on several
|
|
// heuristics, see the code. This is the same algorithm that TextClassifier
|
|
// uses in Android Q.
|
|
std::vector<LanguageConfidence> BackoffDetectLanguages(
|
|
const UnicodeText& context, CodepointSpan selection_indices) const;
|
|
|
|
// Returns the iterator of the next whitespace/punctuation character in given
|
|
// text, starting from given position and going forward (iff direction == 1),
|
|
// and backward (iff direction == -1).
|
|
UnicodeText::const_iterator FindIndexOfNextWhitespaceOrPunctuation(
|
|
const UnicodeText& text, int start_index, int direction) const;
|
|
|
|
// Returns substring from given text, centered around the specified indices,
|
|
// of certain minimum length. The substring is token aligned, so it is
|
|
// guaranteed that the words won't be broken down.
|
|
UnicodeText TokenAlignedSubstringAroundSpan(const UnicodeText& text,
|
|
CodepointSpan indices,
|
|
int minimum_length) const;
|
|
|
|
private:
|
|
std::string CreateSerializedEntityData(
|
|
const std::vector<TranslateAnnotator::LanguageConfidence>& confidences)
|
|
const;
|
|
|
|
const TranslateAnnotatorOptions* options_;
|
|
const libtextclassifier3::mobile::lang_id::LangId* langid_model_;
|
|
const UniLib* unilib_;
|
|
};
|
|
|
|
} // namespace libtextclassifier3
|
|
|
|
#endif // LIBTEXTCLASSIFIER_ANNOTATOR_TRANSLATE_TRANSLATE_H_
|