/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "utils/token-feature-extractor.h"

#include <cctype>
#include <string>

#include "utils/base/logging.h"
#include "utils/hash/farmhash.h"
#include "utils/strings/stringpiece.h"
#include "utils/utf8/unicodetext.h"

namespace libtextclassifier3 {

namespace {

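// Returns a copy of the token with digits remapped to '0' and/or characters
// lowercased, as requested by the options. ASCII-only variant; returns the
// token unchanged when no remapping is requested.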
std::string RemapTokenAscii(const std::string& token,
                            const TokenFeatureExtractorOptions& options) {
  if (!options.remap_digits && !options.lowercase_tokens) {
    return token;
  }

  std::string copy = token;
  for (int i = 0; i < token.size(); ++i) {
    if (options.remap_digits && isdigit(copy[i])) {
      copy[i] = '0';
    }
    if (options.lowercase_tokens) {
      copy[i] = tolower(copy[i]);
    }
  }
  return copy;
}

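// Unicode-aware variant of the remapping above: writes the remapped
// codepoints of the token into `remapped`, using UniLib for digit detection
// and lowercasing. Leaves `remapped` untouched when no remapping is
// requested.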
void RemapTokenUnicode(const std::string& token,
                       const TokenFeatureExtractorOptions& options,
                       const UniLib& unilib, UnicodeText* remapped) {
  if (!options.remap_digits && !options.lowercase_tokens) {
    // Leave remapped untouched.
    return;
  }

  UnicodeText word = UTF8ToUnicodeText(token, /*do_copy=*/false);
  remapped->clear();
  for (auto it = word.begin(); it != word.end(); ++it) {
    if (options.remap_digits && unilib.IsDigit(*it)) {
      remapped->push_back('0');
    } else if (options.lowercase_tokens) {
      remapped->push_back(unilib.ToLower(*it));
    } else {
      remapped->push_back(*it);
    }
  }
}

}  // namespace

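// Compiles the configured regexp features once at construction so that
// feature extraction only has to match them against each token.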
TokenFeatureExtractor::TokenFeatureExtractor(
    const TokenFeatureExtractorOptions& options, const UniLib* unilib)
    : options_(options), unilib_(*unilib) {
  for (const std::string& pattern : options.regexp_features) {
    regex_patterns_.push_back(std::unique_ptr<UniLib::RegexPattern>(
        unilib_.CreateRegexPattern(UTF8ToUnicodeText(
            pattern.c_str(), pattern.size(), /*do_copy=*/false))));
  }
}

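// Extracts sparse (charactergram) and dense features for the token. Dense
// features are mandatory; sparse features are computed only when a non-null
// output vector is provided.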
bool TokenFeatureExtractor::Extract(const Token& token, bool is_in_span,
                                    std::vector<int>* sparse_features,
                                    std::vector<float>* dense_features) const {
  if (!dense_features) {
    return false;
  }
  if (sparse_features) {
    *sparse_features = ExtractCharactergramFeatures(token);
  }
  *dense_features = ExtractDenseFeatures(token, is_in_span);
  return true;
}

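// Dispatches to the Unicode- or ASCII-based charactergram extraction,
// depending on the unicode_aware_features option.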
std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeatures(
    const Token& token) const {
  if (options_.unicode_aware_features) {
    return ExtractCharactergramFeaturesUnicode(token);
  } else {
    return ExtractCharactergramFeaturesAscii(token);
  }
}

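// Produces the dense feature vector: an optional case feature (+1 if the
// token starts with an upper-case character, -1 otherwise), an optional
// selection-mask feature, and one feature per configured regexp pattern
// (+1 on match, -1 otherwise).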
std::vector<float> TokenFeatureExtractor::ExtractDenseFeatures(
    const Token& token, bool is_in_span) const {
  std::vector<float> dense_features;

  if (options_.extract_case_feature) {
    if (options_.unicode_aware_features) {
      UnicodeText token_unicode =
          UTF8ToUnicodeText(token.value, /*do_copy=*/false);
      if (!token.value.empty() && unilib_.IsUpper(*token_unicode.begin())) {
        dense_features.push_back(1.0);
      } else {
        dense_features.push_back(-1.0);
      }
    } else {
      if (!token.value.empty() && isupper(*token.value.begin())) {
        dense_features.push_back(1.0);
      } else {
        dense_features.push_back(-1.0);
      }
    }
  }

  if (options_.extract_selection_mask_feature) {
    if (is_in_span) {
      dense_features.push_back(1.0);
    } else {
      if (options_.unicode_aware_features) {
        dense_features.push_back(-1.0);
      } else {
        dense_features.push_back(0.0);
      }
    }
  }

  // Add regexp features.
  if (!regex_patterns_.empty()) {
    UnicodeText token_unicode =
        UTF8ToUnicodeText(token.value, /*do_copy=*/false);
    for (int i = 0; i < regex_patterns_.size(); ++i) {
      if (!regex_patterns_[i].get()) {
        dense_features.push_back(-1.0);
        continue;
      }
      auto matcher = regex_patterns_[i]->Matcher(token_unicode);
      int status;
      if (matcher->Matches(&status)) {
        dense_features.push_back(1.0);
      } else {
        dense_features.push_back(-1.0);
      }
    }
  }

  return dense_features;
}

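// Hashes a charactergram into an embedding bucket. When an allowed-chargram
// vocabulary is configured, bucket 1 is reserved for "<PAD>" and bucket 0 for
// out-of-vocabulary charactergrams; everything else hashes into the remaining
// buckets.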
int TokenFeatureExtractor::HashToken(StringPiece token) const {
  if (options_.allowed_chargrams.empty()) {
    return tc3farmhash::Fingerprint64(token) % options_.num_buckets;
  } else {
    // Padding and out-of-vocabulary tokens have extra buckets reserved because
    // they are special and important tokens, and we don't want them to share
    // embedding with other charactergrams.
    // TODO(zilka): Experimentally verify.
    const int kNumExtraBuckets = 2;
    const std::string token_string = token.ToString();
    if (token_string == "<PAD>") {
      return 1;
    } else if (options_.allowed_chargrams.find(token_string) ==
               options_.allowed_chargrams.end()) {
      return 0;  // Out-of-vocabulary.
    } else {
      return (tc3farmhash::Fingerprint64(token) %
              (options_.num_buckets - kNumExtraBuckets)) +
             kNumExtraBuckets;
    }
  }
}

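// ASCII version of the charactergram extraction: remaps the token, trims it
// to max_word_length characters (keeping the head and tail halves joined by
// '\1'), wraps it in "^...$" markers, and emits hashed character n-grams for
// every configured chargram order.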
std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeaturesAscii(
    const Token& token) const {
  std::vector<int> result;
  if (token.is_padding || token.value.empty()) {
    result.push_back(HashToken("<PAD>"));
  } else {
    const std::string word = RemapTokenAscii(token.value, options_);

    // Trim words that are over max_word_length characters.
    const int max_word_length = options_.max_word_length;
    std::string feature_word;
    if (word.size() > max_word_length) {
      feature_word =
          "^" + word.substr(0, max_word_length / 2) + "\1" +
          word.substr(word.size() - max_word_length / 2, max_word_length / 2) +
          "$";
    } else {
      // Add a prefix and suffix to the word.
      feature_word = "^" + word + "$";
    }

    // Upper-bound the number of charactergram extracted to avoid resizing.
    result.reserve(options_.chargram_orders.size() * feature_word.size());

    if (options_.chargram_orders.empty()) {
      result.push_back(HashToken(feature_word));
    } else {
      // Generate the character-grams.
      for (int chargram_order : options_.chargram_orders) {
        if (chargram_order == 1) {
          for (int i = 1; i < feature_word.size() - 1; ++i) {
            result.push_back(
                HashToken(StringPiece(feature_word, /*offset=*/i, /*len=*/1)));
          }
        } else {
          for (int i = 0;
               i < static_cast<int>(feature_word.size()) - chargram_order + 1;
               ++i) {
            result.push_back(HashToken(StringPiece(feature_word, /*offset=*/i,
                                                   /*len=*/chargram_order)));
          }
        }
      }
    }
  }
  return result;
}

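// Unicode version of the charactergram extraction; same scheme as the ASCII
// version above, but trimming and n-gram generation operate on codepoints
// rather than bytes.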
std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeaturesUnicode(
    const Token& token) const {
  std::vector<int> result;
  if (token.is_padding || token.value.empty()) {
    result.push_back(HashToken("<PAD>"));
  } else {
    UnicodeText word = UTF8ToUnicodeText(token.value, /*do_copy=*/false);
    RemapTokenUnicode(token.value, options_, unilib_, &word);

    // Trim the word if needed by finding a left-cut point and right-cut point.
    auto left_cut = word.begin();
    auto right_cut = word.end();
    for (int i = 0; i < options_.max_word_length / 2; i++) {
      if (left_cut < right_cut) {
        ++left_cut;
      }
      if (left_cut < right_cut) {
        --right_cut;
      }
    }

    std::string feature_word;
    if (left_cut == right_cut) {
      feature_word = "^" + word.UTF8Substring(word.begin(), word.end()) + "$";
    } else {
      // clang-format off
      feature_word = "^" +
                     word.UTF8Substring(word.begin(), left_cut) +
                     "\1" +
                     word.UTF8Substring(right_cut, word.end()) +
                     "$";
      // clang-format on
    }

    const UnicodeText feature_word_unicode =
        UTF8ToUnicodeText(feature_word, /*do_copy=*/false);

    // Upper-bound the number of charactergram extracted to avoid resizing.
    result.reserve(options_.chargram_orders.size() * feature_word.size());

    if (options_.chargram_orders.empty()) {
      result.push_back(HashToken(feature_word));
    } else {
      // Generate the character-grams.
      for (int chargram_order : options_.chargram_orders) {
        UnicodeText::const_iterator it_start = feature_word_unicode.begin();
        UnicodeText::const_iterator it_end = feature_word_unicode.end();
        if (chargram_order == 1) {
          ++it_start;
          --it_end;
        }

        UnicodeText::const_iterator it_chargram_start = it_start;
        UnicodeText::const_iterator it_chargram_end = it_start;
        bool chargram_is_complete = true;
        for (int i = 0; i < chargram_order; ++i) {
          if (it_chargram_end == it_end) {
            chargram_is_complete = false;
            break;
          }
          ++it_chargram_end;
        }
        if (!chargram_is_complete) {
          continue;
        }

        for (; it_chargram_end <= it_end;
             ++it_chargram_start, ++it_chargram_end) {
          const int length_bytes =
              it_chargram_end.utf8_data() - it_chargram_start.utf8_data();
          result.push_back(HashToken(
              StringPiece(it_chargram_start.utf8_data(), length_bytes)));
        }
      }
    }
  }
  return result;
}

}  // namespace libtextclassifier3