You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
163 lines
5.4 KiB
163 lines
5.4 KiB
/*
|
|
* Copyright (C) 2018 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include "lang_id/custom-tokenizer.h"
|
|
|
|
#include <ctype.h>
|
|
|
|
#include <string>
|
|
|
|
#include "lang_id/common/lite_base/attributes.h"
|
|
#include "lang_id/common/lite_base/logging.h"
|
|
#include "lang_id/common/utf8.h"
|
|
#include "utf.h"
|
|
|
|
namespace libtextclassifier3 {
|
|
namespace mobile {
|
|
namespace lang_id {
|
|
|
|
namespace {
|
|
inline bool IsTokenSeparator(int num_bytes, const char *curr) {
|
|
if (num_bytes != 1) {
|
|
return false;
|
|
}
|
|
return !isalpha(*curr);
|
|
}
|
|
|
|
// Appends to *word the UTF8 encoding for the lowercase version of the UTF8
|
|
// character that starts at |curr| and has |num_bytes| bytes.
|
|
//
|
|
// NOTE: if the current UTF8 character does not have a lowercase version, then
|
|
// we append the original UTF8 character.
|
|
inline SAFTM_ATTRIBUTE_ALWAYS_INLINE void AppendLowerCase(const char *curr,
|
|
int num_bytes,
|
|
std::string *word) {
|
|
if (num_bytes == 1) {
|
|
// Optimize the ASCII case.
|
|
word->push_back(tolower(*curr));
|
|
return;
|
|
}
|
|
|
|
// Harder, general case.
|
|
//
|
|
// NOTE: for lowercasing, we use the utils from utf.h:
|
|
// charntorune + tolowerrune + runetochar. Unfortunately, that library does
|
|
// not contain any fast util for determining the number of bytes for the UTF8
|
|
// character that starts at a given address *without* converting to a full
|
|
// codepoint (like our utils::OneCharLen, which is used intensively by the
|
|
// rest of our code, including by the performance-critical char ngram
|
|
// feature). Hence, the rest of our code continues to use utils::OneCharLen,
|
|
// and here, when we append the bytes to *word, we make sure that's consistent
|
|
// with utils::OneCharLen.
|
|
|
|
// charntorune() below reads the UTF8 character that starts at curr (using at
|
|
// most num_bytes bytes) and stores the corresponding codepoint into rune.
|
|
Rune rune;
|
|
charntorune(&rune, curr, num_bytes);
|
|
if (rune != Runeerror) {
|
|
Rune lower = tolowerrune(rune);
|
|
char lower_buf[UTFmax];
|
|
runetochar(lower_buf, &lower);
|
|
|
|
// When appending the UTF8 bytes to word, we do not use the number of bytes
|
|
// returned by runetochar(); instead, we use utils::OneCharLen(), the same
|
|
// method used by the char ngram feature. We expect them to be equal, but
|
|
// just in case.
|
|
int lower_num_bytes = utils::OneCharLen(lower_buf);
|
|
|
|
// Using lower_num_bytes below is safe, because, by definition of UTFmax,
|
|
SAFTM_DCHECK_GE(UTFmax, 4);
|
|
|
|
// And, by implementation of utils::OneCharLen():
|
|
SAFTM_DCHECK_GT(lower_num_bytes, 0);
|
|
SAFTM_DCHECK_LE(lower_num_bytes, 4);
|
|
word->append(lower_buf, lower_num_bytes);
|
|
} else {
|
|
// There are sequences of bytes that charntorune() can't convert into a
|
|
// valid Rune (a special case is [0xEF, 0xBF, 0xBD], the UTF8 encoding for
|
|
// the U+FFFD special Unicode character, which is also the value of
|
|
// Runeerror). We keep those bytes unchanged.
|
|
word->append(curr, num_bytes);
|
|
}
|
|
}
|
|
} // namespace
|
|
|
|
void TokenizerForLangId::Setup(TaskContext *context) {
|
|
lowercase_input_ = context->Get("lang_id_lowercase_input", false);
|
|
}
|
|
|
|
void TokenizerForLangId::Tokenize(StringPiece text,
|
|
LightSentence *sentence) const {
|
|
const char *const start = text.data();
|
|
const char *curr = start;
|
|
const char *end = utils::GetSafeEndOfUtf8String(start, text.size());
|
|
|
|
// Corner case: the safe part of the text is empty ("").
|
|
if (curr >= end) {
|
|
return;
|
|
}
|
|
|
|
// Number of bytes for UTF8 character starting at *curr. Note: the loop below
|
|
// is guaranteed to terminate because in each iteration, we move curr by at
|
|
// least num_bytes, and num_bytes is guaranteed to be > 0.
|
|
int num_bytes = utils::OneCharLen(curr);
|
|
while (curr < end) {
|
|
// Jump over consecutive token separators.
|
|
while (IsTokenSeparator(num_bytes, curr)) {
|
|
curr += num_bytes;
|
|
if (curr >= end) {
|
|
return;
|
|
}
|
|
num_bytes = utils::OneCharLen(curr);
|
|
}
|
|
|
|
// If control reaches this point, we are at beginning of a non-empty token.
|
|
sentence->emplace_back();
|
|
std::string *word = &(sentence->back());
|
|
|
|
// Add special token-start character.
|
|
word->push_back('^');
|
|
|
|
// Add UTF8 characters to word, until we hit the end of the safe text or a
|
|
// token separator.
|
|
while (true) {
|
|
if (lowercase_input_) {
|
|
AppendLowerCase(curr, num_bytes, word);
|
|
} else {
|
|
word->append(curr, num_bytes);
|
|
}
|
|
curr += num_bytes;
|
|
if (curr >= end) {
|
|
break;
|
|
}
|
|
num_bytes = utils::OneCharLen(curr);
|
|
if (IsTokenSeparator(num_bytes, curr)) {
|
|
curr += num_bytes;
|
|
if (curr >= end) {
|
|
break;
|
|
}
|
|
num_bytes = utils::OneCharLen(curr);
|
|
break;
|
|
}
|
|
}
|
|
word->push_back('$');
|
|
}
|
|
}
|
|
|
|
} // namespace lang_id
|
|
} // namespace mobile
|
|
} // namespace nlp_saft
|