You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
95 lines
2.9 KiB
95 lines
2.9 KiB
/*
|
|
* Copyright (C) 2018 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include "lang_id/script/approx-script.h"
|
|
|
|
#include "lang_id/common/lite_base/integral-types.h"
|
|
#include "lang_id/common/lite_base/logging.h"
|
|
#include "lang_id/common/utf8.h"
|
|
#include "lang_id/script/approx-script-data.h"
|
|
|
|
namespace libtextclassifier3 {
|
|
namespace mobile {
|
|
|
|
// int value of USCRIPT_UNKNOWN from enum UScriptCode (from
|
|
// unicode/uscript.h). Note: we do have a test that
|
|
// USCRIPT_UNKNOWN evaluates to 103.
|
|
// Returned by BinarySearch() below for any codepoint that lies in none of the
// known ranges.
const int kUnknownUscript = 103;
|
|
|
|
namespace {
|
|
using approx_script_internal::kNumRanges;
|
|
using approx_script_internal::kRangeFirst;
|
|
using approx_script_internal::kRangeScript;
|
|
using approx_script_internal::kRangeSizeMinusOne;
|
|
|
|
// Decodes the single UTF-8 encoded character that starts at |s| and occupies
// |num_bytes| bytes (1 to 4) into its codepoint.
//
// Only the payload bits are extracted: the lead byte is masked according to
// the sequence length and every following byte contributes its low 6 bits.
// No validation of the byte values is performed.  For any other |num_bytes|,
// logs via SAFTM_DLOG(FATAL) and returns 0.
uint32 Utf8ToCodepoint(const unsigned char *s, int num_bytes) {
  // Payload mask for the lead byte, indexed by sequence length.  Index 0 is
  // unused; a 1-byte sequence keeps the whole byte.
  static const unsigned char kLeadByteMask[] = {0x00, 0xFF, 0x1F, 0x0F, 0x07};

  if (num_bytes < 1 || num_bytes > 4) {
    SAFTM_DLOG(FATAL) << "Illegal num_bytes: " << num_bytes;
    return 0;
  }

  uint32 codepoint = s[0] & kLeadByteMask[num_bytes];
  for (int i = 1; i < num_bytes; ++i) {
    // Each continuation byte appends 6 more payload bits.
    codepoint = (codepoint << 6) | (s[i] & 0x3F);
  }
  return codepoint;
}
|
|
|
|
// Looks up the script of |codepoint| among the ranges with indices in
// [start, end).  Range i begins at kRangeFirst[i], spans
// kRangeSizeMinusOne[i] + 1 codepoints and has script kRangeScript[i].
//
// Returns the script of the range containing |codepoint|, or kUnknownUscript
// if no candidate range contains it (including when [start, end) is empty).
inline int BinarySearch(uint32 codepoint, int start, int end) {
  int lo = start;
  int hi = end;

  // Narrow [lo, hi) down to at most one candidate.  While hi > lo + 1, the
  // midpoint satisfies lo < mid < hi, so either assignment below strictly
  // shrinks hi - lo and the loop terminates once that difference is at most 1.
  while (lo + 1 < hi) {
    const int mid = (lo + hi) / 2;
    if (kRangeFirst[mid] > codepoint) {
      hi = mid;
    } else {
      lo = mid;
    }
  }

  // Nothing left to check (the initial interval was empty).
  if (hi != lo + 1) {
    return kUnknownUscript;
  }

  // Exactly one candidate range remains: test whether it really covers the
  // codepoint, since there may be gaps between consecutive ranges.
  const uint32 first = kRangeFirst[lo];
  if (codepoint < first) {
    return kUnknownUscript;
  }
  if (codepoint > first + kRangeSizeMinusOne[lo]) {
    return kUnknownUscript;
  }
  return kRangeScript[lo];
}
|
|
} // namespace
|
|
|
|
int GetApproxScript(const unsigned char *s, int num_bytes) {
|
|
SAFTM_DCHECK_NE(s, nullptr);
|
|
SAFTM_DCHECK_EQ(num_bytes,
|
|
utils::OneCharLen(reinterpret_cast<const char *>(s)));
|
|
uint32 codepoint = Utf8ToCodepoint(s, num_bytes);
|
|
return BinarySearch(codepoint, 0, kNumRanges);
|
|
}
|
|
|
|
int GetMaxApproxScriptResult() { return approx_script_internal::kMaxScript; }
|
|
|
|
// NOTE(review): presumably registers ApproxScriptDetector (not defined in this
// file) with a static registry -- confirm against the definition of
// SAFTM_STATIC_REGISTRATION.
SAFTM_STATIC_REGISTRATION(ApproxScriptDetector);
|
|
|
|
} // namespace mobile
|
|
}  // namespace libtextclassifier3
|