You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

548 lines
17 KiB

/*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "utils/utf8/unilib-javaicu.h"
#include <math.h>
#include <cassert>
#include <cctype>
#include <map>
#include "utils/base/logging.h"
#include "utils/base/statusor.h"
#include "utils/java/jni-base.h"
#include "utils/java/jni-helper.h"
#include "utils/utf8/unicodetext.h"
namespace libtextclassifier3 {
// Default constructor. Not usable with this backend: it only logs a FATAL
// message stating that a JniCache is required.
UniLibBase::UniLibBase() {
TC3_LOG(FATAL) << "Java ICU UniLib must be initialized with a JniCache.";
}
// Constructs a UniLib that keeps a shared reference to `jni_cache`; the
// JVM-backed methods below go through this cache.
UniLibBase::UniLibBase(const std::shared_ptr<JniCache>& jni_cache)
: jni_cache_(jni_cache) {}
// Forwards to the namespace-level libtextclassifier3::IsOpeningBracket().
bool UniLibBase::IsOpeningBracket(char32 codepoint) const {
return libtextclassifier3::IsOpeningBracket(codepoint);
}
// Forwards to the namespace-level libtextclassifier3::IsClosingBracket().
bool UniLibBase::IsClosingBracket(char32 codepoint) const {
return libtextclassifier3::IsClosingBracket(codepoint);
}
// Forwards to the namespace-level libtextclassifier3::IsWhitespace().
bool UniLibBase::IsWhitespace(char32 codepoint) const {
return libtextclassifier3::IsWhitespace(codepoint);
}
// Forwards to the namespace-level libtextclassifier3::IsDigit().
bool UniLibBase::IsDigit(char32 codepoint) const {
return libtextclassifier3::IsDigit(codepoint);
}
// Forwards to the namespace-level libtextclassifier3::IsLower().
bool UniLibBase::IsLower(char32 codepoint) const {
return libtextclassifier3::IsLower(codepoint);
}
// Forwards to the namespace-level libtextclassifier3::IsUpper().
bool UniLibBase::IsUpper(char32 codepoint) const {
return libtextclassifier3::IsUpper(codepoint);
}
// Forwards to the namespace-level libtextclassifier3::IsPunctuation().
bool UniLibBase::IsPunctuation(char32 codepoint) const {
return libtextclassifier3::IsPunctuation(codepoint);
}
// Forwards to the namespace-level libtextclassifier3::ToLower().
char32 UniLibBase::ToLower(char32 codepoint) const {
return libtextclassifier3::ToLower(codepoint);
}
// Forwards to the namespace-level libtextclassifier3::ToUpper().
char32 UniLibBase::ToUpper(char32 codepoint) const {
return libtextclassifier3::ToUpper(codepoint);
}
// Forwards to the namespace-level libtextclassifier3::GetPairedBracket().
char32 UniLibBase::GetPairedBracket(char32 codepoint) const {
return libtextclassifier3::GetPairedBracket(codepoint);
}
// -----------------------------------------------------------------------------
// Implementations that call out to JVM. Behold the beauty.
// -----------------------------------------------------------------------------
// Returns the number of code points in `text` as counted on the Java side.
//
// The text is converted to a Java string, its length is queried through
// jni_cache_->string_length, and that length is used as the end index for
// jni_cache_->string_code_point_count (start index 0).
//
// Returns a non-OK status when no JniCache is available, when the conversion
// to a Java string fails or yields null, or when one of the JNI calls fails.
StatusOr<int32> UniLibBase::Length(const UnicodeText& text) const {
  // The other JVM-backed entry points in this file tolerate a missing
  // JniCache; report an error here instead of dereferencing a null pointer.
  if (!jni_cache_) {
    return Status::UNKNOWN;
  }

  TC3_ASSIGN_OR_RETURN(ScopedLocalRef<jstring> text_java,
                       jni_cache_->ConvertToJavaString(text));
  if (!text_java) {
    return Status::UNKNOWN;
  }

  JNIEnv* jenv = jni_cache_->GetEnv();
  TC3_ASSIGN_OR_RETURN(int utf16_length,
                       JniHelper::CallIntMethod(jenv, text_java.get(),
                                                jni_cache_->string_length));
  return JniHelper::CallIntMethod(jenv, text_java.get(),
                                  jni_cache_->string_code_point_count, 0,
                                  utf16_length);
}
// Parses `text` into an int32 by forwarding to ParseInt(); returns its result.
bool UniLibBase::ParseInt32(const UnicodeText& text, int32* result) const {
return ParseInt(text, result);
}
// Parses `text` into an int64 by forwarding to ParseInt(); returns its result.
bool UniLibBase::ParseInt64(const UnicodeText& text, int64* result) const {
return ParseInt(text, result);
}
// Parses a decimal number of the form "<integer>[<dot><digits>]" into
// `*result`.
//
// The text is split at the first code point for which IsDot() is true. Both
// halves are parsed with ParseInt() into int32 values, so either half
// overflowing int32 makes the parse fail.
//
// Returns false (leaving `*result` untouched) when no JniCache is available,
// when either half fails to parse, or when the fractional half starts with a
// sign. Returns true on success.
bool UniLibBase::ParseDouble(const UnicodeText& text, double* result) const {
  if (!jni_cache_) {
    return false;
  }

  // Locate the decimal separator, if any.
  auto it_dot = text.begin();
  while (it_dot != text.end() && !IsDot(*it_dot)) {
    ++it_dot;
  }

  int32 integer_part;
  if (!ParseInt(UnicodeText::Substring(text.begin(), it_dot, /*do_copy=*/false),
                &integer_part)) {
    return false;
  }

  double fraction = 0.0;
  if (it_dot != text.end()) {
    auto fraction_begin = it_dot;
    ++fraction_begin;

    // A sign is only meaningful in front of the integer part; "1.-5" or
    // "1.+5" is not a number.
    if (fraction_begin != text.end() &&
        (*fraction_begin == '-' || *fraction_begin == '+')) {
      return false;
    }

    int32 fractional_part = 0;
    if (!ParseInt(UnicodeText::Substring(fraction_begin, text.end(),
                                         /*do_copy=*/false),
                  &fractional_part)) {
      return false;
    }

    // Scale by the number of fractional digits rather than by the magnitude
    // of the parsed value, so that leading zeros are honoured
    // ("1.05" -> 1.05, not 1.5). A single division keeps rounding error
    // minimal.
    double scale = 1.0;
    for (auto it = fraction_begin; it != text.end(); ++it) {
      scale *= 10.0;
    }
    fraction = fractional_part / scale;
  }

  // The fraction carries the sign of the whole number. The sign has to be
  // read from the text because "-0" parses to the integer 0.
  const bool negative = text.begin() != text.end() && *text.begin() == '-';
  *result = negative ? integer_part - fraction : integer_part + fraction;
  return true;
}
std::unique_ptr<UniLibBase::RegexPattern> UniLibBase::CreateRegexPattern(
const UnicodeText& regex) const {
return std::unique_ptr<UniLibBase::RegexPattern>(
new UniLibBase::RegexPattern(jni_cache_.get(), regex, /*lazy=*/false));
}
std::unique_ptr<UniLibBase::RegexPattern> UniLibBase::CreateLazyRegexPattern(
const UnicodeText& regex) const {
return std::unique_ptr<UniLibBase::RegexPattern>(
new UniLibBase::RegexPattern(jni_cache_.get(), regex, /*lazy=*/true));
}
// Stores the pattern text and, unless `lazy` is set, initializes the pattern
// immediately. `jni_cache` may be null; the global reference is then created
// with a null JVM and initialization is a no-op.
UniLibBase::RegexPattern::RegexPattern(const JniCache* jni_cache,
                                       const UnicodeText& pattern, bool lazy)
    : jni_cache_(jni_cache),
      pattern_(nullptr, jni_cache ? jni_cache->jvm : nullptr),
      initialized_(false),
      initialization_failure_(false),
      pattern_text_(pattern) {
  if (lazy) {
    // Initialization happens on the first call to Matcher().
    return;
  }
  LockedInitializeIfNotAlready();
}
// Initializes the pattern on first use while holding mutex_.
//
// Returns Status::OK when the pattern is (or already was) initialized, or when
// there is no JniCache and nothing can be initialized. Returns a non-OK status
// when initialization fails, including on every later call after a failure:
// the failure is cached in initialization_failure_ and not retried.
Status UniLibBase::RegexPattern::LockedInitializeIfNotAlready() const {
  std::lock_guard<std::mutex> guard(mutex_);
  if (initialized_) {
    return Status::OK;
  }
  if (initialization_failure_) {
    // A previous attempt failed; do not report success for a pattern that was
    // never created.
    return Status::UNKNOWN;
  }

  if (jni_cache_) {
    JNIEnv* jenv = jni_cache_->GetEnv();
    // Assume failure up front: the TC3_ASSIGN_OR_RETURN macros below return
    // early on error, and the flag must be set on those paths too.
    initialization_failure_ = true;
    TC3_ASSIGN_OR_RETURN(ScopedLocalRef<jstring> regex_java,
                         jni_cache_->ConvertToJavaString(pattern_text_));
    TC3_ASSIGN_OR_RETURN(ScopedLocalRef<jobject> pattern,
                         JniHelper::CallStaticObjectMethod(
                             jenv, jni_cache_->pattern_class.get(),
                             jni_cache_->pattern_compile, regex_java.get()));
    pattern_ = MakeGlobalRef(pattern.get(), jenv, jni_cache_->jvm);
    if (pattern_ == nullptr) {
      return Status::UNKNOWN;
    }

    initialization_failure_ = false;
    initialized_ = true;
    pattern_text_.clear();  // We don't need this anymore.
  }
  return Status::OK;
}
// Namespace-scope definitions of the RegexMatcher status constants; these give
// the static constexpr members a definition for odr-uses in builds that
// predate C++17 inline variables.
constexpr int UniLibBase::RegexMatcher::kError;
constexpr int UniLibBase::RegexMatcher::kNoError;
std::unique_ptr<UniLibBase::RegexMatcher> UniLibBase::RegexPattern::Matcher(
const UnicodeText& context) const {
LockedInitializeIfNotAlready(); // Possibly lazy initialization.
if (initialization_failure_) {
return nullptr;
}
if (jni_cache_) {
JNIEnv* env = jni_cache_->GetEnv();
const StatusOr<ScopedLocalRef<jstring>> status_or_context_java =
jni_cache_->ConvertToJavaString(context);
if (!status_or_context_java.ok() || !status_or_context_java.ValueOrDie()) {
return nullptr;
}
const StatusOr<ScopedLocalRef<jobject>> status_or_matcher =
JniHelper::CallObjectMethod(env, pattern_.get(),
jni_cache_->pattern_matcher,
status_or_context_java.ValueOrDie().get());
if (jni_cache_->ExceptionCheckAndClear() || !status_or_matcher.ok() ||
!status_or_matcher.ValueOrDie()) {
return nullptr;
}
return std::unique_ptr<UniLibBase::RegexMatcher>(new RegexMatcher(
jni_cache_,
MakeGlobalRef(status_or_matcher.ValueOrDie().get(), env,
jni_cache_->jvm),
MakeGlobalRef(status_or_context_java.ValueOrDie().get(), env,
jni_cache_->jvm)));
} else {
// NOTE: A valid object needs to be created here to pass the interface
// tests.
return std::unique_ptr<UniLibBase::RegexMatcher>(
new RegexMatcher(jni_cache_, {}, {}));
}
}
// Takes ownership of the global references to the Java matcher object and to
// the Java string being matched; `jni_cache` is stored as a raw pointer and
// may be null.
UniLibBase::RegexMatcher::RegexMatcher(const JniCache* jni_cache,
ScopedGlobalRef<jobject> matcher,
ScopedGlobalRef<jstring> text)
: jni_cache_(jni_cache),
matcher_(std::move(matcher)),
text_(std::move(text)) {}
bool UniLibBase::RegexMatcher::Matches(int* status) const {
if (jni_cache_) {
*status = kNoError;
const bool result = jni_cache_->GetEnv()->CallBooleanMethod(
matcher_.get(), jni_cache_->matcher_matches);
if (jni_cache_->ExceptionCheckAndClear()) {
*status = kError;
return false;
}
return result;
} else {
*status = kError;
return false;
}
}
bool UniLibBase::RegexMatcher::ApproximatelyMatches(int* status) {
*status = kNoError;
jni_cache_->GetEnv()->CallObjectMethod(matcher_.get(),
jni_cache_->matcher_reset);
if (jni_cache_->ExceptionCheckAndClear()) {
*status = kError;
return kError;
}
if (!Find(status) || *status != kNoError) {
return false;
}
const int found_start = jni_cache_->GetEnv()->CallIntMethod(
matcher_.get(), jni_cache_->matcher_start_idx, 0);
if (jni_cache_->ExceptionCheckAndClear()) {
*status = kError;
return kError;
}
const int found_end = jni_cache_->GetEnv()->CallIntMethod(
matcher_.get(), jni_cache_->matcher_end_idx, 0);
if (jni_cache_->ExceptionCheckAndClear()) {
*status = kError;
return kError;
}
int context_length_bmp = jni_cache_->GetEnv()->CallIntMethod(
text_.get(), jni_cache_->string_length);
if (jni_cache_->ExceptionCheckAndClear()) {
*status = kError;
return false;
}
if (found_start != 0 || found_end != context_length_bmp) {
return false;
}
return true;
}
bool UniLibBase::RegexMatcher::UpdateLastFindOffset() const {
if (!last_find_offset_dirty_) {
return true;
}
const int find_offset = jni_cache_->GetEnv()->CallIntMethod(
matcher_.get(), jni_cache_->matcher_start_idx, 0);
if (jni_cache_->ExceptionCheckAndClear()) {
return false;
}
const int codepoint_count = jni_cache_->GetEnv()->CallIntMethod(
text_.get(), jni_cache_->string_code_point_count, last_find_offset_,
find_offset);
if (jni_cache_->ExceptionCheckAndClear()) {
return false;
}
last_find_offset_codepoints_ += codepoint_count;
last_find_offset_ = find_offset;
last_find_offset_dirty_ = false;
return true;
}
bool UniLibBase::RegexMatcher::Find(int* status) {
if (jni_cache_) {
const bool result = jni_cache_->GetEnv()->CallBooleanMethod(
matcher_.get(), jni_cache_->matcher_find);
if (jni_cache_->ExceptionCheckAndClear()) {
*status = kError;
return false;
}
last_find_offset_dirty_ = true;
*status = kNoError;
return result;
} else {
*status = kError;
return false;
}
}
// Start index of the whole match: forwards to Start() with group index 0.
int UniLibBase::RegexMatcher::Start(int* status) const {
return Start(/*group_idx=*/0, status);
}
// Returns the start of group `group_idx` for the latest find, expressed as a
// code point offset: the index reported by the Java matcher is converted by
// counting code points from the cached find offset and adding the cached
// code point offset.
//
// Returns -1 (with `*status` == kNoError) when the Java matcher reports -1
// for the group. Returns kError and sets `*status` to kError when there is no
// JniCache, the cached offset cannot be refreshed, or a JNI call raises an
// exception.
int UniLibBase::RegexMatcher::Start(int group_idx, int* status) const {
  if (!jni_cache_) {
    *status = kError;
    return kError;
  }

  *status = kNoError;
  if (!UpdateLastFindOffset()) {
    *status = kError;
    return kError;
  }

  const int group_start = jni_cache_->GetEnv()->CallIntMethod(
      matcher_.get(), jni_cache_->matcher_start_idx, group_idx);
  if (jni_cache_->ExceptionCheckAndClear()) {
    *status = kError;
    return kError;
  }

  // If the group didn't participate in the match the index is -1.
  if (group_start == -1) {
    return -1;
  }

  const int codepoints_from_find = jni_cache_->GetEnv()->CallIntMethod(
      text_.get(), jni_cache_->string_code_point_count, last_find_offset_,
      group_start);
  if (jni_cache_->ExceptionCheckAndClear()) {
    *status = kError;
    return kError;
  }

  return codepoints_from_find + last_find_offset_codepoints_;
}
// End index of the whole match: forwards to End() with group index 0.
int UniLibBase::RegexMatcher::End(int* status) const {
return End(/*group_idx=*/0, status);
}
// Returns the end of group `group_idx` for the latest find, expressed as a
// code point offset: the index reported by the Java matcher is converted by
// counting code points from the cached find offset and adding the cached
// code point offset.
//
// Returns -1 (with `*status` == kNoError) when the Java matcher reports -1
// for the group. Returns kError and sets `*status` to kError when there is no
// JniCache, the cached offset cannot be refreshed, or a JNI call raises an
// exception.
int UniLibBase::RegexMatcher::End(int group_idx, int* status) const {
  if (!jni_cache_) {
    *status = kError;
    return kError;
  }

  *status = kNoError;
  if (!UpdateLastFindOffset()) {
    *status = kError;
    return kError;
  }

  const int group_end = jni_cache_->GetEnv()->CallIntMethod(
      matcher_.get(), jni_cache_->matcher_end_idx, group_idx);
  if (jni_cache_->ExceptionCheckAndClear()) {
    *status = kError;
    return kError;
  }

  // If the group didn't participate in the match the index is -1.
  if (group_end == -1) {
    return -1;
  }

  const int codepoints_from_find = jni_cache_->GetEnv()->CallIntMethod(
      text_.get(), jni_cache_->string_code_point_count, last_find_offset_,
      group_end);
  if (jni_cache_->ExceptionCheckAndClear()) {
    *status = kError;
    return kError;
  }

  return codepoints_from_find + last_find_offset_codepoints_;
}
// Returns the text of the whole match, obtained through the matcher_group
// method and converted to UTF-8.
//
// On success `*status` is kNoError. An empty text is returned and `*status`
// is kError when there is no JniCache, the call fails, raises an exception or
// returns null, or the conversion to UTF-8 fails.
UnicodeText UniLibBase::RegexMatcher::Group(int* status) const {
  if (!jni_cache_) {
    *status = kError;
    return UTF8ToUnicodeText("", /*do_copy=*/false);
  }

  JNIEnv* jenv = jni_cache_->GetEnv();
  StatusOr<ScopedLocalRef<jstring>> group_java =
      JniHelper::CallObjectMethod<jstring>(jenv, matcher_.get(),
                                           jni_cache_->matcher_group);
  const bool call_failed = jni_cache_->ExceptionCheckAndClear() ||
                           !group_java.ok() || !group_java.ValueOrDie();
  if (call_failed) {
    *status = kError;
    return UTF8ToUnicodeText("", /*do_copy=*/false);
  }

  StatusOr<std::string> group_utf8 =
      JStringToUtf8String(jenv, group_java.ValueOrDie().get());
  if (!group_utf8.ok()) {
    *status = kError;
    return UTF8ToUnicodeText("", /*do_copy=*/false);
  }

  *status = kNoError;
  return UTF8ToUnicodeText(group_utf8.ValueOrDie(), /*do_copy=*/true);
}
// Returns the text of capturing group `group_idx`, obtained through the
// matcher_group_idx method and converted to UTF-8.
//
// A null Java result is not an error: an empty text is returned with
// `*status` == kNoError. An empty text with `*status` == kError is returned
// when there is no JniCache, the call fails or raises an exception, or the
// conversion to UTF-8 fails.
UnicodeText UniLibBase::RegexMatcher::Group(int group_idx, int* status) const {
  if (!jni_cache_) {
    *status = kError;
    return UTF8ToUnicodeText("", /*do_copy=*/false);
  }

  JNIEnv* jenv = jni_cache_->GetEnv();
  StatusOr<ScopedLocalRef<jstring>> group_java =
      JniHelper::CallObjectMethod<jstring>(
          jenv, matcher_.get(), jni_cache_->matcher_group_idx, group_idx);
  const bool call_failed =
      jni_cache_->ExceptionCheckAndClear() || !group_java.ok();
  if (call_failed) {
    *status = kError;
    TC3_LOG(ERROR) << "Exception occurred";
    return UTF8ToUnicodeText("", /*do_copy=*/false);
  }

  // java_result is nullptr when the group did not participate in the match.
  // For these cases other UniLib implementations return empty string, and
  // the participation can be checked by checking if Start() == -1.
  if (!group_java.ValueOrDie()) {
    *status = kNoError;
    return UTF8ToUnicodeText("", /*do_copy=*/false);
  }

  StatusOr<std::string> group_utf8 =
      JStringToUtf8String(jenv, group_java.ValueOrDie().get());
  if (!group_utf8.ok()) {
    *status = kError;
    return UTF8ToUnicodeText("", /*do_copy=*/false);
  }

  *status = kNoError;
  return UTF8ToUnicodeText(group_utf8.ValueOrDie(), /*do_copy=*/true);
}
// Namespace-scope definition of the BreakIterator end marker; gives the static
// constexpr member a definition for odr-uses in builds that predate C++17
// inline variables.
constexpr int UniLibBase::BreakIterator::kDone;
// Builds a break iterator over `text`.
//
// With a JniCache, the text is converted to a Java string and pinned with a
// global reference, an iterator object is obtained through
// breakiterator_getwordinstance (passing locale_us) and pinned as well, and
// the text is handed to it via breakiterator_settext. Construction stops
// silently at the first failing step, leaving the remaining references null.
// Without a JniCache nothing is created.
UniLibBase::BreakIterator::BreakIterator(const JniCache* jni_cache,
                                         const UnicodeText& text)
    : jni_cache_(jni_cache),
      text_(nullptr, jni_cache ? jni_cache->jvm : nullptr),
      iterator_(nullptr, jni_cache ? jni_cache->jvm : nullptr),
      last_break_index_(0),
      last_unicode_index_(0) {
  if (!jni_cache_) {
    return;
  }
  JNIEnv* jenv = jni_cache_->GetEnv();

  // Step 1: Java copy of the text, kept alive by a global reference.
  StatusOr<ScopedLocalRef<jstring>> java_text =
      jni_cache_->ConvertToJavaString(text);
  if (!java_text.ok()) {
    return;
  }
  text_ = MakeGlobalRef(java_text.ValueOrDie().get(), jenv, jni_cache_->jvm);
  if (!text_) {
    return;
  }

  // Step 2: the Java iterator object, also kept alive by a global reference.
  StatusOr<ScopedLocalRef<jobject>> java_iterator =
      JniHelper::CallStaticObjectMethod(
          jenv, jni_cache_->breakiterator_class.get(),
          jni_cache_->breakiterator_getwordinstance,
          jni_cache_->locale_us.get());
  if (!java_iterator.ok()) {
    return;
  }
  iterator_ =
      MakeGlobalRef(java_iterator.ValueOrDie().get(), jenv, jni_cache_->jvm);
  if (!iterator_) {
    return;
  }

  // Step 3: attach the text to the iterator.
  JniHelper::CallVoidMethod(jenv, iterator_.get(),
                            jni_cache_->breakiterator_settext, text_.get());
}
int UniLibBase::BreakIterator::Next() {
if (jni_cache_) {
const int break_index = jni_cache_->GetEnv()->CallIntMethod(
iterator_.get(), jni_cache_->breakiterator_next);
if (jni_cache_->ExceptionCheckAndClear() ||
break_index == BreakIterator::kDone) {
return BreakIterator::kDone;
}
const int token_unicode_length = jni_cache_->GetEnv()->CallIntMethod(
text_.get(), jni_cache_->string_code_point_count, last_break_index_,
break_index);
if (jni_cache_->ExceptionCheckAndClear()) {
return BreakIterator::kDone;
}
last_break_index_ = break_index;
return last_unicode_index_ += token_unicode_length;
}
return BreakIterator::kDone;
}
std::unique_ptr<UniLibBase::BreakIterator> UniLibBase::CreateBreakIterator(
const UnicodeText& text) const {
return std::unique_ptr<UniLibBase::BreakIterator>(
new UniLibBase::BreakIterator(jni_cache_.get(), text));
}
} // namespace libtextclassifier3