v811_spc009/external/libtextclassifier/native/utils/utf8/unilib-javaicu.h

/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// An implementation of Unilib that uses Android Java interfaces via JNI. The
// performance critical ops have been re-implemented in C++.
// Specifically, this class must be compatible with API level 14 (ICS).

#ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_
#define LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_

#include <jni.h>

#include <memory>
#include <mutex>  // NOLINT
#include <string>

#include "utils/base/integral_types.h"
#include "utils/java/jni-base.h"
#include "utils/java/jni-cache.h"
#include "utils/java/jni-helper.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib-common.h"

namespace libtextclassifier3 {

// Unicode helper library backed by Android Java classes reached through JNI.
//
// Apart from RegexMatcher::Text() and the ParseInt() template, every member
// declared here is defined outside this header. The object keeps a shared
// JniCache; ParseInt() reports failure when that cache is null.
class UniLibBase {
 public:
  // Default constructor.
  // NOTE(review): presumably leaves the JNI cache unset, in which case
  // ParseInt() always returns false — confirm against the definition.
  UniLibBase();
  // Constructs an instance that uses `jni_cache` for its JNI calls.
  explicit UniLibBase(const std::shared_ptr<JniCache>& jni_cache);

  // Number parsing. Each call attempts to interpret `text` as a number and
  // writes the value through `result`.
  // NOTE(review): the bool is presumably a success flag, matching the private
  // ParseInt() template, which returns false on every failure path — confirm
  // for ParseDouble().
  bool ParseInt32(const UnicodeText& text, int32* result) const;
  bool ParseInt64(const UnicodeText& text, int64* result) const;
  bool ParseDouble(const UnicodeText& text, double* result) const;

  // Codepoint classification predicates.
  bool IsOpeningBracket(char32 codepoint) const;
  bool IsClosingBracket(char32 codepoint) const;
  bool IsWhitespace(char32 codepoint) const;
  bool IsDigit(char32 codepoint) const;
  bool IsLower(char32 codepoint) const;
  bool IsUpper(char32 codepoint) const;
  bool IsPunctuation(char32 codepoint) const;

  // Codepoint mappings.
  char32 ToLower(char32 codepoint) const;
  char32 ToUpper(char32 codepoint) const;
  char32 GetPairedBracket(char32 codepoint) const;

  // Returns the length of `text`, or an error status.
  // NOTE(review): the unit of the length (codepoints vs. Java UTF-16 code
  // units) is not visible from this header — confirm before relying on it.
  StatusOr<int32> Length(const UnicodeText& text) const;

  // Forward declaration so that RegexMatcher can befriend RegexPattern.
  class RegexPattern;

  // Matcher bound to one input text. Instances are created only by
  // RegexPattern (the constructor is private and RegexPattern is a friend).
  //
  // Most methods report problems through the `status` out-parameter using the
  // kError / kNoError constants below.
  class RegexMatcher {
   public:
    static constexpr int kError = -1;
    static constexpr int kNoError = 0;

    // Checks whether the input text matches the pattern exactly.
    bool Matches(int* status) const;

    // Approximate Matches() implementation built on top of Find(). It takes
    // the first Find() result and then checks that it spans the whole input.
    // NOTE: Unlike Matches() it can result in false negatives.
    // NOTE: Resets the matcher, so the current Find() state will be lost.
    bool ApproximatelyMatches(int* status);

    // Finds occurrences of the pattern in the input text.
    // Can be called repeatedly to find all occurrences. A call updates the
    // internal state, so that 'Start', 'End' and 'Group' can be called to get
    // information about the match.
    // NOTE: Any call to ApproximatelyMatches() in between Find() calls will
    // modify the state.
    bool Find(int* status);

    // Gets the start offset of the last match (from 'Find').
    // Sets status to 'kError' if 'Find' was not called previously.
    int Start(int* status) const;

    // Gets the start offset of the specified group of the last match
    // (from 'Find').
    // Sets status to 'kError' if an invalid group was specified or if 'Find'
    // was not called previously.
    int Start(int group_idx, int* status) const;

    // Gets the end offset of the last match (from 'Find').
    // Sets status to 'kError' if 'Find' was not called previously.
    int End(int* status) const;

    // Gets the end offset of the specified group of the last match
    // (from 'Find').
    // Sets status to 'kError' if an invalid group was specified or if 'Find'
    // was not called previously.
    int End(int group_idx, int* status) const;

    // Gets the text of the last match (from 'Find').
    // Sets status to 'kError' if 'Find' was not called previously.
    UnicodeText Group(int* status) const;

    // Gets the text of the specified group of the last match (from 'Find').
    // Sets status to 'kError' if an invalid group was specified or if 'Find'
    // was not called previously.
    UnicodeText Group(int group_idx, int* status) const;

    // Returns the Java string held in `text_`, converted to a UTF-8
    // std::string. If the conversion fails, an error is logged and an empty
    // string is returned.
    // NOTE(review): this converts `text_` as a whole and never consults
    // `matcher_`, so it is not the text of a match; use Group() for that.
    // Presumably `text_` is the full input the matcher was created for —
    // confirm against the constructor.
    std::string Text() const {
      StatusOr<std::string> status_or_result =
          JStringToUtf8String(jni_cache_->GetEnv(), text_.get());
      if (!status_or_result.ok()) {
        TC3_LOG(ERROR) << "JStringToUtf8String failed.";
        return "";
      }
      return status_or_result.ValueOrDie();
    }

   private:
    friend class RegexPattern;
    RegexMatcher(const JniCache* jni_cache, ScopedGlobalRef<jobject> matcher,
                 ScopedGlobalRef<jstring> text);
    bool UpdateLastFindOffset() const;

    // Not owned.
    const JniCache* jni_cache_;
    ScopedGlobalRef<jobject> matcher_;
    ScopedGlobalRef<jstring> text_;
    // Find-offset bookkeeping; mutable so that const accessors can refresh it.
    // NOTE(review): presumably maintained by UpdateLastFindOffset() — confirm.
    mutable int last_find_offset_ = 0;
    mutable int last_find_offset_codepoints_ = 0;
    mutable bool last_find_offset_dirty_ = true;
  };

  // A regular expression that hands out RegexMatcher objects. Instances are
  // created only by UniLibBase (private constructor, UniLibBase is a friend).
  class RegexPattern {
   public:
    // Creates a matcher for this pattern over `context`.
    // NOTE(review): the result is presumably null on failure — confirm.
    std::unique_ptr<RegexMatcher> Matcher(const UnicodeText& context) const;

   private:
    friend class UniLibBase;
    // NOTE(review): `lazy` presumably postpones initialization of `pattern_`
    // until the first Matcher() call — confirm against the definition.
    RegexPattern(const JniCache* jni_cache, const UnicodeText& pattern,
                 bool lazy);
    Status LockedInitializeIfNotAlready() const;

    // Not owned.
    const JniCache* jni_cache_;

    // These members need to be mutable because of the lazy initialization.
    // NOTE: The Matcher method first ensures (using a lock) that the
    // initialization was attempted (by using LockedInitializeIfNotAlready) and
    // then can access them without locking.
    mutable std::mutex mutex_;
    mutable ScopedGlobalRef<jobject> pattern_;
    // In-class defaults keep the flags well-defined even if a constructor
    // does not set them, consistent with RegexMatcher's state members.
    mutable bool initialized_ = false;
    mutable bool initialization_failure_ = false;
    mutable UnicodeText pattern_text_;
  };

  // Iterator over break positions in a text. Instances are created only by
  // UniLibBase (private constructor, UniLibBase is a friend).
  class BreakIterator {
   public:
    // Advances the iterator and returns the next break position.
    // NOTE(review): presumably returns kDone once the text is exhausted, and
    // the unit of the returned index is not visible here — confirm.
    int Next();

    static constexpr int kDone = -1;

   private:
    friend class UniLibBase;
    BreakIterator(const JniCache* jni_cache, const UnicodeText& text);

    // Not owned.
    const JniCache* jni_cache_;
    ScopedGlobalRef<jstring> text_;
    ScopedGlobalRef<jobject> iterator_;
    // In-class defaults keep the indices well-defined even if a constructor
    // does not set them.
    int last_break_index_ = 0;
    int last_unicode_index_ = 0;
  };

  // Factory methods for the helper classes above.
  // NOTE(review): CreateLazyRegexPattern presumably differs from
  // CreateRegexPattern only in passing lazy=true to RegexPattern — confirm.
  std::unique_ptr<RegexPattern> CreateRegexPattern(
      const UnicodeText& regex) const;
  std::unique_ptr<RegexPattern> CreateLazyRegexPattern(
      const UnicodeText& regex) const;
  std::unique_ptr<BreakIterator> CreateBreakIterator(
      const UnicodeText& text) const;

 private:
  // Shared integer-parsing routine; defined after the class in this header.
  template <class T>
  bool ParseInt(const UnicodeText& text, T* result) const;

  // May be null; ParseInt() returns false in that case.
  std::shared_ptr<JniCache> jni_cache_;
};

template <class T>
bool UniLibBase::ParseInt(const UnicodeText& text, T* result) const {
  if (!jni_cache_) {
    return false;
  }

  // Avoid throwing exceptions when the text is unlikely to be a number.
  int32 result32 = 0;
  if (!PassesIntPreChesks(text, result32)) {
    return false;
  }

  JNIEnv* env = jni_cache_->GetEnv();
  TC3_ASSIGN_OR_RETURN_FALSE(const ScopedLocalRef<jstring> text_java,
                             jni_cache_->ConvertToJavaString(text));
  TC3_ASSIGN_OR_RETURN_FALSE(
      *result,
      JniHelper::CallStaticIntMethod<T>(
          env,
          /*print_exception_on_error=*/false, jni_cache_->integer_class.get(),
          jni_cache_->integer_parse_int, text_java.get()));
  return true;
}

}  // namespace libtextclassifier3

#endif  // LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_