/*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LIBTEXTCLASSIFIER_UTILS_WORDPIECE_TOKENIZER_H_
#define LIBTEXTCLASSIFIER_UTILS_WORDPIECE_TOKENIZER_H_
#include <string>
#include <vector>
#include "absl/strings/string_view.h"
namespace libtextclassifier3 {
// Outcome of a vocabulary lookup: a success flag plus, on failure, a
// human-readable description of what went wrong.
struct LookupStatus {
  // Default-constructed status represents success with no message.
  LookupStatus() : error_msg(""), success(true) {}
  // A status carrying `msg` always represents a failed lookup.
  explicit LookupStatus(const std::string& msg)
      : error_msg(msg), success(false) {}
  std::string error_msg;  // Empty on success.
  bool success;           // True iff the lookup succeeded.
  // Convenience factory for the success state.
  static LookupStatus OK() { return LookupStatus(); }
};
// Abstract interface to a wordpiece vocabulary. Concrete implementations
// answer membership queries through Contains().
class WordpieceVocab {
 public:
  virtual ~WordpieceVocab() = default;
  // Writes to *value whether `key` is present in the vocabulary.
  // Returns a non-success LookupStatus if the lookup itself failed.
  virtual LookupStatus Contains(const absl::string_view key,
                                bool* value) const = 0;
};
// Splits `token` into wordpiece subtokens using `vocab_map` for vocabulary
// membership checks. Results are written through the output pointers:
// `subwords` (the subtoken strings), `begin_offset`/`end_offset`
// (per-subtoken offsets into `token` — presumably byte offsets; confirm
// against the implementation), and `num_word_pieces` (number of pieces
// produced). Returns a non-success LookupStatus if a vocabulary lookup
// fails.
// NOTE(review): the exact semantics of `max_bytes_per_token`,
// `max_chars_per_subtoken`, `suffix_indicator`,
// `use_unknown_token`/`unknown_token`, and `split_unknown_characters` are
// defined by the implementation (.cc), which is not visible in this header.
LookupStatus WordpieceTokenize(
    const absl::string_view token, const int max_bytes_per_token,
    const int max_chars_per_subtoken, const std::string& suffix_indicator,
    bool use_unknown_token, const std::string& unknown_token,
    bool split_unknown_characters, const WordpieceVocab* vocab_map,
    std::vector<std::string>* subwords, std::vector<int>* begin_offset,
    std::vector<int>* end_offset, int* num_word_pieces);
// As above but with `max_chars_per_subtoken` unknown,
// and split_unknown_characters=false. (For backwards compatibility.)
LookupStatus WordpieceTokenize(
    const absl::string_view token, const int max_bytes_per_token,
    const std::string& suffix_indicator, bool use_unknown_token,
    const std::string& unknown_token, const WordpieceVocab* vocab_map,
    std::vector<std::string>* subwords, std::vector<int>* begin_offset,
    std::vector<int>* end_offset, int* num_word_pieces);
} // namespace libtextclassifier3
#endif // LIBTEXTCLASSIFIER_UTILS_WORDPIECE_TOKENIZER_H_