You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
252 lines
8.5 KiB
252 lines
8.5 KiB
/*
|
|
* Copyright (C) 2018 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_
|
|
#define LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_
|
|
|
|
#include <iterator>
|
|
#include <string>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
#include "utils/base/integral_types.h"
|
|
#include "utils/base/logging.h"
|
|
#include "utils/strings/stringpiece.h"
|
|
#include "absl/strings/string_view.h"
|
|
|
|
namespace libtextclassifier3 {
|
|
|
|
// ***************************** UnicodeText **************************
|
|
//
|
|
// A UnicodeText object is a wrapper around a sequence of Unicode
|
|
// codepoint values that allows iteration over these values.
|
|
//
|
|
// The internal representation of the text is UTF-8. Since UTF-8 is a
|
|
// variable-width format, UnicodeText does not provide random access
|
|
// to the text, and changes to the text are permitted only at the end.
|
|
//
|
|
// The UnicodeText class defines a const_iterator. The dereferencing
|
|
// operator (*) returns a codepoint (char32). The iterator is a
|
|
// read-only iterator. It becomes invalid if the text is changed.
|
|
//
|
|
// Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
|
|
// 0x10FFFF], but UnicodeText has the additional restriction that it
|
|
// can contain only those characters that are valid for interchange on
|
|
// the Web. This excludes all of the control codes except for carriage
|
|
// return, line feed, and horizontal tab. It also excludes
|
|
// non-characters, but codepoints that are in the Private Use regions
|
|
// are allowed, as are codepoints that are unassigned. (See the
|
|
// Unicode reference for details.)
|
|
//
|
|
// MEMORY MANAGEMENT:
|
|
//
|
|
// PointToUTF8(buffer, size) creates an alias pointing to buffer.
|
|
//
|
|
// The purpose of an alias is to avoid making an unnecessary copy of a
|
|
// UTF-8 buffer while still providing access to the Unicode values
|
|
// within that text through iterators. The lifetime of an alias must not
|
|
// exceed the lifetime of the buffer from which it was constructed.
|
|
//
|
|
// Aliases should be used with care. If the source from which an alias
|
|
// was created is freed, or if the contents are changed, while the
|
|
// alias is still in use, fatal errors could result. But it can be
|
|
// quite useful to have a UnicodeText "window" through which to see a
|
|
// UTF-8 buffer without having to pay the price of making a copy.
|
|
|
|
class UnicodeText {
|
|
public:
|
|
class const_iterator;
|
|
|
|
UnicodeText(); // Create an empty text.
|
|
UnicodeText(const UnicodeText& src, bool do_copy = true);
|
|
UnicodeText& operator=(UnicodeText&& src);
|
|
~UnicodeText();
|
|
|
|
class const_iterator {
|
|
typedef const_iterator CI;
|
|
|
|
public:
|
|
typedef std::bidirectional_iterator_tag iterator_category;
|
|
typedef char32 value_type;
|
|
typedef int difference_type;
|
|
typedef void pointer; // (Not needed.)
|
|
typedef const char32 reference; // (Needed for const_reverse_iterator)
|
|
|
|
// Iterators are default-constructible.
|
|
const_iterator();
|
|
|
|
// It's safe to make multiple passes over a UnicodeText.
|
|
const_iterator& operator=(const const_iterator& other);
|
|
|
|
char32 operator*() const; // Dereference
|
|
|
|
const_iterator& operator++(); // Advance (++iter)
|
|
const_iterator operator++(int) { // (iter++)
|
|
const_iterator result(*this);
|
|
++*this;
|
|
return result;
|
|
}
|
|
|
|
const_iterator& operator--(); // Retreat (--iter)
|
|
const_iterator operator--(int) { // (iter--)
|
|
const_iterator result(*this);
|
|
--*this;
|
|
return result;
|
|
}
|
|
|
|
friend bool operator==(const CI& lhs, const CI& rhs) {
|
|
return lhs.it_ == rhs.it_;
|
|
}
|
|
friend bool operator!=(const CI& lhs, const CI& rhs) {
|
|
return !(lhs == rhs);
|
|
}
|
|
friend bool operator<(const CI& lhs, const CI& rhs);
|
|
friend bool operator>(const CI& lhs, const CI& rhs) { return rhs < lhs; }
|
|
friend bool operator<=(const CI& lhs, const CI& rhs) {
|
|
return !(rhs < lhs);
|
|
}
|
|
friend bool operator>=(const CI& lhs, const CI& rhs) {
|
|
return !(lhs < rhs);
|
|
}
|
|
|
|
int utf8_length() const {
|
|
const unsigned char byte = static_cast<unsigned char>(it_[0]);
|
|
if (byte < 0x80) {
|
|
return 1;
|
|
} else if (byte < 0xE0) {
|
|
return 2;
|
|
} else if (byte < 0xF0) {
|
|
return 3;
|
|
} else {
|
|
return 4;
|
|
}
|
|
}
|
|
const char* utf8_data() const { return it_; }
|
|
|
|
private:
|
|
friend class UnicodeText;
|
|
explicit const_iterator(const char* it) : it_(it) {}
|
|
|
|
const char* it_;
|
|
};
|
|
|
|
const_iterator begin() const;
|
|
const_iterator end() const;
|
|
|
|
// Gets pointer to the underlying utf8 data.
|
|
const char* data() const;
|
|
|
|
// Gets length (in bytes) of the underlying utf8 data.
|
|
int size_bytes() const;
|
|
|
|
// Computes length (in number of Unicode codepoints) of the underlying utf8
|
|
// data.
|
|
// NOTE: Complexity O(n).
|
|
int size_codepoints() const;
|
|
|
|
bool empty() const;
|
|
|
|
// Checks whether the underlying data is valid utf8 data.
|
|
bool is_valid() const;
|
|
|
|
bool operator==(const UnicodeText& other) const;
|
|
|
|
// x.PointToUTF8(buf,len) changes x so that it points to buf
|
|
// ("becomes an alias"). It does not take ownership or copy buf.
|
|
// This function assumes that the input is interchange valid UTF8.
|
|
UnicodeText& Copy(const UnicodeText& src);
|
|
UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
|
|
UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);
|
|
|
|
// Calling this may invalidate pointers to underlying data.
|
|
UnicodeText& AppendUTF8(const char* utf8, int len);
|
|
UnicodeText& push_back(char32 ch);
|
|
void clear();
|
|
|
|
// Returns an iterator for each codepoint.
|
|
std::vector<const_iterator> Codepoints() const;
|
|
|
|
// Returns the list of codepoints of the UnicodeText.
|
|
std::vector<char32> CodepointsChar32() const;
|
|
|
|
std::string ToUTF8String() const;
|
|
std::string UTF8Substring(int begin_codepoint, int end_codepoint) const;
|
|
static std::string UTF8Substring(const const_iterator& it_begin,
|
|
const const_iterator& it_end);
|
|
static UnicodeText Substring(const UnicodeText& text, int begin_codepoint,
|
|
int end_codepoint, bool do_copy = true);
|
|
static UnicodeText Substring(const const_iterator& it_begin,
|
|
const const_iterator& it_end,
|
|
bool do_copy = true);
|
|
|
|
private:
|
|
friend class const_iterator;
|
|
|
|
class Repr { // A byte-string.
|
|
public:
|
|
char* data_;
|
|
int size_;
|
|
int capacity_;
|
|
bool ours_; // Do we own data_?
|
|
|
|
Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {}
|
|
Repr& operator=(Repr&& src);
|
|
~Repr() {
|
|
if (ours_) delete[] data_;
|
|
}
|
|
|
|
void clear();
|
|
void reserve(int capacity);
|
|
void resize(int size);
|
|
|
|
void append(const char* bytes, int byte_length);
|
|
void Copy(const char* data, int size);
|
|
void PointTo(const char* data, int size);
|
|
|
|
private:
|
|
Repr& operator=(const Repr&);
|
|
Repr(const Repr& other);
|
|
};
|
|
|
|
Repr repr_;
|
|
};
|
|
|
|
// A [first, second) pair of iterators delimiting a span of a UnicodeText.
using UnicodeTextRange =
    std::pair<UnicodeText::const_iterator, UnicodeText::const_iterator>;
|
|
|
|
// NOTE: The following are needed to avoid implicit conversion from char* to
|
|
// std::string, or from ::string to std::string, because if this happens it
|
|
// often results in invalid memory access to a temporary object created during
|
|
// such conversion (if do_copy == false).
|
|
// NOTE: These methods don't check if the input string is UTF8 well formed, for
|
|
// efficiency reasons. Use UnicodeText::is_valid() when explicitly needed.
|
|
UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len,
|
|
bool do_copy = true);
|
|
UnicodeText UTF8ToUnicodeText(const char* utf8_buf, bool do_copy = true);
|
|
UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy = true);
|
|
UnicodeText UTF8ToUnicodeText(StringPiece str, bool do_copy = true);
|
|
UnicodeText UTF8ToUnicodeText(absl::string_view str, bool do_copy = true);
|
|
|
|
// Logging support: appends the raw UTF-8 bytes of `message` (data() and
// size_bytes()) to the stream's message buffer and returns the stream so that
// insertions can be chained.
inline logging::LoggingStringStream& operator<<(
    logging::LoggingStringStream& stream, const UnicodeText& message) {
  const char* const utf8_bytes = message.data();
  const int num_bytes = message.size_bytes();
  stream.message.append(utf8_bytes, num_bytes);
  return stream;
}
|
|
|
|
} // namespace libtextclassifier3
|
|
|
|
#endif // LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_
|