/* * Copyright (C) 2018 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LIBTEXTCLASSIFIER_UTILS_STRINGS_UTF8_H_ #define LIBTEXTCLASSIFIER_UTILS_STRINGS_UTF8_H_ #include "utils/base/integral_types.h" namespace libtextclassifier3 { // Returns the length (number of bytes) of the Unicode code point starting at // src, based on inspecting just that one byte. Preconditions: src != NULL, // *src can be read. static inline int GetNumBytesForUTF8Char(const char *src) { // On most platforms, char is unsigned by default, but iOS is an exception. // The cast below makes sure we always interpret *src as an unsigned char. return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4" [(*(reinterpret_cast(src)) & 0xFF) >> 4]; } // Returns true if this byte is a trailing UTF-8 byte (10xx xxxx) static inline bool IsTrailByte(char x) { // return (x & 0xC0) == 0x80; // Since trail bytes are always in [0x80, 0xBF], we can optimize: return static_cast(x) < -0x40; } // Returns true iff src points to a well-formed UTF-8 string. bool IsValidUTF8(const char *src, int size); // Helper to ensure that strings are not truncated in the middle of // multi-byte UTF-8 characters. // Given a string, and a position at which to truncate, returns the // last position not after the provided cut point, that would truncate a // full character. int SafeTruncateLength(const char *str, int truncate_at); // Gets a unicode codepoint from a valid utf8 encoding. char32 ValidCharToRune(const char *str); // Checks whether a utf8 encoding is a valid codepoint and returns the number of // bytes of the codepoint. bool IsValidChar(const char *str, int size, int *num_bytes); // Converts a valid codepoint to utf8. // Returns the length of the encoding. int ValidRuneToChar(const char32 rune, char *dest); } // namespace libtextclassifier3 #endif // LIBTEXTCLASSIFIER_UTILS_STRINGS_UTF8_H_