v811_spc009/external/libtextclassifier/native/utils/strings/utf8.cc

/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "utils/strings/utf8.h"

#include "utils/base/logging.h"

namespace libtextclassifier3 {

// Returns true if the first `size` bytes of `src` form a sequence of
// characters that are each accepted by IsValidChar. An empty range
// (size <= 0) is reported as valid because there is nothing to reject.
bool IsValidUTF8(const char *src, int size) {
  int offset = 0;
  while (offset < size) {
    // Byte length of the character starting at `offset`, filled in by
    // IsValidChar.
    int num_bytes;
    const bool ok = IsValidChar(src + offset, size - offset, &num_bytes);
    if (!ok) {
      return false;
    }
    // Step over the whole character to land on the next lead byte.
    offset += num_bytes;
  }
  return true;
}

// Returns the largest index <= `truncate_at` at which `str` can be cut without
// splitting a multi-byte character.
//
// Starting at `truncate_at`, the index is moved backwards while the byte at
// that index is a trail byte (per IsTrailByte) and the index is still
// positive. The byte at `truncate_at` itself is inspected, so the caller must
// make sure that index is readable.
//
// The bound is tested before the byte is read, so a `truncate_at` that is
// zero or negative is returned unchanged without touching `str`.
int SafeTruncateLength(const char *str, int truncate_at) {
  // Always want to truncate at the start of a character, so if
  // it's in a middle, back up toward the start.
  while ((truncate_at > 0) && IsTrailByte(str[truncate_at])) {
    truncate_at--;
  }
  return truncate_at;
}

// Decodes the single UTF-8 encoded character starting at `str` and returns its
// codepoint.
//
// The length of the sequence is derived from the lead byte alone, and only as
// many bytes as that length implies are read. Apart from the TC3_DCHECK below
// no validation is performed here, so the caller is expected to pass a
// well-formed character.
char32 ValidCharToRune(const char *str) {
  TC3_DCHECK(!IsTrailByte(str[0]) && GetNumBytesForUTF8Char(str) > 0);

  // Every byte after the lead byte contributes its low six bits.
  constexpr int kTrailBits = 6;
  constexpr int kTrailMask = 0x3F;

  const unsigned char lead = static_cast<unsigned char>(str[0]);
  if (lead < 0x80) {
    // Lead byte 0xxxxxxx: the byte is the codepoint (U+0000 - U+007F).
    return lead;
  }

  const unsigned char second = static_cast<unsigned char>(str[1]);
  if (lead < 0xE0) {
    // Lead byte 110xxxxx: two bytes, 11 payload bits (U+0080 - U+07FF).
    return ((lead & 0x1F) << kTrailBits) | (second & kTrailMask);
  }

  const unsigned char third = static_cast<unsigned char>(str[2]);
  if (lead < 0xF0) {
    // Lead byte 1110xxxx: three bytes, 16 payload bits (U+0800 - U+FFFF).
    return ((lead & 0x0F) << (2 * kTrailBits)) |
           ((second & kTrailMask) << kTrailBits) | (third & kTrailMask);
  }

  // Lead byte 11110xxx: four bytes, 21 payload bits (U+10000 and above).
  const unsigned char fourth = static_cast<unsigned char>(str[3]);
  return ((lead & 0x07) << (3 * kTrailBits)) |
         ((second & kTrailMask) << (2 * kTrailBits)) |
         ((third & kTrailMask) << kTrailBits) | (fourth & kTrailMask);
}

// Checks whether `str` starts with one well-formed UTF-8 encoded character
// that fits into the `size` bytes available.
//
// Args:
//   str: pointer to the first byte of the candidate character; str[0] is
//     always read.
//   size: number of bytes available starting at `str`.
//   num_bytes: output. Receives the length GetNumBytesForUTF8Char derives from
//     the lead byte. It is not written when str[0] is a trail byte; on any
//     later failure it still holds that derived length.
//
// Returns true if the lead byte announces a length that fits into `size`, all
// following bytes of the character are trail bytes, the encoding is not
// overlong and the codepoint does not exceed U+10FFFF.
bool IsValidChar(const char *str, int size, int *num_bytes) {
  // Unexpected trail byte.
  if (IsTrailByte(str[0])) {
    return false;
  }

  *num_bytes = GetNumBytesForUTF8Char(str);
  if (*num_bytes <= 0 || *num_bytes > size) {
    return false;
  }

  // Check that remaining bytes in the codepoint are trailing bytes.
  for (int k = 1; k < *num_bytes; k++) {
    if (!IsTrailByte(str[k])) {
      return false;
    }
  }

  // Exclude overlong encodings.
  // Check that the codepoint is encoded with the minimum number of required
  // bytes. An ascii value could be encoded in 4, 3 or 2 bytes but requires
  // only 1. There is a unique valid encoding for each code point.
  // This ensures that string comparisons and searches are well-defined.
  // See: https://en.wikipedia.org/wiki/UTF-8
  const char32 codepoint = ValidCharToRune(str);
  switch (*num_bytes) {
    case 1:
      return true;
    case 2:
      // Everything below 128 can be encoded in one byte.
      return (codepoint >= (1 << 7 /* num. payload bits in one byte */));
    case 3:
      // NOTE(review): UTF-16 surrogate codepoints (U+D800 - U+DFFF) pass this
      // check although RFC 3629 forbids encoding them; confirm whether callers
      // rely on that before tightening it.
      return (codepoint >= (1 << 11 /* num. payload bits in two utf8 bytes */));
    case 4:
      // The upper bound is inclusive: U+10FFFF (F4 8F BF BF) is the last valid
      // codepoint and must be accepted.
      return (codepoint >=
              (1 << 16 /* num. payload bits in three utf8 bytes */)) &&
             (codepoint <= 0x10FFFF /* maximum rune value */);
  }
  return false;
}

int ValidRuneToChar(const char32 rune, char *dest) {
  // Convert to unsigned for range check.
  uint32 c;

  // 1 char 00-7F
  c = rune;
  if (c <= 0x7F) {
    dest[0] = static_cast<char>(c);
    return 1;
  }

  // 2 char 0080-07FF
  if (c <= 0x07FF) {
    dest[0] = 0xC0 | static_cast<char>(c >> 1 * 6);
    dest[1] = 0x80 | (c & 0x3F);
    return 2;
  }

  // 3 char 0800-FFFF
  if (c <= 0xFFFF) {
    dest[0] = 0xE0 | static_cast<char>(c >> 2 * 6);
    dest[1] = 0x80 | ((c >> 1 * 6) & 0x3F);
    dest[2] = 0x80 | (c & 0x3F);
    return 3;
  }

  // 4 char 10000-1FFFFF
  dest[0] = 0xF0 | static_cast<char>(c >> 3 * 6);
  dest[1] = 0x80 | ((c >> 2 * 6) & 0x3F);
  dest[2] = 0x80 | ((c >> 1 * 6) & 0x3F);
  dest[3] = 0x80 | (c & 0x3F);
  return 4;
}

}  // namespace libtextclassifier3