You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

150 lines
4.3 KiB

/*
* wchar_t helpers, version CPython >= 3.3.
*
* CPython 3.3 added support for sys.maxunicode == 0x10FFFF on all
* platforms, even ones with wchar_t limited to 2 bytes. As such,
* this code here works from the outside like wchar_helper.h in the
* case Py_UNICODE_SIZE == 4, but the implementation is very different.
*/
typedef uint16_t cffi_char16_t;
typedef uint32_t cffi_char32_t;
static PyObject *
_my_PyUnicode_FromChar32(const cffi_char32_t *w, Py_ssize_t size)
{
return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, w, size);
}
static PyObject *
_my_PyUnicode_FromChar16(const cffi_char16_t *w, Py_ssize_t size)
{
/* are there any surrogate pairs, and if so, how many? */
Py_ssize_t i, count_surrogates = 0;
for (i = 0; i < size - 1; i++) {
if (0xD800 <= w[i] && w[i] <= 0xDBFF &&
0xDC00 <= w[i+1] && w[i+1] <= 0xDFFF)
count_surrogates++;
}
if (count_surrogates == 0) {
/* no, fast path */
return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, w, size);
}
else
{
PyObject *result = PyUnicode_New(size - count_surrogates, 0x10FFFF);
Py_UCS4 *data;
assert(PyUnicode_KIND(result) == PyUnicode_4BYTE_KIND);
data = PyUnicode_4BYTE_DATA(result);
for (i = 0; i < size; i++)
{
cffi_char32_t ch = w[i];
if (0xD800 <= ch && ch <= 0xDBFF && i < size - 1) {
cffi_char32_t ch2 = w[i + 1];
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
i++;
}
}
*data++ = ch;
}
return result;
}
}
static int
_my_PyUnicode_AsSingleChar16(PyObject *unicode, cffi_char16_t *result,
char *err_got)
{
cffi_char32_t ch;
if (PyUnicode_GET_LENGTH(unicode) != 1) {
sprintf(err_got, "unicode string of length %zd",
PyUnicode_GET_LENGTH(unicode));
return -1;
}
ch = PyUnicode_READ_CHAR(unicode, 0);
if (ch > 0xFFFF)
{
sprintf(err_got, "larger-than-0xFFFF character");
return -1;
}
*result = (cffi_char16_t)ch;
return 0;
}
static int
_my_PyUnicode_AsSingleChar32(PyObject *unicode, cffi_char32_t *result,
char *err_got)
{
if (PyUnicode_GET_LENGTH(unicode) != 1) {
sprintf(err_got, "unicode string of length %zd",
PyUnicode_GET_LENGTH(unicode));
return -1;
}
*result = PyUnicode_READ_CHAR(unicode, 0);
return 0;
}
static Py_ssize_t _my_PyUnicode_SizeAsChar16(PyObject *unicode)
{
Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
Py_ssize_t result = length;
unsigned int kind = PyUnicode_KIND(unicode);
if (kind == PyUnicode_4BYTE_KIND)
{
Py_UCS4 *data = PyUnicode_4BYTE_DATA(unicode);
Py_ssize_t i;
for (i = 0; i < length; i++) {
if (data[i] > 0xFFFF)
result++;
}
}
return result;
}
static Py_ssize_t _my_PyUnicode_SizeAsChar32(PyObject *unicode)
{
return PyUnicode_GET_LENGTH(unicode);
}
static int _my_PyUnicode_AsChar16(PyObject *unicode,
cffi_char16_t *result,
Py_ssize_t resultlen)
{
Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
unsigned int kind = PyUnicode_KIND(unicode);
void *data = PyUnicode_DATA(unicode);
Py_ssize_t i;
for (i = 0; i < len; i++) {
cffi_char32_t ordinal = PyUnicode_READ(kind, data, i);
if (ordinal > 0xFFFF) {
if (ordinal > 0x10FFFF) {
PyErr_Format(PyExc_ValueError,
"unicode character out of range for "
"conversion to char16_t: 0x%x", (int)ordinal);
return -1;
}
ordinal -= 0x10000;
*result++ = 0xD800 | (ordinal >> 10);
*result++ = 0xDC00 | (ordinal & 0x3FF);
}
else
*result++ = ordinal;
}
return 0;
}
static int _my_PyUnicode_AsChar32(PyObject *unicode,
cffi_char32_t *result,
Py_ssize_t resultlen)
{
if (PyUnicode_AsUCS4(unicode, (Py_UCS4 *)result, resultlen, 0) == NULL)
return -1;
return 0;
}