You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
441 lines
15 KiB
441 lines
15 KiB
// © 2017 and later: Unicode, Inc. and others.
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#if !UCONFIG_NO_FORMATTING
|
|
|
|
#include "number_affixutils.h"
|
|
#include "unicode/utf16.h"
|
|
#include "unicode/uniset.h"
|
|
|
|
using namespace icu;
|
|
using namespace icu::number;
|
|
using namespace icu::number::impl;
|
|
|
|
// Out-of-line definition of the TokenConsumer destructor, using the compiler-generated body.
// NOTE(review): presumably defined here (rather than in the header) to anchor the class in
// this translation unit -- confirm against number_affixutils.h.
TokenConsumer::~TokenConsumer() = default;
|
|
// Out-of-line definition of the SymbolProvider destructor, using the compiler-generated body.
// NOTE(review): presumably defined here (rather than in the header) to anchor the class in
// this translation unit -- confirm against number_affixutils.h.
SymbolProvider::~SymbolProvider() = default;
|
|
|
|
int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) {
|
|
AffixPatternState state = STATE_BASE;
|
|
int32_t offset = 0;
|
|
int32_t length = 0;
|
|
for (; offset < patternString.length();) {
|
|
UChar32 cp = patternString.char32At(offset);
|
|
|
|
switch (state) {
|
|
case STATE_BASE:
|
|
if (cp == u'\'') {
|
|
// First quote
|
|
state = STATE_FIRST_QUOTE;
|
|
} else {
|
|
// Unquoted symbol
|
|
length++;
|
|
}
|
|
break;
|
|
case STATE_FIRST_QUOTE:
|
|
if (cp == u'\'') {
|
|
// Repeated quote
|
|
length++;
|
|
state = STATE_BASE;
|
|
} else {
|
|
// Quoted code point
|
|
length++;
|
|
state = STATE_INSIDE_QUOTE;
|
|
}
|
|
break;
|
|
case STATE_INSIDE_QUOTE:
|
|
if (cp == u'\'') {
|
|
// End of quoted sequence
|
|
state = STATE_AFTER_QUOTE;
|
|
} else {
|
|
// Quoted code point
|
|
length++;
|
|
}
|
|
break;
|
|
case STATE_AFTER_QUOTE:
|
|
if (cp == u'\'') {
|
|
// Double quote inside of quoted sequence
|
|
length++;
|
|
state = STATE_INSIDE_QUOTE;
|
|
} else {
|
|
// Unquoted symbol
|
|
length++;
|
|
}
|
|
break;
|
|
default:
|
|
UPRV_UNREACHABLE;
|
|
}
|
|
|
|
offset += U16_LENGTH(cp);
|
|
}
|
|
|
|
switch (state) {
|
|
case STATE_FIRST_QUOTE:
|
|
case STATE_INSIDE_QUOTE:
|
|
status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return length;
|
|
}
|
|
|
|
// Converts a literal string into an affix pattern in which every character is taken literally.
//
// Apostrophes are doubled. Runs of the special symbol characters (- + % ‰ ¤) are wrapped in a
// pair of apostrophes; all other code points are copied through unchanged.
//
// @param input The literal text to escape.
// @return The escaped affix pattern.
UnicodeString AffixUtils::escape(const UnicodeString &input) {
    UnicodeString escaped;
    // True while an opening apostrophe has been emitted and not yet closed.
    bool quoteOpen = false;

    int32_t index = 0;
    while (index < input.length()) {
        const UChar32 c = input.char32At(index);
        index += U16_LENGTH(c);

        if (c == u'\'') {
            // An apostrophe is always written doubled, inside or outside of a quoted run.
            escaped.append(u"''", -1);
            continue;
        }

        const bool needsQuoting =
                c == u'-' || c == u'+' || c == u'%' || c == u'‰' || c == u'¤';
        if (needsQuoting != quoteOpen) {
            // Either a quoted run starts here, or the current one ends before this code point.
            escaped.append(u'\'');
            quoteOpen = needsQuoting;
        }
        escaped.append(c);
    }

    // Close a quoted run that reaches the end of the input.
    if (quoteOpen) {
        escaped.append(u'\'');
    }

    return escaped;
}
|
|
|
|
// Maps a symbol token type to the number-format field used to annotate its output.
//
// Both signs map to the sign field, percent and permille to their own fields, and every
// currency variant (single through overflow) to the currency field. Any other type is
// treated as unreachable.
//
// @param type A symbol token type.
// @return The field in the UFIELD_CATEGORY_NUMBER category for that type.
Field AffixUtils::getFieldForType(AffixPatternType type) {
    switch (type) {
        case TYPE_MINUS_SIGN:
        case TYPE_PLUS_SIGN:
            return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};

        case TYPE_PERCENT:
            return {UFIELD_CATEGORY_NUMBER, UNUM_PERCENT_FIELD};

        case TYPE_PERMILLE:
            return {UFIELD_CATEGORY_NUMBER, UNUM_PERMILL_FIELD};

        case TYPE_CURRENCY_SINGLE:
        case TYPE_CURRENCY_DOUBLE:
        case TYPE_CURRENCY_TRIPLE:
        case TYPE_CURRENCY_QUAD:
        case TYPE_CURRENCY_QUINT:
        case TYPE_CURRENCY_OVERFLOW:
            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};

        default:
            UPRV_UNREACHABLE;
    }
}
|
|
|
|
int32_t
|
|
AffixUtils::unescape(const UnicodeString &affixPattern, FormattedStringBuilder &output, int32_t position,
|
|
const SymbolProvider &provider, Field field, UErrorCode &status) {
|
|
int32_t length = 0;
|
|
AffixTag tag;
|
|
while (hasNext(tag, affixPattern)) {
|
|
tag = nextToken(tag, affixPattern, status);
|
|
if (U_FAILURE(status)) { return length; }
|
|
if (tag.type == TYPE_CURRENCY_OVERFLOW) {
|
|
// Don't go to the provider for this special case
|
|
length += output.insertCodePoint(
|
|
position + length,
|
|
0xFFFD,
|
|
{UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD},
|
|
status);
|
|
} else if (tag.type < 0) {
|
|
length += output.insert(
|
|
position + length, provider.getSymbol(tag.type), getFieldForType(tag.type), status);
|
|
} else {
|
|
length += output.insertCodePoint(position + length, tag.codePoint, field, status);
|
|
}
|
|
}
|
|
return length;
|
|
}
|
|
|
|
int32_t AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern,
|
|
const SymbolProvider &provider, UErrorCode &status) {
|
|
int32_t length = 0;
|
|
AffixTag tag;
|
|
while (hasNext(tag, affixPattern)) {
|
|
tag = nextToken(tag, affixPattern, status);
|
|
if (U_FAILURE(status)) { return length; }
|
|
if (tag.type == TYPE_CURRENCY_OVERFLOW) {
|
|
length += 1;
|
|
} else if (tag.type < 0) {
|
|
length += provider.getSymbol(tag.type).length();
|
|
} else {
|
|
length += U16_LENGTH(tag.codePoint);
|
|
}
|
|
}
|
|
return length;
|
|
}
|
|
|
|
bool
|
|
AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) {
|
|
if (affixPattern.length() == 0) {
|
|
return false;
|
|
}
|
|
AffixTag tag;
|
|
while (hasNext(tag, affixPattern)) {
|
|
tag = nextToken(tag, affixPattern, status);
|
|
if (U_FAILURE(status)) { return false; }
|
|
if (tag.type == type) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) {
|
|
if (affixPattern.length() == 0) {
|
|
return false;
|
|
}
|
|
AffixTag tag;
|
|
while (hasNext(tag, affixPattern)) {
|
|
tag = nextToken(tag, affixPattern, status);
|
|
if (U_FAILURE(status)) { return false; }
|
|
if (tag.type < 0 && getFieldForType(tag.type) == Field(UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD)) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Returns a copy of an affix pattern in which tokens of the given type are overwritten with
// `replacementChar`.
//
// For each matching token, the single UTF-16 code unit immediately before the token's end
// offset is replaced.
// NOTE(review): this looks intended for symbols that occupy one code unit in the pattern
// (e.g. '-'); for a multi-unit token only its last unit would change -- confirm with callers.
//
// @param affixPattern The pattern to copy and modify.
// @param type The token type to replace.
// @param replacementChar The code unit written in place of each matching token.
// @param status Error code; on a tokenizer failure the copy as modified so far is returned.
// @return The modified copy of the pattern.
UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type,
                                      char16_t replacementChar, UErrorCode &status) {
    UnicodeString result(affixPattern); // copy
    if (affixPattern.length() == 0) {
        return result;
    }
    for (AffixTag tag; hasNext(tag, affixPattern);) {
        tag = nextToken(tag, affixPattern, status);
        if (U_FAILURE(status)) {
            break;
        }
        if (tag.type != type) {
            continue;
        }
        // tag.offset points just past the token, so the token's last unit is at offset - 1.
        result.replace(tag.offset - 1, 1, replacementChar);
    }
    return result;
}
|
|
|
|
bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern,
|
|
const UnicodeSet& ignorables, UErrorCode& status) {
|
|
if (affixPattern.length() == 0) {
|
|
return true;
|
|
}
|
|
AffixTag tag;
|
|
while (hasNext(tag, affixPattern)) {
|
|
tag = nextToken(tag, affixPattern, status);
|
|
if (U_FAILURE(status)) { return false; }
|
|
if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer,
|
|
UErrorCode& status) {
|
|
if (affixPattern.length() == 0) {
|
|
return;
|
|
}
|
|
AffixTag tag;
|
|
while (hasNext(tag, affixPattern)) {
|
|
tag = nextToken(tag, affixPattern, status);
|
|
if (U_FAILURE(status)) { return; }
|
|
consumer.consumeToken(tag.type, tag.codePoint, status);
|
|
if (U_FAILURE(status)) { return; }
|
|
}
|
|
}
|
|
|
|
// Reads the next token of an affix pattern, resuming from the offset and state stored in `tag`.
//
// Token kinds produced:
//  - a literal code point (TYPE_CODEPOINT), either unquoted or taken from a quoted sequence;
//  - one of the symbols '-', '+', '%', '‰', returned with code point 0;
//  - a currency token for a run of '¤': one to five characters give TYPE_CURRENCY_SINGLE
//    through TYPE_CURRENCY_QUINT, six or more give TYPE_CURRENCY_OVERFLOW.
//
// The returned tag carries the offset just past the consumed text and the state from which
// the following call should resume.
//
// @param tag The tag returned by the previous call, or a default-constructed tag to start.
// @param patternString The affix pattern being tokenized.
// @param status Set to U_ILLEGAL_ARGUMENT_ERROR when the pattern ends inside an unterminated
//               quoted sequence.
// @return The next token, or a tag constructed from -1 when there are no more tokens.
AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) {
    int32_t pos = tag.offset;
    int32_t mode = tag.state;
    const int32_t patternLength = patternString.length();

    while (pos < patternLength) {
        const UChar32 cp = patternString.char32At(pos);
        const int32_t next = pos + U16_LENGTH(cp);
        const bool isQuote = (cp == u'\'');
        const bool isCurrency = (cp == u'¤');

        switch (mode) {
            case STATE_BASE:
                if (isQuote) {
                    // Opening quote: consume it and look at the next code point
                    mode = STATE_FIRST_QUOTE;
                    pos = next;
                    break;
                }
                if (isCurrency) {
                    // Start of a currency run: keep reading to find out how long it is
                    mode = STATE_FIRST_CURR;
                    pos = next;
                    break;
                }
                if (cp == u'-') {
                    return makeTag(next, TYPE_MINUS_SIGN, STATE_BASE, 0);
                }
                if (cp == u'+') {
                    return makeTag(next, TYPE_PLUS_SIGN, STATE_BASE, 0);
                }
                if (cp == u'%') {
                    return makeTag(next, TYPE_PERCENT, STATE_BASE, 0);
                }
                if (cp == u'‰') {
                    return makeTag(next, TYPE_PERMILLE, STATE_BASE, 0);
                }
                return makeTag(next, TYPE_CODEPOINT, STATE_BASE, cp);

            case STATE_FIRST_QUOTE:
                // A second quote is an escaped apostrophe; anything else opens a quoted sequence.
                return makeTag(next, TYPE_CODEPOINT, isQuote ? STATE_BASE : STATE_INSIDE_QUOTE, cp);

            case STATE_INSIDE_QUOTE:
                if (!isQuote) {
                    return makeTag(next, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
                }
                // Possible closing quote: consume it and look at the next code point
                mode = STATE_AFTER_QUOTE;
                pos = next;
                break;

            case STATE_AFTER_QUOTE:
                if (isQuote) {
                    // Doubled quote inside a quoted sequence: a literal apostrophe
                    return makeTag(next, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
                }
                // The quoted sequence has ended; re-evaluate this code point in the base state
                mode = STATE_BASE;
                break;

            case STATE_FIRST_CURR:
                if (!isCurrency) {
                    return makeTag(pos, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
                }
                mode = STATE_SECOND_CURR;
                pos = next;
                break;

            case STATE_SECOND_CURR:
                if (!isCurrency) {
                    return makeTag(pos, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
                }
                mode = STATE_THIRD_CURR;
                pos = next;
                break;

            case STATE_THIRD_CURR:
                if (!isCurrency) {
                    return makeTag(pos, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
                }
                mode = STATE_FOURTH_CURR;
                pos = next;
                break;

            case STATE_FOURTH_CURR:
                if (!isCurrency) {
                    return makeTag(pos, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
                }
                mode = STATE_FIFTH_CURR;
                pos = next;
                break;

            case STATE_FIFTH_CURR:
                if (!isCurrency) {
                    return makeTag(pos, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
                }
                mode = STATE_OVERFLOW_CURR;
                pos = next;
                break;

            case STATE_OVERFLOW_CURR:
                if (!isCurrency) {
                    return makeTag(pos, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
                }
                // Swallow further currency characters and stay in this state
                pos = next;
                break;

            default:
                UPRV_UNREACHABLE;
        }
    }

    // End of string
    switch (mode) {
        case STATE_BASE:
        case STATE_AFTER_QUOTE:
            // No more tokens in string.
            return {-1};

        case STATE_FIRST_QUOTE:
        case STATE_INSIDE_QUOTE:
            // For consistent behavior with the JDK and ICU 58, set an error here.
            status = U_ILLEGAL_ARGUMENT_ERROR;
            return {-1};

        // A currency run that reaches the end of the string still has to be emitted.
        case STATE_FIRST_CURR:
            return makeTag(pos, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
        case STATE_SECOND_CURR:
            return makeTag(pos, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
        case STATE_THIRD_CURR:
            return makeTag(pos, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
        case STATE_FOURTH_CURR:
            return makeTag(pos, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
        case STATE_FIFTH_CURR:
            return makeTag(pos, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
        case STATE_OVERFLOW_CURR:
            return makeTag(pos, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);

        default:
            UPRV_UNREACHABLE;
    }
}
|
|
|
|
bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) {
|
|
// First check for the {-1} and default initializer syntax.
|
|
if (tag.offset < 0) {
|
|
return false;
|
|
} else if (tag.offset == 0) {
|
|
return string.length() > 0;
|
|
}
|
|
// The rest of the fields are safe to use now.
|
|
// Special case: the last character in string is an end quote.
|
|
if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 &&
|
|
string.charAt(tag.offset) == u'\'') {
|
|
return false;
|
|
} else if (tag.state != STATE_BASE) {
|
|
return true;
|
|
} else {
|
|
return tag.offset < string.length();
|
|
}
|
|
}
|
|
|
|
#endif /* #if !UCONFIG_NO_FORMATTING */
|