You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

412 lines
13 KiB

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*****************************************************************
* Copyright (c) 2002-2014, International Business Machines Corporation
* and others. All Rights Reserved.
*****************************************************************
* Date Name Description
* 06/06/2002 aliu Creation.
*****************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_TRANSLITERATION
#include "unicode/uobject.h"
#include "unicode/uscript.h"
#include "anytrans.h"
#include "hash.h"
#include "mutex.h"
#include "nultrans.h"
#include "putilimp.h"
#include "tridpars.h"
#include "uinvchar.h"
#include "uvector.h"
//------------------------------------------------------------
// Constants
// Separators used when composing transliterator IDs (UTF-16 code units).
static const UChar TARGET_SEP  = 0x002D; // '-'
static const UChar VARIANT_SEP = 0x002F; // '/'

// NUL-terminated UTF-16 spellings of fixed ID fragments.
static const UChar ANY[]     = { 0x41, 0x6E, 0x79, 0 };       // "Any"
static const UChar NULL_ID[] = { 0x4E, 0x75, 0x6C, 0x6C, 0 }; // "Null"
static const UChar LATIN_PIVOT[] = {
    0x2D, 0x4C, 0x61, 0x74, 0x6E, 0x3B,   // "-Latn;"
    0x4C, 0x61, 0x74, 0x6E, 0x2D, 0       // "Latn-"
}; // "-Latn;Latn-"

// Initial size for an Any-XXXX transform's cache of script-XXXX transforms.
// The cache grows as necessary, but source text is not expected to contain
// more than 7 scripts.
#define ANY_TRANS_CACHE_INIT_SIZE 7
//------------------------------------------------------------
U_CDECL_BEGIN
/**
 * Deleter callback for cache values: destroys the Transliterator that
 * the untyped pointer refers to.
 *
 * @param obj pointer to an icu::Transliterator, passed as void*.
 */
static void U_CALLCONV
_deleteTransliterator(void *obj) {
    icu::Transliterator* doomed = static_cast<icu::Transliterator*>(obj);
    delete doomed;
}
U_CDECL_END
//------------------------------------------------------------
U_NAMESPACE_BEGIN
//------------------------------------------------------------
// ScriptRunIterator
/**
 * Returns a series of ranges corresponding to scripts. They will be
 * of the form:
 *
 *   ccccSScSSccccTTcTcccc   - c = common, S = first script, T = second
 *   |            |          - first run (start, limit)
 *            |           |  - second run (start, limit)
 *
 * That is, the runs will overlap. The reason for this is so that a
 * transliterator can consider common characters both before and after
 * the scripts.
 */
class ScriptRunIterator : public UMemory {
private:
// The text being scanned; held by reference, not copied.
const Replaceable& text;
// Lower bound (inclusive) of the range being iterated; a run's start
// never moves below this.
int32_t textStart;
// Upper bound (exclusive) of the range being iterated; shifted by
// adjustLimit() when the text changes length.
int32_t textLimit;
public:
/**
 * The code of the current run, valid after next() returns. May
 * be USCRIPT_INVALID_CODE if and only if the entire text is
 * COMMON/INHERITED.
 */
UScriptCode scriptCode;
/**
 * The start of the run, inclusive, valid after next() returns.
 * Not initialized by the constructor.
 */
int32_t start;
/**
 * The end of the run, exclusive, valid after next() returns.
 * The constructor sets it to the start of the range, and next()
 * resumes scanning from this index.
 */
int32_t limit;
/**
 * Constructs a run iterator over the given text from start
 * (inclusive) to limit (exclusive).
 */
ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);
/**
 * Advances to the next run. Returns TRUE if a run was found, in
 * which case the caller should examine scriptCode, start, and
 * limit. Returns FALSE once the previous run's limit has reached
 * the end of the range; for an empty range (start == limit at
 * construction) that is already the first call.
 */
UBool next();
/**
 * Adjusts internal indices for a change in the limit index of the
 * given delta. A positive delta means the limit has increased.
 * Both the current run's limit and the end of the range are shifted.
 */
void adjustLimit(int32_t delta);
private:
// Declared but not defined here, so the class cannot be copied.
ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
};
/**
 * Binds the iterator to theText and positions it before the first run
 * of [myStart, myLimit). scriptCode and start are deliberately left
 * unset here; they only become meaningful once next() has been called.
 */
ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
                                     int32_t myStart, int32_t myLimit) :
    text(theText),
    textStart(myStart),
    textLimit(myLimit),
    limit(myStart)   // next() begins scanning from the current limit
{
}
/**
 * Advances to the next script run and publishes it through scriptCode,
 * start and limit.
 *
 * @return FALSE when the previous run already ended at the end of the
 *         range; TRUE otherwise, including the case where the remaining
 *         text is entirely COMMON/INHERITED (scriptCode is then
 *         USCRIPT_INVALID_CODE).
 */
UBool ScriptRunIterator::next() {
    // One error code is shared by every script lookup in this call.
    UErrorCode status = U_ZERO_ERROR;

    scriptCode = USCRIPT_INVALID_CODE; // no script seen yet
    start = limit;                     // resume where the previous run stopped

    if (start == textLimit) {
        return FALSE;                  // nothing left to scan
    }

    // Extend the run backwards across adjacent COMMON / INHERITED
    // characters, stopping at the range start or at the first character
    // that belongs to a real script.
    for (; start > textStart; --start) {
        const UScriptCode before = uscript_getScript(text.char32At(start - 1), &status);
        if (before != USCRIPT_COMMON && before != USCRIPT_INHERITED) {
            break;
        }
    }

    // Extend the run forwards across COMMON / INHERITED characters and
    // characters of the run's own script. The first non-common script
    // encountered becomes the run's script; a different one ends the run.
    for (; limit < textLimit; ++limit) {
        const UScriptCode here = uscript_getScript(text.char32At(limit), &status);
        if (here == USCRIPT_COMMON || here == USCRIPT_INHERITED) {
            continue;
        }
        if (scriptCode == USCRIPT_INVALID_CODE) {
            scriptCode = here;
            continue;
        }
        if (here != scriptCode) {
            break;
        }
    }

    return TRUE;
}
/**
 * Shifts both the end of the iterated range and the current run's limit
 * by delta, to follow a change in text length. A positive delta means
 * the limit has increased.
 */
void ScriptRunIterator::adjustLimit(int32_t delta) {
    textLimit += delta;
    limit     += delta;
}
//------------------------------------------------------------
// AnyTransliterator
// Macro invocation for AnyTransliterator.
// NOTE(review): presumably expands to the class-ID / RTTI boilerplate that
// the class declaration in anytrans.h expects — confirm against uobject.h.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)
/**
 * Constructs an Any-to-target transliterator.
 *
 * @param id              ID passed to the Transliterator base class.
 * @param theTarget       target name.
 * @param theVariant      variant name; when non-empty it is appended to
 *                        the stored target as "target/variant".
 * @param theTargetScript script code of the target.
 * @param ec              receives the error from opening the cache. On
 *                        failure the stored target is left unset.
 */
AnyTransliterator::AnyTransliterator(const UnicodeString& id,
                                     const UnicodeString& theTarget,
                                     const UnicodeString& theVariant,
                                     UScriptCode theTargetScript,
                                     UErrorCode& ec) :
    Transliterator(id, NULL),
    targetScript(theTargetScript)
{
    cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec);
    if (U_SUCCESS(ec)) {
        // Cached transliterators are released through the value deleter.
        uhash_setValueDeleter(cache, _deleteTransliterator);

        target = theTarget;
        const UBool hasVariant = theVariant.length() > 0;
        if (hasVariant) {
            target.append(VARIANT_SEP).append(theVariant);
        }
    }
}
/**
 * Destructor. Closes the script-to-transliterator cache.
 * NOTE(review): the constructors install _deleteTransliterator as the
 * cache's value deleter, so closing presumably also deletes the cached
 * transliterators — confirm against uhash_close().
 */
AnyTransliterator::~AnyTransliterator() {
uhash_close(cache);
}
/**
 * Copy constructor. Copies the target and target script, but gives the
 * new object a fresh, empty cache instead of copying the cached
 * transliterators. An error from opening the cache is not reported to
 * the caller.
 */
AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
    Transliterator(o),
    target(o.target),
    targetScript(o.targetScript)
{
    UErrorCode status = U_ZERO_ERROR;
    cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &status);
    if (U_SUCCESS(status)) {
        uhash_setValueDeleter(cache, _deleteTransliterator);
    }
}
/**
 * Transliterator API.
 * Returns a newly allocated copy of this object, made with the copy
 * constructor.
 * NOTE(review): the caller presumably owns the returned object — confirm
 * against the Transliterator::clone() contract.
 */
AnyTransliterator* AnyTransliterator::clone() const {
return new AnyTransliterator(*this);
}
/**
 * Implements {@link Transliterator#handleTransliterate}.
 *
 * Walks the script runs of the context range [pos.contextStart,
 * pos.contextLimit) and, for each run that reaches into
 * [pos.start, pos.limit), applies the transliterator that
 * getTransliterator() returns for the run's script. Runs for which no
 * transliterator is available are skipped.
 *
 * @param text          the text to modify in place.
 * @param pos           on entry, the range to transliterate and its
 *                      context; on exit, pos.limit reflects any change in
 *                      text length and pos.start is where the last
 *                      transliterator left it or the end of the last
 *                      skipped run.
 * @param isIncremental passed on only to a run that reaches the
 *                      transliteration limit; earlier runs are always
 *                      processed non-incrementally.
 */
void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
UBool isIncremental) const {
// Remember the overall range; pos.start / pos.limit are overwritten per run.
int32_t allStart = pos.start;
int32_t allLimit = pos.limit;
// Iterate over the whole context so runs can include surrounding
// COMMON/INHERITED characters.
ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
while (it.next()) {
// Ignore runs in the ante context
if (it.limit <= allStart) continue;
// Try to instantiate transliterator from it.scriptCode to
// our target or target/variant
Transliterator* t = getTransliterator(it.scriptCode);
if (t == NULL) {
// We have no transliterator. Do nothing, but keep
// pos.start up to date.
pos.start = it.limit;
continue;
}
// If the run end is before the transliteration limit, do
// a non-incremental transliteration. Otherwise do an
// incremental one.
UBool incremental = isIncremental && (it.limit >= allLimit);
// Clamp the run to the requested range before handing it over.
pos.start = uprv_max(allStart, it.start);
pos.limit = uprv_min(allLimit, it.limit);
// Remember the limit so the change in length can be measured afterwards.
int32_t limit = pos.limit;
t->filteredTransliterate(text, pos, incremental);
// Propagate the length change to the overall limit and to the iterator.
int32_t delta = pos.limit - limit;
allLimit += delta;
it.adjustLimit(delta);
// We're done if we enter the post context
if (it.limit >= allLimit) break;
}
// Restore limit. pos.start is fine where the last transliterator
// left it, or at the end of the last run.
pos.limit = allLimit;
}
/**
 * Returns the transliterator that converts text in the given source
 * script to this object's target (or target/variant), creating and
 * caching it on first use.
 *
 * The direct "Source-Target" transliterator is tried first. If it cannot
 * be created, a compound "Source-Latn;Latn-Target" transliterator that
 * pivots through Latin is tried instead.
 *
 * Cache reads and writes are done while holding the global mutex
 * (Mutex m(NULL)); creation happens outside the lock, so two threads may
 * both create an instance, in which case the one that loses the race is
 * deleted and the cached instance is returned.
 *
 * @param source script code of the text to convert.
 * @return the cached transliterator (owned by the cache), or NULL if
 *         source is the target script or USCRIPT_INVALID_CODE, if no
 *         transliterator could be created, or if it could not be cached.
 */
Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
    if (source == targetScript || source == USCRIPT_INVALID_CODE) {
        return NULL;
    }

    // The copy constructor does not report a failure to open the cache,
    // so guard against a missing table rather than dereferencing it.
    if (cache == NULL) {
        return NULL;
    }

    const int32_t key = static_cast<int32_t>(source);

    Transliterator* t = NULL;
    {
        Mutex m(NULL);
        t = static_cast<Transliterator*>(uhash_iget(cache, key));
    }
    if (t != NULL) {
        return t;
    }

    UErrorCode ec = U_ZERO_ERROR;
    UnicodeString sourceName(uscript_getShortName(source), -1, US_INV);
    UnicodeString id(sourceName);
    id.append(TARGET_SEP).append(target);

    t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
    if (U_FAILURE(ec) || t == NULL) {
        delete t;

        // Try to pivot around Latin, our most common script
        id = sourceName;
        id.append(LATIN_PIVOT, -1).append(target);

        // Clear the failure left by the first attempt: ICU functions
        // return immediately when handed a failing error code, so without
        // this reset the pivot attempt could never succeed.
        ec = U_ZERO_ERROR;
        t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
        if (U_FAILURE(ec) || t == NULL) {
            delete t;
            return NULL;
        }
    }

    Transliterator* raceLoser = NULL;
    {
        Mutex m(NULL);
        Transliterator* cached = static_cast<Transliterator*>(uhash_iget(cache, key));
        if (cached == NULL) {
            // Common case, no race to cache this new transliterator.
            uhash_iput(cache, key, t, &ec);
            if (U_FAILURE(ec)) {
                // A value deleter is installed, so the table owns t after
                // the call even when the put fails; t must not be
                // returned or deleted again here.
                t = NULL;
            }
        } else {
            // Race case, some other thread beat us to caching this
            // transliterator. Return the cached one and discard ours.
            raceLoser = t;
            t = cached;
        }
    }
    delete raceLoser; // non-NULL only in case of races.

    return t;
}
/**
 * Maps a script name to its script code.
 *
 * @param name the candidate script name.
 * @return the matching script code, or USCRIPT_INVALID_CODE if the name
 *         contains non-invariant characters, does not resolve to exactly
 *         one script, or the lookup reports an error.
 */
static UScriptCode scriptNameToCode(const UnicodeString& name) {
    const int32_t nameLen = name.length();

    // Only invariant-character names can be converted to a char buffer
    // for the lookup below.
    if (!uprv_isInvariantUString(name.getBuffer(), nameLen)) {
        return USCRIPT_INVALID_CODE;
    }

    char buf[128];
    name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
    buf[sizeof(buf) - 1] = 0; // Make sure that we NULL terminate the string.

    UScriptCode code;
    UErrorCode ec = U_ZERO_ERROR;
    const int32_t found = uscript_getCode(buf, &code, 1, &ec);
    if (found != 1 || U_FAILURE(ec)) {
        return USCRIPT_INVALID_CODE;
    }
    return code;
}
/**
* Registers standard transliterators with the system. Called by
* Transliterator during initialization. Scan all current targets and
* register those that are scripts T as Any-T/V.
*/
void AnyTransliterator::registerIDs() {
UErrorCode ec = U_ZERO_ERROR;
Hashtable seen(TRUE, ec);
int32_t sourceCount = Transliterator::_countAvailableSources();
for (int32_t s=0; s<sourceCount; ++s) {
UnicodeString source;
Transliterator::_getAvailableSource(s, source);
// Ignore the "Any" source
if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
int32_t targetCount = Transliterator::_countAvailableTargets(source);
for (int32_t t=0; t<targetCount; ++t) {
UnicodeString target;
Transliterator::_getAvailableTarget(t, source, target);
// Only process each target once
if (seen.geti(target) != 0) continue;
ec = U_ZERO_ERROR;
seen.puti(target, 1, ec);
// Get the script code for the target. If not a script, ignore.
UScriptCode targetScript = scriptNameToCode(target);
if (targetScript == USCRIPT_INVALID_CODE) continue;
int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
// assert(variantCount >= 1);
for (int32_t v=0; v<variantCount; ++v) {
UnicodeString variant;
Transliterator::_getAvailableVariant(v, source, target, variant);
UnicodeString id;
TransliteratorIDParser::STVtoID(UnicodeString(TRUE, ANY, 3), target, variant, id);
ec = U_ZERO_ERROR;
AnyTransliterator* tl = new AnyTransliterator(id, target, variant,
targetScript, ec);
if (U_FAILURE(ec)) {
delete tl;
} else {
Transliterator::_registerInstance(tl);
Transliterator::_registerSpecialInverse(target, UnicodeString(TRUE, NULL_ID, 4), FALSE);
}
}
}
}
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_TRANSLITERATION */
//eof