You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
254 lines
10 KiB
254 lines
10 KiB
// © 2016 and later: Unicode, Inc. and others.
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
/*
|
|
*******************************************************************************
|
|
* Copyright (C) 2013-2015, International Business Machines
|
|
* Corporation and others. All Rights Reserved.
|
|
*******************************************************************************
|
|
* collationdatareader.h
|
|
*
|
|
* created on: 2013feb07
|
|
* created by: Markus W. Scherer
|
|
*/
|
|
|
|
#ifndef __COLLATIONDATAREADER_H__
|
|
#define __COLLATIONDATAREADER_H__
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#if !UCONFIG_NO_COLLATION
|
|
|
|
#include "unicode/udata.h"
|
|
|
|
struct UDataMemory;
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
struct CollationTailoring;
|
|
|
|
/**
|
|
* Collation binary data reader.
|
|
*/
|
|
struct U_I18N_API CollationDataReader /* all static */ {
|
|
// The following constants are also copied into source/common/ucol_swp.cpp.
|
|
// Keep them in sync!
|
|
enum {
|
|
/**
|
|
* Number of int32_t indexes.
|
|
*
|
|
* Can be 2 if there are only options.
|
|
* Can be 7 or 8 if there are only options and a script reordering.
|
|
* The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0.
|
|
*/
|
|
IX_INDEXES_LENGTH, // 0
|
|
/**
|
|
* Bits 31..24: numericPrimary, for numeric collation
|
|
* 23..16: fast Latin format version (0 = no fast Latin table)
|
|
* 15.. 0: options bit set
|
|
*/
|
|
IX_OPTIONS,
|
|
IX_RESERVED2,
|
|
IX_RESERVED3,
|
|
|
|
/** Array offset to Jamo CE32s in ce32s[], or <0 if none. */
|
|
IX_JAMO_CE32S_START, // 4
|
|
|
|
// Byte offsets from the start of the data, after the generic header.
|
|
// The indexes[] are at byte offset 0, other data follows.
|
|
// Each data item is aligned properly.
|
|
// The data items should be in descending order of unit size,
|
|
// to minimize the need for padding.
|
|
// Each item's byte length is given by the difference between its offset and
|
|
// the next index/offset value.
|
|
/** Byte offset to int32_t reorderCodes[]. */
|
|
IX_REORDER_CODES_OFFSET,
|
|
/**
|
|
* Byte offset to uint8_t reorderTable[].
|
|
* Empty table if <256 bytes (padding only).
|
|
* Otherwise 256 bytes or more (with padding).
|
|
*/
|
|
IX_REORDER_TABLE_OFFSET,
|
|
/** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */
|
|
IX_TRIE_OFFSET,
|
|
|
|
IX_RESERVED8_OFFSET, // 8
|
|
/** Byte offset to int64_t ces[]. */
|
|
IX_CES_OFFSET,
|
|
IX_RESERVED10_OFFSET,
|
|
/** Byte offset to uint32_t ce32s[]. */
|
|
IX_CE32S_OFFSET,
|
|
|
|
/** Byte offset to uint32_t rootElements[]. */
|
|
IX_ROOT_ELEMENTS_OFFSET, // 12
|
|
/** Byte offset to UChar *contexts[]. */
|
|
IX_CONTEXTS_OFFSET,
|
|
/** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */
|
|
IX_UNSAFE_BWD_OFFSET,
|
|
/** Byte offset to uint16_t fastLatinTable[]. */
|
|
IX_FAST_LATIN_TABLE_OFFSET,
|
|
|
|
/** Byte offset to uint16_t scripts[]. */
|
|
IX_SCRIPTS_OFFSET, // 16
|
|
/**
|
|
* Byte offset to UBool compressibleBytes[].
|
|
* Empty table if <256 bytes (padding only).
|
|
* Otherwise 256 bytes or more (with padding).
|
|
*/
|
|
IX_COMPRESSIBLE_BYTES_OFFSET,
|
|
IX_RESERVED18_OFFSET,
|
|
IX_TOTAL_SIZE
|
|
};
|
|
|
|
static void read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,
|
|
CollationTailoring &tailoring, UErrorCode &errorCode);
|
|
|
|
static UBool U_CALLCONV
|
|
isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);
|
|
|
|
private:
|
|
CollationDataReader(); // no constructor
|
|
};
|
|
|
|
/*
|
|
* Format of collation data (ucadata.icu, binary data in coll/ *.res files).
|
|
* Format version 5.
|
|
*
|
|
* The root collation data is stored in the ucadata.icu file.
|
|
* Tailorings are stored inside .res resource bundle files, with a complete file header.
|
|
*
|
|
* Collation data begins with a standard ICU data file header
|
|
* (DataHeader, see ucmndata.h and unicode/udata.h).
|
|
* The UDataInfo.dataVersion field contains the UCA and other version numbers,
|
|
* see the comments for CollationTailoring.version.
|
|
*
|
|
* After the header, the file contains the following parts.
|
|
* Constants are defined as enum values of the CollationDataReader class.
|
|
* See also the Collation class.
|
|
*
|
|
* int32_t indexes[indexesLength];
|
|
* The indexes array has variable length.
|
|
* Some tailorings only need the length and the options,
|
|
* others only add reorderCodes and the reorderTable,
|
|
* some need to store mappings.
|
|
* Only as many indexes are stored as needed to read all of the data.
|
|
*
|
|
* Index 0: indexesLength
|
|
* Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS
|
|
* Index 2..3: Unused/reserved/0.
|
|
* Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo
|
|
* are stored in a short, contiguous part of the ce32s array.
|
|
*
|
|
* Indexes 5..19 are byte offsets in ascending order.
|
|
* Each byte offset marks the start of the next part in the data file,
|
|
* and the end of the previous one.
|
|
* When two consecutive byte offsets are the same (or too short),
|
|
* then the corresponding part is empty.
|
|
* Byte offsets are offsets from after the header,
|
|
* that is, from the beginning of the indexes[].
|
|
* Each part starts at an offset with proper alignment for its data.
|
|
* If necessary, the previous part may include padding bytes to achieve this alignment.
|
|
* The last byte offset that is stored in the indexes indicates the total size of the data
|
|
* (starting with the indexes).
|
|
*
|
|
* int32_t reorderCodes[]; -- empty in root
|
|
* The list of script and reordering codes.
|
|
*
|
|
* Beginning with format version 5, this array may optionally
|
|
* have trailing entries with a full list of reorder ranges
|
|
* as described for CollationSettings::reorderRanges.
|
|
*
|
|
* Script or reorder codes are first and do not exceed 16-bit values.
|
|
* Range limits are stored in the upper 16 bits, and are never 0.
|
|
* Split this array into reorder codes and ranges at the first entry
|
|
* with non-zero upper 16 bits.
|
|
*
|
|
* If the ranges are missing but needed for split-reordered primary lead bytes,
|
|
* then they are regenerated at load time.
|
|
*
|
|
* uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes
|
|
* Primary-weight lead byte permutation table.
|
|
* Normally present when the reorderCodes are, but can be built at load time.
|
|
*
|
|
* Beginning with format version 5, a 0 entry at a non-zero index
|
|
* (which is otherwise an illegal value)
|
|
* means that the primary lead byte is "split"
|
|
* (there are different offsets for primaries that share that lead byte)
|
|
* and the reordering offset must be determined via the reorder ranges
|
|
* that are either stored as part of the reorderCodes array
|
|
* or regenerated at load time.
|
|
*
|
|
* UTrie2 trie; -- see utrie2_impl.h and utrie2.h
|
|
* The trie holds the main collation data. Each code point is mapped to a 32-bit value.
|
|
* It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set,
|
|
* in which case it is a special CE32 and contains a 4-bit tag and further data.
|
|
* See the Collation class for details.
|
|
*
|
|
* The trie has a value for each lead surrogate code unit with some bits encoding
|
|
* collective properties of the 1024 supplementary characters whose UTF-16 form starts with
|
|
* the lead surrogate. See Collation::LEAD_SURROGATE_TAG.
|
|
*
|
|
* int64_t ces[];
|
|
* 64-bit CEs and expansions that cannot be stored in a more compact form.
|
|
*
|
|
* uint32_t ce32s[];
|
|
* CE32s for expansions in compact form, and for characters whose trie values
|
|
* contain special data.
|
|
*
|
|
* uint32_t rootElements[]; -- empty in all tailorings
|
|
* Compact storage for all of the CEs that occur in the root collation.
|
|
* See the CollationRootElements class.
|
|
*
|
|
* UChar *contexts[];
|
|
* Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings.
|
|
*
|
|
* uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize()
|
|
* Serialized form of characters that are unsafe when iterating backwards,
|
|
* and at the end of an identical string prefix.
|
|
* Back up to a safe character.
|
|
* Lead surrogates are "unsafe" when any of their corresponding supplementary
|
|
* code points are unsafe.
|
|
* Does not include [:^lccc=0:][:^tccc=0:].
|
|
* For each tailoring, the root unsafeBackwardSet is subtracted.
|
|
* (As a result, in many tailorings no set needs to be stored.)
|
|
*
|
|
* uint16_t fastLatinTable[];
|
|
* Optional optimization for Latin text.
|
|
* See the CollationFastLatin class.
|
|
*
|
|
* uint16_t scripts[]; -- empty in all tailorings
|
|
* Format version 5:
|
|
* uint16_t numScripts;
|
|
* uint16_t scriptsIndex[numScripts+16];
|
|
* uint16_t scriptStarts[];
|
|
* See CollationData::numScripts etc.
|
|
*
|
|
* Format version 4:
|
|
* Table of the reordering groups with their first and last lead bytes,
|
|
* and their script and reordering codes.
|
|
* See CollationData::scripts.
|
|
*
|
|
* UBool compressibleBytes[]; -- empty in all tailorings
|
|
* Flag for getSortKey(), indicating primary weight lead bytes that are compressible.
|
|
*
|
|
* -----------------
|
|
* Changes for formatVersion 5 (ICU 55)
|
|
*
|
|
* Reordering moves single scripts, not groups of scripts.
|
|
* Reorder ranges are optionally appended to the reorderCodes,
|
|
* and a 0 entry in the reorderTable indicates a split lead byte.
|
|
* The scripts data has a new format.
|
|
*
|
|
* The rootElements may contain secondary and tertiary weights below common=05.
|
|
* (Used for small Hiragana letters.)
|
|
* Where it occurs, there is also an explicit unit with common secondary & tertiary weights.
|
|
* There are no other data structure changes, but builder code needs to be able to handle such data.
|
|
*
|
|
* The collation element for the merge separator code point U+FFFE
|
|
* does not necessarily have special, unique secondary/tertiary weights any more.
|
|
*/
|
|
|
|
U_NAMESPACE_END
|
|
|
|
#endif // !UCONFIG_NO_COLLATION
|
|
#endif // __COLLATIONDATAREADER_H__
|