<?xml version="1.0" encoding="UTF-8"?>
<!-- © 2019 and later: Unicode, Inc. and others.
|
|
License & terms of use: http://www.unicode.org/copyright.html -->
|
|
|
|
<!--================================================================================
|
|
Setup:
|
|
Follow the installation instructions in README.txt in this directory.
|
|
|
|
To build ICU data files:
|
|
1: Determine the CLDR production data directory and set the CLDR_DATA_DIR environment variable (or pass -DcldrDataDir=<dir>).
|
|
2: Determine the flags required (see the list of properties below).
|
|
3: Run: ant -f build-icu-data.xml -D<flag-name>=<flag-value>...
|
|
================================================================================-->
|
|
<!-- TODO: Add things like copying of a template directory and deleting previous files
|
|
(perhaps always generate into a temporary directory and copy back to avoid having
|
|
inconsistent state when the conversion is cancelled). -->
|
|
<project name="Convert" default="all" basedir=".">
|
|
|
|
<!-- Default entry point: has no tasks of its own and only pulls in the targets listed
     in "depends" (argument setup, JAR build, clean, then conversion). -->
<target name="all"
        depends="init-args, prepare-jar, clean, convert"/>
|
|
|
|
<!-- Initialize the properties which were not already set on the command line. -->
|
|
<target name="init-args">
    <!-- Expose the process environment as "env.*" properties. -->
    <property environment="env"/>

    <!-- Inherit properties from environment variable unless specified. As usual
         with Ant, this is messier than it should be. All we are saying here is:
         "Use the property if explicitly set, otherwise use the environment variable."
         We cannot just set the property to the environment variable, since expansion
         fails for non-existent properties, and you are left with a literal value of
         "${env.CLDR_DATA_DIR}". -->
    <condition property="cldrDataDir" value="${env.CLDR_DATA_DIR}">
        <isset property="env.CLDR_DATA_DIR"/>
    </condition>
    <fail unless="cldrDataDir"
          message="Set the CLDR_DATA_DIR environment variable (or cldrDataDir property) to the CLDR data directory (typically ending in '/production')"/>

    <!-- Ant does not inherit this from the user's environment (and it can matter).
         This is only needed because we have to "exec" a new Ant task below. -->
    <condition property="javaHome" value="${env.JAVA_HOME}">
        <isset property="env.JAVA_HOME"/>
    </condition>
    <!-- Fallback for when neither the javaHome property nor JAVA_HOME is set: use the
         home of the JVM that is running this Ant process. Without this, the "convert"
         target would export the unexpanded text "${javaHome}" as JAVA_HOME. Properties
         are immutable, so this line has no effect if javaHome was already set above or
         on the command line. -->
    <property name="javaHome" value="${java.home}"/>

    <!-- The output directory into which to write the converted ICU data. By default
         this will overwrite (without deletion) the ICU data files in this ICU release,
         so it is recommended that for testing, it be set to another value. -->
    <property name="outDir" value="${basedir}/../../../icu4c/source/data/"/>

    <!-- The directory in which the additional ICU XML data is stored. -->
    <property name="specialsDir" value="${basedir}/../../../icu4c/source/data/xml"/>

    <!-- Default value for ICU version (icuver.txt). Update this for each release. -->
    <property name="icuVersion" value="68.2.0.0"/>

    <!-- Default value for ICU data version (icuver.txt). Update this for each release. -->
    <property name="icuDataVersion" value="68.2.0.0"/>

    <!-- An override for the CLDR version string (icuver.txt and others). This will be
         extracted from the CLDR library used for building the data if not set here. -->
    <property name="cldrVersion" value=""/>

    <!-- The minimum draft status for CLDR data to be used in the conversion. See
         CldrDraftStatus for more details. -->
    <property name="minDraftStatus" value="contributed"/>

    <!-- A regular expression to match the locale IDs to be generated (useful for
         debugging specific regions). This is applied after locale ID specifications
         have been expanded into full locale IDs, so the value "en" will NOT match
         "en_GB" or "en_001" etc. -->
    <property name="localeIdFilter" value=""/>

    <!-- Whether to synthetically generate "pseudo locale" data ("en_XA" and "ar_XB"). -->
    <property name="includePseudoLocales" value="false"/>

    <!-- Whether to emit a debug report containing some possibly useful information after
         the conversion has finished. -->
    <!-- TODO: Currently this isn't hugely useful, so find out what people want. -->
    <property name="emitReport" value="false"/>

    <!-- List of output "types" to be generated (e.g. "rbnf,plurals,locales"); an empty
         list means "build everything".

         Note that the grouping of types is based on the legacy converter behaviour and
         is not always directly associated with an output directory (e.g. "locales"
         produces locale data for curr/, lang/, main/, region/, unit/, zone/ but NOT
         coll/, brkitr/ or rbnf/).

         Pass in the value "HELP" (or any invalid value) to see the full list of types. -->
    <!-- TODO: Find out what common use cases are and use them. -->
    <property name="outputTypes" value=""/>

    <!-- Override to force the 'clean' task to delete files it cannot determine to be
         auto-generated by this tool. This is useful if the file header changes since
         the heading is what's used to recognize auto-generated files. -->
    <property name="forceDelete" value="false"/>
</target>
|
|
|
|
<!-- Build a standalone JAR which is called by Ant (and which avoids needing to mess
|
|
about making Ant know the Maven class-path). -->
|
|
<target depends="init-args"
        name="prepare-jar">
    <!-- Invoke Maven's "compile" phase. The "mvn" executable is looked up on the
         system path, and a non-zero exit status from Maven fails this build. -->
    <exec failonerror="true"
          searchpath="true"
          executable="mvn">
        <arg value="compile"/>
    </exec>
</target>
|
|
|
|
<!-- Somewhat hacky wrapper target which invokes the real conversion task.
|
|
This is done so we can set the environment variable of the new process and
|
|
effectively overwrite the CLDR_DIR value. If ever the CLDR library doesn't
|
|
need to use CLDR_DIR at runtime to find the production data, this can all be
|
|
removed. -->
|
|
<target name="convert" depends="init-args, prepare-jar">
    <!-- Re-invoke Ant as a child process on this same build file, running the
         "convert-impl" target with a modified environment. The build fails if the
         child process exits with a non-zero status. -->
    <exec executable="ant" searchpath="true" failonerror="true">
        <!-- The CLDR library wants CLDR_DIR set, to the data directory. -->
        <env key="CLDR_DIR" value="${cldrDataDir}"/>
        <!-- Force inherit JAVA_HOME (this can be important). -->
        <env key="JAVA_HOME" value="${javaHome}"/>

        <!-- Initial Ant command line with all the "interesting" bit in. Each token is
             passed as its own <arg value="..."/> rather than a single <arg line="..."/>:
             "line" is split on whitespace, which would break the command line when the
             CLDR data directory path contains a space. -->
        <arg value="-f"/>
        <arg value="build-icu-data.xml"/>
        <arg value="convert-impl"/>
        <arg value="-DcldrDir=${cldrDataDir}"/>

        <!-- List all properties in the "convert-impl" task (except cldrDir). -->
        <arg value="-DoutDir=${outDir}"/>
        <arg value="-DspecialsDir=${specialsDir}"/>
        <arg value="-DoutputTypes=${outputTypes}"/>
        <arg value="-DicuVersion=${icuVersion}"/>
        <arg value="-DicuDataVersion=${icuDataVersion}"/>
        <arg value="-DcldrVersion=${cldrVersion}"/>
        <arg value="-DminDraftStatus=${minDraftStatus}"/>
        <arg value="-DlocaleIdFilter=${localeIdFilter}"/>
        <arg value="-DincludePseudoLocales=${includePseudoLocales}"/>
        <arg value="-DemitReport=${emitReport}"/>
    </exec>
</target>
|
|
|
|
<!-- Do the actual CLDR data conversion, based on the command line arguments, built in
|
|
default properties and the configuration in the "<convert>" element below. -->
|
|
<target name="convert-impl">
    <!-- Register the custom <convert> task from the standalone JAR under target/. -->
    <taskdef classname="org.unicode.icu.tool.cldrtoicu.ant.ConvertIcuDataTask"
             name="convert">
        <classpath>
            <pathelement path="target/cldr-to-icu-1.0-SNAPSHOT-jar-with-dependencies.jar"/>
        </classpath>
    </taskdef>

    <!-- Every attribute below is fed from a property of the same (or similar) name, as
         passed on the command line by the "convert" wrapper target. -->
    <convert cldrDir="${cldrDir}"
             outputDir="${outDir}"
             specialsDir="${specialsDir}"
             outputTypes="${outputTypes}"
             cldrVersion="${cldrVersion}"
             icuVersion="${icuVersion}"
             icuDataVersion="${icuDataVersion}"
             minimalDraftStatus="${minDraftStatus}"
             localeIdFilter="${localeIdFilter}"
             includePseudoLocales="${includePseudoLocales}"
             emitReport="${emitReport}">

        <!-- The primary set of locale IDs to be generated by default. The IDs in this list are
             automatically expanded to include default scripts and all available regions. The
             rules are:

             1) Base languages are expanded to include default scripts (e.g. "en" -> "en_Latn").
             2) All region and variant subtags are added for any base language or language+script
                (e.g. "en" -> "en_GB" or "shi_Latn" -> "shi_Latn_MA").

             If a non-default script is desired it should be listed explicitly (e.g. "sr_Latn").

             Locale IDs with deprecated subtags (which become aliases) must still be listed in
             full (e.g. "en_RH" or "sr_Latn_YU").
        -->
        <localeIds>
            // A
            af, agq, ak, am, ar, ars, as, asa, ast, az, az_AZ, az_Cyrl

            // B
            bas, be, bem, bez, bg, bm, bn, bo, br, brx, bs, bs_BA, bs_Cyrl

            // C
            ca, ccp, ce, ceb, cgg, chr, ckb, cs, cy

            // D
            da, dav, de, dje, doi, dsb, dua, dyo, dz

            // E
            ebu, ee, el, en, en_NH, en_RH, eo, es, et, eu, ewo

            // F
            fa, ff, ff_Adlm, ff_CM, ff_GN, ff_MR, ff_SN, fi, fil, fo, fr, fur, fy

            // G
            ga, gd, gl, gsw, gu, guz, gv

            // H
            ha, haw, he, hi, hr, hsb, hu, hy

            // I
            ia, id, ig, ii, in, in_ID, is, it, iw, iw_IL

            // J
            ja, jgo, jmc, jv

            // K
            ka, kab, kam, kde, kea, khq, ki, kk, kkj, kl, kln, km, kn, ko, kok, ks
            ks_IN, ksb, ksf, ksh, ku, kw, ky

            // L
            lag, lb, lg, lkt, ln, lo, lrc, lt, lu, luo, luy, lv

            // M
            mai, mas, mer, mfe, mg, mgh, mgo, mi, mk, ml, mn, mni, mni_IN, mo, mr, ms
            mt, mua, my, mzn

            // N
            naq, nb, nd, ne, nl, nmg, nn, nnh, no, no_NO, no_NO_NY, nus, nyn

            // O
            om, or, os

            // P
            pa, pa_Arab, pa_IN, pa_PK, pcm, pl, ps, pt

            // Q
            qu

            // R
            rm, rn, ro, rof, ru, rw, rwk

            // S
            sa, sah, saq, sat, sat_IN, sbp, sd, sd_Deva, sd_PK, se, seh, ses, sg, sh, sh_BA, sh_CS, sh_YU
            shi, shi_Latn, shi_MA, si, sk, sl, smn, sn, so, sq, sr, sr_BA, sr_CS, sr_Cyrl_CS, sr_Cyrl_YU, sr_Latn
            sr_Latn_CS, sr_Latn_YU, sr_ME, sr_RS, sr_XK, sr_YU, su, su_ID, sv, sw

            // T
            ta, te, teo, tg, th, ti, tk, tl, tl_PH, to, tr, tt, twq, tzm

            // U
            ug, uk, ur, uz, uz_AF, uz_Arab, uz_Cyrl, uz_UZ

            // V
            vai, vai_LR, vai_Latn, vi, vun

            // W
            wae, wo

            // X
            xh, xog

            // Y
            yav, yi, yo, yue, yue_CN, yue_HK, yue_Hans

            // Z
            zgh, zh, zh_CN, zh_HK, zh_Hant, zh_MO, zh_SG, zh_TW, zu
        </localeIds>

        <!-- The following elements configure directories in which a subset of the available
             locales IDs should be generated. Unlike the main <localeIds> element, these
             filters must specify all locale IDs in full (but since they mostly select base
             languages, this isn't a big deal).

             As well as allowing some data directories to have a subset of available data (via
             the <localeIds> element) there are also mechanisms for controlling aliasing and
             the locale parent relation which allows the sharing of some ICU data in cases
             where it would otherwise need to be copied. The two mechanisms are:

             1: inheritLanguageSubtag: Used to rewrite the parent of a locale ID from "root" to
                its language subtag (e.g. "zh_Hant" has a natural parent of "root", but to allow
                some base language data to be shared it can be made to have a parent of "zh").

             2: forcedAlias: Used to add aliases for specific directories in order to affect the
                ICU behaviour in special cases.

             Between them these mechanisms are known as "tailorings" of the affected locales. -->
        <!-- TODO: Explain why these special cases are needed/different. -->

        <!-- Collation data is large, but also more sharable than other data, which is why there
             are a number of aliases and parent remappings for this directory. -->
        <directory dir="coll"
                   inheritLanguageSubtag="bs_Cyrl, sr_Latn, zh_Hant">
            <!-- These aliases are to avoid needing to copy and maintain the same collation data
                 for "zh" and "yue". The maximized version of "yue_Hans" is "yue_Hans_CN" (vs
                 "zh_Hans_CN"), and for "yue" it's "yue_Hant_HK" (vs "zh_Hant_HK"), so the
                 aliases are effectively just rewriting the base language. -->
            <forcedAlias source="yue" target="zh_Hant"/>
            <forcedAlias source="yue_Hant" target="zh_Hant"/>
            <forcedAlias source="yue_CN" target="zh_Hans"/>
            <forcedAlias source="yue_Hans" target="zh_Hans"/>
            <forcedAlias source="yue_Hans_CN" target="zh_Hans"/>

            <!-- TODO: Find out and document this properly. -->
            <forcedAlias source="sr_ME" target="sr_Cyrl_ME"/>

            <localeIds>
                root,

                // A-B
                af, am, ars, ar, as, az, be, bg, bn, bo, br, bs_Cyrl, bs,

                // C-F
                ca, ceb, chr, cs, cy, da, de_AT, de, dsb, dz, ee, el, en,
                en_US_POSIX, en_US, eo, es, et, fa_AF, fa, ff_Adlm, ff, fil, fi, fo, fr_CA, fr,

                // G-J
                ga, gl, gu, ha, haw, he, hi, hr, hsb, hu, hy,
                id_ID, id, ig, in, in_ID, is, it, iw_IL, iw, ja,

                // K-P
                ka, kk, kl, km, kn, kok, ko, ku, ky, lb, lkt, ln, lo, lt, lv,
                mk, ml, mn, mo, mr, ms, mt, my, nb, ne, nl, nn, no_NO, no,
                om, or, pa_IN, pa, pa_Guru, pl, ps, pt,

                // R-T
                ro, ru, sa, se, sh_BA, sh_CS, sh, sh_YU, si, sk, sl, smn, sq,
                sr_BA, sr_Cyrl_ME, sr_Latn, sr_ME, sr_RS, sr, sv, sw,
                ta, te, th, tk, to, tr,

                // U-Z
                ug, uk, ur, uz, vi, wae, wo, xh, yi, yo, yue_CN, yue_Hans_CN, yue_Hans
                yue_Hant, yue, zh_CN, zh_Hans, zh_Hant, zh_HK, zh_MO, zh_SG, zh_TW, zh, zu
            </localeIds>
        </directory>

        <directory dir="rbnf">
            <!-- It is not at all clear why this is being done. It's certainly not exactly the
                 same as above, since (a) the alias is reversed (b) "zh_Hant" does exist, with
                 different data than "yue", so this alias is not just rewriting the base
                 language. -->
            <!-- TODO: Find out and document this properly. -->
            <forcedAlias source="zh_Hant_HK" target="yue"/>

            <localeIds>
                root,

                // A-E
                af, ak, am, ars, ar, az, be, bg, bs, ca, ccp, chr, cs, cy,
                da, de_CH, de, ee, el, en_001, en_IN, en, eo, es_419, es_DO,
                es_GT, es_HN, es_MX, es_NI, es_PA, es_PR, es_SV, es, es_US, et,

                // F-P
                fa_AF, fa, ff, fil, fi, fo, fr_BE, fr_CH, fr, ga, he, hi, hr,
                hu, hy, id, in, is, it, iw, ja, ka, kl, km, ko, ky, lb,
                lo, lrc, lt, lv, mk, ms, mt, my, nb, nl, nn, no, pl, pt_PT, pt,

                // Q-Z
                qu, ro, ru, se, sh, sk, sl, sq, sr_Latn, sr, su, sv, sw, ta, th, tr,
                uk, vi, yue_Hans, yue, zh_Hant_HK, zh_Hant, zh_HK, zh_MO, zh_TW, zh
            </localeIds>
        </directory>

        <directory dir="brkitr"
                   inheritLanguageSubtag="zh_Hant">
            <localeIds>
                root,
                de, el, en, en_US_POSIX, en_US, es, fr, it, ja, pt, ru, zh_Hant, zh
            </localeIds>
        </directory>

        <!-- GLOBAL ALIASES -->

        <!-- Some spoken languages (e.g. "ars") inherit all their data from a written language
             (e.g. "ar_SA"). However CLDR doesn't currently support a way to represent that
             relationship. Unlike deprecated languages for which an alias can be inferred from
             the "languageAlias" CLDR data, there's no way in CLDR to represent the fact that
             we want "ars" (a non-deprecated language) to inherit the data of "ar_SA".

             This alias is the first example of potentially many cases where ICU needs to
             generate an alias in order to affect "sideways inheritance" for spoken languages,
             and at some stage it should probably be supported properly in the CLDR data. -->
        <forcedAlias source="ars" target="ar_SA"/>

        <!-- A legacy global alias (note that "no_NO_NY" is not even structurally valid). -->
        <forcedAlias source="no_NO_NY" target="nn_NO"/>

        <!-- ALTERNATE VALUES -->

        <!-- The following elements configure alternate values for some special case paths.
             The target path will only be replaced if both it, and the source path, exist in
             the CLDR data (paths will not be modified if only the source path exists).

             Since the paths must represent the same semantic type of data, they must be in the
             same "namespace" (same element names) and must not contain value attributes. Thus
             they can only differ by distinguishing attributes (either added or modified).

             This feature is typically used to select alternate translations (e.g. short forms)
             for certain paths. -->
        <!-- <altPath target="//path/to/value[@attr='foo']"
                      source="//path/to/value[@attr='bar']"
                      locales="xx,yy_ZZ"/> -->

        <!-- Android patch (b/36123938) begin. -->
        <altPath target="//ldml/localeDisplayNames/territories/territory[@type='FK']"
                 source="//ldml/localeDisplayNames/territories/territory[@type='FK'][@alt='variant']"/>
        <altPath target="//ldml/localeDisplayNames/territories/territory[@type='HK']"
                 source="//ldml/localeDisplayNames/territories/territory[@type='HK'][@alt='short']"/>
        <altPath target="//ldml/localeDisplayNames/territories/territory[@type='MK']"
                 source="//ldml/localeDisplayNames/territories/territory[@type='MK'][@alt='variant']"/>
        <altPath target="//ldml/localeDisplayNames/territories/territory[@type='MO']"
                 source="//ldml/localeDisplayNames/territories/territory[@type='MO'][@alt='short']"/>
        <!-- Android patch (b/36123938) end. -->

        <!-- Android patch (b/8264703) begin. -->
        <altPath target="//ldml/localeDisplayNames/territories/territory[@type='PS']"
                 source="//ldml/localeDisplayNames/territories/territory[@type='PS'][@alt='short']"/>
        <!-- Android patch (b/8264703) end. -->

        <!-- Android patch (b/21295835) begin. -->
        <altPath target="//ldml/numbers/currencies/currency[@type='UAH']/symbol"
                 source="//ldml/numbers/currencies/currency[@type='UAH']/symbol[@alt='variant']"/>
        <!-- Android patch (b/21295835) end. -->
    </convert>
</target>
|
|
|
|
<target depends="init-args, prepare-jar"
        name="clean">
    <!-- Register the custom <outputDirectories> task from the standalone JAR under target/. -->
    <taskdef classname="org.unicode.icu.tool.cldrtoicu.ant.CleanOutputDirectoryTask"
             name="outputDirectories">
        <classpath>
            <pathelement path="target/cldr-to-icu-1.0-SNAPSHOT-jar-with-dependencies.jar"/>
        </classpath>
    </taskdef>

    <!-- If a directory is listed here, then every file in it is assumed to be automatically
         generated by the conversion tool, unless it is explicitly listed in a <retain> element.
         The tool then checks every file to determine if it has the expected header present,
         indicating that it was automatically generated, before deleting it.

         If unexpected files are found, the "clean" task will fail without deleting anything
         (unless 'forceDelete' is set to override this). Note that even if 'forceDelete' is set,
         the files listed explicitly below will never be deleted by this process.

         This two-step approach minimizes the risk that the conversion process will ever
         accidentally delete a manually maintained file.
    -->
    <outputDirectories forceDelete="${forceDelete}"
                       root="${outDir}">
        <dir name="brkitr">
            <retain path="dictionaries"/>
            <retain path="rules"/>
        </dir>

        <dir name="coll">
            <!-- Legacy files whose file names aren't supported for automatic generation.
                 Simple to maintain manually and unlikely to ever change again. -->
            <retain path="de__PHONEBOOK.txt"/>
            <retain path="de_.txt"/>
            <retain path="es__TRADITIONAL.txt"/>
            <retain path="es_.txt"/>
        </dir>

        <dir name="curr"/>
        <dir name="lang"/>
        <dir name="locales"/>

        <dir name="misc">
            <!-- Machine generated files produced by different tools.
                 Possibly worth moving into the new LDML conversion tool one day. -->
            <retain path="currencyNumericCodes.txt"/>
            <retain path="zoneinfo64.txt"/>
            <!-- Project file (not ICU data), unlikely to ever be auto-generated. -->
            <retain path="icudata.rc"/>
            <!-- Small high-level metadata file, stable and easy to maintain manually. -->
            <retain path="icustd.txt"/>
        </dir>

        <dir name="rbnf"/>
        <dir name="region"/>

        <dir name="translit">
            <!-- Small, easy to maintain, special case top-level files. -->
            <retain path="en.txt"/>
            <retain path="el.txt"/>
        </dir>

        <dir name="unit"/>

        <dir name="zone">
            <!-- Manually edited to support TZ database name compatibility. -->
            <retain path="tzdbNames.txt"/>
        </dir>
    </outputDirectories>
</target>
|
|
</project>
|
|
|