You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
279 lines
9.1 KiB
279 lines
9.1 KiB
#!/usr/bin/python
|
|
# -*- coding: utf-8 -*-
|
|
#
|
|
# created on: 2013jun05
|
|
# created by: Markus W. Scherer
|
|
|
|
"""Converts CLDR collation files from XML syntax to ICU syntax.
|
|
|
|
Handles the CLDR collation data in the post-CLDR 23 trunk in 2013 June.
|
|
Preserves indentation (except where it joins lines) and text vs. NCR etc.
|
|
Does not handle arbitrary LDML XML collation syntax."""
|
|
|
|
# Invoke with two arguments:
|
|
# - the source folder path
|
|
# - the destination folder path
|
|
# For example:
|
|
# ~/svn.cldr$ collicu/tools/scripts/coll2icu.py trunk/common/collation collicu/common/collation
|
|
|
|
import codecs
|
|
import glob
|
|
import os.path
|
|
import sys
|
|
|
|
def GetIndent(s):
|
|
for i in range(len(s)):
|
|
if s[i] not in " \t": return s[:i]
|
|
return s
|
|
|
|
|
|
# substring replacements
|
|
replacements = (
|
|
# White space and syntax characters must be quoted.
|
|
# Using '\\u0020' rather than just ' ' for clarity.
|
|
("<reset> </reset>", "&'\\u0020'"), # can't just replace all "> <"
|
|
(">!<", ">'!'<"),
|
|
('>"<', ">'\\\"'<"),
|
|
(">"<", ">'\\\"'<"),
|
|
(">#<", ">'\\u0023'<"),
|
|
(">$<", ">'$'<"),
|
|
(">%<", ">'%'<"),
|
|
(">&<", ">'&'<"),
|
|
(">&<", ">'&'<"),
|
|
(">'<", ">''<"),
|
|
(">'<", ">''<"),
|
|
(">(<", ">'('<"),
|
|
(">)<", ">')'<"),
|
|
(">*<", ">'*'<"),
|
|
(">+<", ">'+'<"),
|
|
(">,<", ">','<"),
|
|
(">-<", ">'-'<"),
|
|
(">.<", ">'.'<"),
|
|
(">/<", ">'/'<"),
|
|
(">:<", ">':'<"),
|
|
(">;<", ">';'<"),
|
|
("><<", ">'<'<"),
|
|
(">=<", ">'='<"),
|
|
(">><", ">'>'<"),
|
|
(">?<", ">'?'<"),
|
|
(">@<", ">'@'<"),
|
|
(">[<", ">'['<"),
|
|
(">\\<", ">'\\\\'<"),
|
|
(">]<", ">']'<"),
|
|
(">^<", ">'^'<"),
|
|
(">_<", ">'_'<"),
|
|
(">`<", ">'`'<"),
|
|
(">{<", ">'{'<"),
|
|
(">|<", ">'|'<"),
|
|
(">}<", ">'}'<"),
|
|
(">~<", ">'~'<"),
|
|
# ha.xml has the following
|
|
("'y", "''y"),
|
|
("'Y", "''Y"),
|
|
# kl.xml has the following
|
|
("K'", "K''"),
|
|
# not Pattern_White_Space, just obscure
|
|
(u"\u00A0", u"\\u00A0"),
|
|
(u"\u200C", u"\\u200C"),
|
|
(u"\u200D", u"\\u200D"),
|
|
(u"\u3000", u"\\u3000"),
|
|
# obscure, and some tools do not handle noncharacters well
|
|
(u"\uFDD0", u"'\\uFDD0'"),
|
|
# The old ICU collation rule parser seems to need more escaping than it should.
|
|
(u"≠", u"'≠'"),
|
|
# fi.xml resets contain a space
|
|
(u" ̵</reset>", u"'\\u0020'̵"),
|
|
# fa.xml <sc> with non-NFD_Inert chars
|
|
(u"<sc>\u0650\u064f\u064b\u064d\u064c</sc>", u"<<\u0650<<\u064f<<\u064b<<\u064d<<\u064c"),
|
|
# ml.xml strings contain spaces
|
|
(u" </s>", u"'\\u0020'"),
|
|
(u" </reset>", u"'\\u0020'"),
|
|
# vi.xml <sc> with non-NFD_Inert chars
|
|
(u"<sc>\u0309\u0303\u0301\u0323</sc>", u"<<\u0309<<\u0303<<\u0301<<\u0323"),
|
|
# en_US_POSIX needs a lot of quoting.
|
|
("<pc> !"#$%&'()*+,-./</pc>", "<*'\\u0020'-'/'"),
|
|
("<pc>0123456789:;<=>?@</pc>", "<*0-'@'"),
|
|
("<pc>[\]^_`</pc>", "<*'['-'`'"),
|
|
("<pc>{|}~</pc>", "<*'{'-'\u007F'"),
|
|
# CJK parenthesized resets
|
|
("<reset>(", "&'('"),
|
|
(")</reset>", "')'"),
|
|
# Convert XML elements into ICU syntax.
|
|
("><!--", "> #"), # add a space before an inline comment
|
|
("<!--", "#"),
|
|
(" -->", ""),
|
|
("-->", ""),
|
|
("<reset>", "&"),
|
|
('<reset before="primary">', "&[before 1]"),
|
|
('<reset before="secondary">', "&[before 2]"),
|
|
('<reset before="tertiary">', "&[before 3]"),
|
|
("</reset>", ""),
|
|
("<p>", "<"),
|
|
("</p>", ""),
|
|
("<s>", "<<"),
|
|
("</s>", ""),
|
|
("<t>", "<<<"),
|
|
("</t>", ""),
|
|
("<i>", "="),
|
|
("</i>", ""),
|
|
("<pc>", "<*"),
|
|
("</pc>", ""),
|
|
("<sc>", "<<*"),
|
|
("</sc>", ""),
|
|
("<tc>", "<<<*"),
|
|
("</tc>", ""),
|
|
("<ic>", "=*"),
|
|
("</ic>", ""),
|
|
("<x>", ""),
|
|
("</x>", ""),
|
|
("<extend>", "/"),
|
|
("</extend>", ""),
|
|
("</context>", "|"),
|
|
("<first_tertiary_ignorable/>", "[first tertiary ignorable]"),
|
|
("<last_tertiary_ignorable/>", "[last tertiary ignorable]"),
|
|
("<first_secondary_ignorable/>", "[first secondary ignorable]"),
|
|
("<last_secondary_ignorable/>", "[last secondary ignorable]"),
|
|
("<first_primary_ignorable/>", "[first primary ignorable]"),
|
|
("<last_primary_ignorable/>", "[last primary ignorable]"),
|
|
("<first_variable/>", "[first variable]"),
|
|
("<last_variable/>", "[last variable]"),
|
|
("<first_non_ignorable/>", "[first regular]"),
|
|
("<last_non_ignorable/>", "[last regular]"),
|
|
("<last_non_ignorable />", "[last regular]"),
|
|
("<first_trailing/>", "[first trailing]"),
|
|
("<last_trailing/>", "[last trailing]")
|
|
)
|
|
|
|
|
|
def ConvertFile(src, dest):
|
|
in_rules = False
|
|
partial = ""
|
|
in_ml_comment = False
|
|
for line in src:
|
|
if "<rules>" in line:
|
|
indent = GetIndent(line)
|
|
stripped = line.strip()
|
|
# Replace import-only rules with import elements.
|
|
if stripped == '<rules><import source="sr"/></rules>':
|
|
dest.write(indent + '<import source="sr"/>\n')
|
|
elif stripped == '<rules><import source="hr" type="search"/></rules>':
|
|
dest.write(indent + '<import source="hr" type="search"/>\n')
|
|
elif stripped == '<rules><import source="hr"/></rules>':
|
|
dest.write(indent + '<import source="hr"/>\n')
|
|
elif stripped == '<rules><import source="ps"/></rules>':
|
|
dest.write(indent + '<import source="ps"/>\n')
|
|
else:
|
|
# Replace the XML <rules> section with ICU syntax rules in <cr>.
|
|
assert stripped == "<rules>"
|
|
dest.write(indent + "<cr><![CDATA[\n")
|
|
in_rules = True
|
|
elif "</rules>" in line:
|
|
# Flush, and go back to just copying lines until the next <rules>.
|
|
if partial:
|
|
dest.write(partial + "\n")
|
|
partial = ""
|
|
in_ml_comment = False
|
|
dest.write(GetIndent(line) + "]]></cr>\n")
|
|
in_rules = False
|
|
else:
|
|
if in_rules:
|
|
# Find out whether we want to concatenate the current line
|
|
# with the previous and/or next one.
|
|
finish_partial = False # Finish collected, partial input.
|
|
start_ml_comment = False # Start of a multi-line comment.
|
|
stop_comment = False # End of a comment, must terminate the line.
|
|
if ("<reset" in line) or line.lstrip().startswith("<!--"):
|
|
finish_partial = True
|
|
if partial and len(partial.strip()) > 80:
|
|
finish_partial = True
|
|
if "<!--" in line and "-->" not in line:
|
|
start_ml_comment = True
|
|
if "-->" in line:
|
|
assert line.rstrip().endswith("-->")
|
|
stop_comment = True
|
|
|
|
# Convert XML syntax to ICU syntax.
|
|
if "<context>" in line:
|
|
# Swap context & relation:
|
|
# <x><context>カ</context><i>ー</i></x>
|
|
# turns into
|
|
# =カ|ー
|
|
if "<i>" in line:
|
|
line = line.replace("<i>", "").replace("<context>", "<i>")
|
|
elif "<t>" in line:
|
|
line = line.replace("<t>", "").replace("<context>", "<t>")
|
|
|
|
for (xml, icu) in replacements:
|
|
line = line.replace(xml, icu)
|
|
|
|
while True:
|
|
# Convert a Numeric Character Reference to \\uhhhh.
|
|
i = line.find("&#x")
|
|
if i < 0: break
|
|
limit = line.find(";", i + 3)
|
|
cp = line[i + 3:limit]
|
|
while len(cp) < 4: cp = "0" + cp
|
|
assert len(cp) == 4 # not handling supplementary code points
|
|
line = line[:i] + "\\u" + cp + line[limit + 1:]
|
|
|
|
# Start/continue/finish concatenation, and output.
|
|
if partial and finish_partial:
|
|
# Write collected input.
|
|
dest.write(partial + "\n")
|
|
partial = ""
|
|
|
|
if start_ml_comment:
|
|
# Start a multi-line comment.
|
|
assert not partial
|
|
comment_indent = GetIndent(line) # can be the empty string
|
|
in_ml_comment = True
|
|
elif in_ml_comment:
|
|
# Continue a multi-line comment.
|
|
assert not partial
|
|
if line.startswith(comment_indent):
|
|
if line[len(comment_indent)] in " \t":
|
|
# Preserve further indentation.
|
|
line = comment_indent + "#" + line[len(comment_indent):]
|
|
else:
|
|
# Add a space after the #.
|
|
line = comment_indent + "# " + line[len(comment_indent):]
|
|
else:
|
|
# Indent at least as much as the first line.
|
|
line = line.lstrip()
|
|
if line:
|
|
line = comment_indent + "# " + line
|
|
else:
|
|
line = comment_indent + "#\n"
|
|
elif stop_comment:
|
|
# Just output the line, do not start collecting input.
|
|
# ICU-syntax comments end with the end of the line,
|
|
# do not append rules to them.
|
|
if partial:
|
|
line = partial + line.lstrip() + "\n"
|
|
partial = ""
|
|
elif not partial:
|
|
# Start collecting input.
|
|
partial = line.rstrip()
|
|
elif partial:
|
|
# Continue collecting input.
|
|
partial += line.strip()
|
|
|
|
if stop_comment:
|
|
in_ml_comment = False
|
|
if not partial: dest.write(line)
|
|
|
|
|
|
def main():
|
|
(src_root, dest_root) = sys.argv[1:3]
|
|
src_pattern = os.path.join(src_root, "*.xml")
|
|
for src_path in glob.iglob(src_pattern):
|
|
basename = os.path.basename(src_path)
|
|
dest_path = os.path.join(dest_root, basename)
|
|
with codecs.open(src_path, "r", "UTF-8") as src:
|
|
with codecs.open(dest_path, "w", "UTF-8") as dest:
|
|
ConvertFile(src, dest)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|