#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# created on: 2013jun05
# created by: Markus W. Scherer
"""Converts CLDR collation files from XML syntax to ICU syntax.
Handles the CLDR collation data in the post-CLDR 23 trunk in 2013 June.
Preserves indentation (except where it joins lines) and text vs. NCR etc.
Does not handle arbitrary LDML XML collation syntax."""
# Invoke with two arguments:
# - the source folder path
# - the destination folder path
# For example:
# ~/svn.cldr$ collicu/tools/scripts/coll2icu.py trunk/common/collation collicu/common/collation
import codecs
import glob
import os.path
import sys
def GetIndent(s):
    """Returns the leading indentation of s.

    Only space and tab characters count as indentation, so a line
    terminator is never part of the result. If s is empty or consists
    solely of spaces and tabs, all of s is returned.
    """
    # The indentation is whatever lstrip() would remove from the front.
    return s[:len(s) - len(s.lstrip(" \t"))]
# Substring replacements.
# ConvertFile() walks this table in order for every rules line and calls
# line.replace(xml, icu) for each (xml, icu) pair, so earlier entries can
# affect what later entries match.
# NOTE(review): many search strings in this copy of the table look damaged:
# several are empty (""), several pairs appear twice, and a few literals no
# longer parse. Presumably XML tags/entities were lost from them.
# TODO: confirm against the upstream script before relying on this table.
replacements = (
# White space and syntax characters must be quoted.
# Using '\\u0020' rather than just ' ' for clarity.
(" ", "&'\\u0020'"), # can't just replace all "> <"
(">!<", ">'!'<"),
('>"<', ">'\\\"'<"),
(">"<", ">'\\\"'<"),
(">#<", ">'\\u0023'<"),
(">$<", ">'$'<"),
(">%<", ">'%'<"),
(">&<", ">'&'<"),
(">&<", ">'&'<"),
(">'<", ">''<"),
(">'<", ">''<"),
(">(<", ">'('<"),
(">)<", ">')'<"),
(">*<", ">'*'<"),
(">+<", ">'+'<"),
(">,<", ">','<"),
(">-<", ">'-'<"),
(">.<", ">'.'<"),
(">/<", ">'/'<"),
(">:<", ">':'<"),
(">;<", ">';'<"),
("><<", ">'<'<"),
(">=<", ">'='<"),
(">><", ">'>'<"),
(">?<", ">'?'<"),
(">@<", ">'@'<"),
(">[<", ">'['<"),
(">\\<", ">'\\\\'<"),
(">]<", ">']'<"),
(">^<", ">'^'<"),
(">_<", ">'_'<"),
(">`<", ">'`'<"),
(">{<", ">'{'<"),
(">|<", ">'|'<"),
(">}<", ">'}'<"),
(">~<", ">'~'<"),
# ha.xml has the following
("'y", "''y"),
("'Y", "''Y"),
# kl.xml has the following
("K'", "K''"),
# not Pattern_White_Space, just obscure
(u"\u00A0", u"\\u00A0"),
(u"\u200C", u"\\u200C"),
(u"\u200D", u"\\u200D"),
(u"\u3000", u"\\u3000"),
# obscure, and some tools do not handle noncharacters well
(u"\uFDD0", u"'\\uFDD0'"),
# The old ICU collation rule parser seems to need more escaping than it should.
(u"≠", u"'≠'"),
# fi.xml resets contain a space
(u" ̵", u"'\\u0020'̵"),
# fa.xml with non-NFD_Inert chars
(u"\u0650\u064f\u064b\u064d\u064c", u"<<\u0650<<\u064f<<\u064b<<\u064d<<\u064c"),
# ml.xml strings contain spaces
(u" ", u"'\\u0020'"),
(u" ", u"'\\u0020'"),
# vi.xml with non-NFD_Inert chars
(u"\u0309\u0303\u0301\u0323", u"<<\u0309<<\u0303<<\u0301<<\u0323"),
# en_US_POSIX needs a lot of quoting.
(" !"#$%&'()*+,-./", "<*'\\u0020'-'/'"),
("0123456789:;<=>?@", "<*0-'@'"),
("[\]^_`", "<*'['-'`'"),
("{|}~", "<*'{'-'\u007F'"),
# CJK parenthesized resets
("(", "&'('"),
(")", "')'"),
# Convert XML elements into ICU syntax.
(">", ""),
("-->", ""),
("", "&"),
('', "&[before 1]"),
('', "&[before 2]"),
('', "&[before 3]"),
("", ""),
("", "<"),
("
", ""),
("", "<<"),
("", ""),
("", "<<<"),
("", ""),
("", "="),
("", ""),
("", "<*"),
("", ""),
("", "<<*"),
("", ""),
("", "<<<*"),
("", ""),
("", "=*"),
("", ""),
("", ""),
("", ""),
("", "/"),
("", ""),
("", "|"),
("", "[first tertiary ignorable]"),
("", "[last tertiary ignorable]"),
("", "[first secondary ignorable]"),
("", "[last secondary ignorable]"),
("", "[first primary ignorable]"),
("", "[last primary ignorable]"),
("", "[first variable]"),
("", "[last variable]"),
("", "[first regular]"),
("", "[last regular]"),
("", "[last regular]"),
("", "[first trailing]"),
("", "[last trailing]")
)
def ConvertFile(src, dest):
# Reads src line by line and writes the converted text to dest.
# src: an iterable of text lines; dest: an object with a write() method.
# While in_rules is set, each line is rewritten from XML syntax to ICU rule
# syntax (via the replacements table and the NCR loop below) and may be
# joined with neighboring lines; other lines reach the final dest.write(line)
# unchanged.
# NOTE(review): this copy has lost its indentation, and several string
# literals/comments below appear to have lost angle-bracketed XML markup
# (empty "" comparisons, statements fused mid-line). TODO: confirm against
# the upstream script before editing the logic.
#
# State carried across lines:
# in_rules: True while inside a section that is being converted.
# partial: rule text collected so far that has not been written yet.
# in_ml_comment: True while inside a multi-line comment.
in_rules = False
partial = ""
in_ml_comment = False
for line in src:
if "" in line:
indent = GetIndent(line)
stripped = line.strip()
# Replace import-only rules with import elements.
if stripped == '':
dest.write(indent + '\n')
elif stripped == '':
dest.write(indent + '\n')
elif stripped == '':
dest.write(indent + '\n')
elif stripped == '':
dest.write(indent + '\n')
else:
# Replace the XML section with ICU syntax rules in .
assert stripped == ""
dest.write(indent + "" in line:
# Flush, and go back to just copying lines until the next .
if partial:
dest.write(partial + "\n")
partial = ""
in_ml_comment = False
dest.write(GetIndent(line) + "]]>\n")
in_rules = False
else:
if in_rules:
# Find out whether we want to concatenate the current line
# with the previous and/or next one.
finish_partial = False # Finish collected, partial input.
start_ml_comment = False # Start of a multi-line comment.
stop_comment = False # End of a comment, must terminate the line.
if (" 80:
finish_partial = True
if "" not in line:
start_ml_comment = True
if "-->" in line:
assert line.rstrip().endswith("-->")
stop_comment = True
# Convert XML syntax to ICU syntax.
if "" in line:
# Swap context & relation:
# カー
# turns into
# =カ|ー
if "" in line:
line = line.replace("", "").replace("", "")
elif "" in line:
line = line.replace("", "").replace("", "")
# Apply every (xml, icu) substring replacement, in table order.
for (xml, icu) in replacements:
line = line.replace(xml, icu)
while True:
# Convert a Numeric Character Reference to \\uhhhh.
i = line.find("")
if i < 0: break
# The digits run from three characters past the match to the next ";".
limit = line.find(";", i + 3)
cp = line[i + 3:limit]
# Left-pad with zeros to the four digits that \u requires.
while len(cp) < 4: cp = "0" + cp
assert len(cp) == 4 # not handling supplementary code points
line = line[:i] + "\\u" + cp + line[limit + 1:]
# Start/continue/finish concatenation, and output.
if partial and finish_partial:
# Write collected input.
dest.write(partial + "\n")
partial = ""
if start_ml_comment:
# Start a multi-line comment.
assert not partial
# Remember the first line's indentation for the continuation lines.
comment_indent = GetIndent(line) # can be the empty string
in_ml_comment = True
elif in_ml_comment:
# Continue a multi-line comment.
assert not partial
if line.startswith(comment_indent):
if line[len(comment_indent)] in " \t":
# Preserve further indentation.
line = comment_indent + "#" + line[len(comment_indent):]
else:
# Add a space after the #.
line = comment_indent + "# " + line[len(comment_indent):]
else:
# Indent at least as much as the first line.
line = line.lstrip()
if line:
line = comment_indent + "# " + line
else:
line = comment_indent + "#\n"
elif stop_comment:
# Just output the line, do not start collecting input.
# ICU-syntax comments end with the end of the line,
# do not append rules to them.
if partial:
line = partial + line.lstrip() + "\n"
partial = ""
elif not partial:
# Start collecting input.
partial = line.rstrip()
elif partial:
# Continue collecting input.
partial += line.strip()
if stop_comment:
in_ml_comment = False
# Lines held back in partial are written later, when they are flushed.
if not partial: dest.write(line)
def main():
    """Converts every *.xml file in the source folder into the destination folder.

    Reads the source and destination folder paths from sys.argv[1] and
    sys.argv[2]; any further arguments are ignored. Each source file is
    opened as UTF-8, passed through ConvertFile(), and written as UTF-8 to
    a file with the same base name in the destination folder.

    Exits with a usage message if fewer than two paths are given.
    """
    if len(sys.argv) < 3:
        # Unpacking a too-short slice would only raise an opaque ValueError.
        prog = sys.argv[0] if sys.argv else "coll2icu.py"
        sys.exit("usage: %s <source folder> <destination folder>" % prog)
    src_root, dest_root = sys.argv[1:3]
    for src_path in glob.iglob(os.path.join(src_root, "*.xml")):
        dest_path = os.path.join(dest_root, os.path.basename(src_path))
        with codecs.open(src_path, "r", "UTF-8") as src:
            with codecs.open(dest_path, "w", "UTF-8") as dest:
                ConvertFile(src, dest)


if __name__ == "__main__":
    main()