#!/usr/bin/python # -*- coding: utf-8 -*- # # created on: 2013jun05 # created by: Markus W. Scherer """Converts CLDR collation files from XML syntax to ICU syntax. Handles the CLDR collation data in the post-CLDR 23 trunk in 2013 June. Preserves indentation (except where it joins lines) and text vs. NCR etc. Does not handle arbitrary LDML XML collation syntax.""" # Invoke with two arguments: # - the source folder path # - the destination folder path # For example: # ~/svn.cldr$ collicu/tools/scripts/coll2icu.py trunk/common/collation collicu/common/collation import codecs import glob import os.path import sys def GetIndent(s): for i in range(len(s)): if s[i] not in " \t": return s[:i] return s # substring replacements replacements = ( # White space and syntax characters must be quoted. # Using '\\u0020' rather than just ' ' for clarity. (" ", "&'\\u0020'"), # can't just replace all "> <" (">!<", ">'!'<"), ('>"<', ">'\\\"'<"), (">"<", ">'\\\"'<"), (">#<", ">'\\u0023'<"), (">$<", ">'$'<"), (">%<", ">'%'<"), (">&<", ">'&'<"), (">&<", ">'&'<"), (">'<", ">''<"), (">'<", ">''<"), (">(<", ">'('<"), (">)<", ">')'<"), (">*<", ">'*'<"), (">+<", ">'+'<"), (">,<", ">','<"), (">-<", ">'-'<"), (">.<", ">'.'<"), (">/<", ">'/'<"), (">:<", ">':'<"), (">;<", ">';'<"), ("><<", ">'<'<"), (">=<", ">'='<"), (">><", ">'>'<"), (">?<", ">'?'<"), (">@<", ">'@'<"), (">[<", ">'['<"), (">\\<", ">'\\\\'<"), (">]<", ">']'<"), (">^<", ">'^'<"), (">_<", ">'_'<"), (">`<", ">'`'<"), (">{<", ">'{'<"), (">|<", ">'|'<"), (">}<", ">'}'<"), (">~<", ">'~'<"), # ha.xml has the following ("'y", "''y"), ("'Y", "''Y"), # kl.xml has the following ("K'", "K''"), # not Pattern_White_Space, just obscure (u"\u00A0", u"\\u00A0"), (u"\u200C", u"\\u200C"), (u"\u200D", u"\\u200D"), (u"\u3000", u"\\u3000"), # obscure, and some tools do not handle noncharacters well (u"\uFDD0", u"'\\uFDD0'"), # The old ICU collation rule parser seems to need more escaping than it should. (u"≠", u"'≠'"), # fi.xml resets contain a space (u" ̵", u"'\\u0020'̵"), # fa.xml with non-NFD_Inert chars (u"\u0650\u064f\u064b\u064d\u064c", u"<<\u0650<<\u064f<<\u064b<<\u064d<<\u064c"), # ml.xml strings contain spaces (u" ", u"'\\u0020'"), (u" ", u"'\\u0020'"), # vi.xml with non-NFD_Inert chars (u"\u0309\u0303\u0301\u0323", u"<<\u0309<<\u0303<<\u0301<<\u0323"), # en_US_POSIX needs a lot of quoting. (" !"#$%&'()*+,-./", "<*'\\u0020'-'/'"), ("0123456789:;<=>?@", "<*0-'@'"), ("[\]^_`", "<*'['-'`'"), ("{|}~", "<*'{'-'\u007F'"), # CJK parenthesized resets ("(", "&'('"), (")", "')'"), # Convert XML elements into ICU syntax. (">", ""), ("-->", ""), ("", "&"), ('', "&[before 1]"), ('', "&[before 2]"), ('', "&[before 3]"), ("", ""), ("

", "<"), ("

", ""), ("~~", "<<"), ("~~", ""), ("", "<<<"), ("", ""), ("", "="), ("", ""), ("", "<*"), ("", ""), ("", "<<*"), ("", ""), ("", "<<<*"), ("", ""), ("", "=*"), ("", ""), ("", ""), ("", ""), ("", "/"), ("", ""), ("", "|"), ("", "[first tertiary ignorable]"), ("", "[last tertiary ignorable]"), ("", "[first secondary ignorable]"), ("", "[last secondary ignorable]"), ("", "[first primary ignorable]"), ("", "[last primary ignorable]"), ("", "[first variable]"), ("", "[last variable]"), ("", "[first regular]"), ("", "[last regular]"), ("", "[last regular]"), ("", "[first trailing]"), ("", "[last trailing]") ) def ConvertFile(src, dest): in_rules = False partial = "" in_ml_comment = False for line in src: if "" in line: indent = GetIndent(line) stripped = line.strip() # Replace import-only rules with import elements. if stripped == '': dest.write(indent + '\n') elif stripped == '': dest.write(indent + '\n') elif stripped == '': dest.write(indent + '\n') elif stripped == '': dest.write(indent + '\n') else: # Replace the XML section with ICU syntax rules in . assert stripped == "" dest.write(indent + "" in line: # Flush, and go back to just copying lines until the next . if partial: dest.write(partial + "\n") partial = "" in_ml_comment = False dest.write(GetIndent(line) + "]]>\n") in_rules = False else: if in_rules: # Find out whether we want to concatenate the current line # with the previous and/or next one. finish_partial = False # Finish collected, partial input. start_ml_comment = False # Start of a multi-line comment. stop_comment = False # End of a comment, must terminate the line. if (" 80: finish_partial = True if "" not in line: start_ml_comment = True if "-->" in line: assert line.rstrip().endswith("-->") stop_comment = True # Convert XML syntax to ICU syntax. if "" in line: # Swap context & relation: # カー # turns into # =カ|ー if "" in line: line = line.replace("", "").replace("", "") elif "" in line: line = line.replace("", "").replace("", "") for (xml, icu) in replacements: line = line.replace(xml, icu) while True: # Convert a Numeric Character Reference to \\uhhhh. i = line.find("&#x") if i < 0: break limit = line.find(";", i + 3) cp = line[i + 3:limit] while len(cp) < 4: cp = "0" + cp assert len(cp) == 4 # not handling supplementary code points line = line[:i] + "\\u" + cp + line[limit + 1:] # Start/continue/finish concatenation, and output. if partial and finish_partial: # Write collected input. dest.write(partial + "\n") partial = "" if start_ml_comment: # Start a multi-line comment. assert not partial comment_indent = GetIndent(line) # can be the empty string in_ml_comment = True elif in_ml_comment: # Continue a multi-line comment. assert not partial if line.startswith(comment_indent): if line[len(comment_indent)] in " \t": # Preserve further indentation. line = comment_indent + "#" + line[len(comment_indent):] else: # Add a space after the #. line = comment_indent + "# " + line[len(comment_indent):] else: # Indent at least as much as the first line. line = line.lstrip() if line: line = comment_indent + "# " + line else: line = comment_indent + "#\n" elif stop_comment: # Just output the line, do not start collecting input. # ICU-syntax comments end with the end of the line, # do not append rules to them. if partial: line = partial + line.lstrip() + "\n" partial = "" elif not partial: # Start collecting input. partial = line.rstrip() elif partial: # Continue collecting input. partial += line.strip() if stop_comment: in_ml_comment = False if not partial: dest.write(line) def main(): (src_root, dest_root) = sys.argv[1:3] src_pattern = os.path.join(src_root, "*.xml") for src_path in glob.iglob(src_pattern): basename = os.path.basename(src_path) dest_path = os.path.join(dest_root, basename) with codecs.open(src_path, "r", "UTF-8") as src: with codecs.open(dest_path, "w", "UTF-8") as dest: ConvertFile(src, dest) if __name__ == "__main__": main()