# v811_spc009/external/icu/icu4c/source/python/icutools/databuilder/filtration.py

# Copyright (C) 2018 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html

# Python 2/3 Compatibility (ICU-20299)
# TODO(ICU-20301): Remove this.
from __future__ import print_function

from abc import abstractmethod
from collections import defaultdict
import re
import sys

from . import *
from . import utils
from .request_types import *


# Note: for this to be a proper abstract class, it should extend abc.ABC.
# There is no nice way to do this that works in both Python 2 and 3.
# TODO(ICU-20301): Make this inherit from abc.ABC.
class Filter(object):
    """Base class for file filters.

    Subclasses implement match(), which decides whether a single file passes
    the filter. filter() applies the filter to a whole request.
    """

    @staticmethod
    def create_from_json(json_data, io):
        """Creates the Filter subclass selected by json_data["filterType"].

        The recognized filter types are "file-stem" (the default when no
        "filterType" key is present), "language", "regex", "exclude",
        "union", and "locale".

        Args:
            json_data: The JSON description of the filter.
            io: Passed through to the filters that need it (union and
                locale filters). Must not be None.

        Returns:
            The new filter, or None if the filter type is unknown (an error
            message is printed to stderr in that case).
        """
        assert io is not None
        if "filterType" in json_data:
            filter_type = json_data["filterType"]
        else:
            filter_type = "file-stem"

        if filter_type == "file-stem":
            return FileStemFilter(json_data)
        elif filter_type == "language":
            return LanguageFilter(json_data)
        elif filter_type == "regex":
            return RegexFilter(json_data)
        elif filter_type == "exclude":
            return ExclusionFilter()
        elif filter_type == "union":
            return UnionFilter(json_data, io)
        elif filter_type == "locale":
            return LocaleFilter(json_data, io)
        else:
            print("Error: Unknown filterType option: %s" % filter_type, file=sys.stderr)
            return None

    def filter(self, request):
        """Applies this filter to the request.

        Returns:
            An empty list if request.apply_file_filter(self) reports that
            the request should be dropped; otherwise a one-element list
            holding the request.
        """
        if not request.apply_file_filter(self):
            return []
        # Sanity check: every input file left in the request must match.
        for file in request.all_input_files():
            assert self.match(file)
        return [request]

    @staticmethod
    def _file_to_file_stem(file):
        """Returns the file name without its directory and final extension.

        For example, "locales/en_US.txt" gives "en_US". The extension is
        searched for only within the last path component, so a name without
        any "." is returned whole (instead of losing its last character)
        and a "." inside a directory name is ignored.
        """
        filename = file.filename
        basename = filename[filename.rfind("/")+1:]
        limit = basename.rfind(".")
        if limit == -1:
            # No extension: the whole base name is the stem.
            return basename
        return basename[:limit]

    @staticmethod
    def _file_to_subdir(file):
        """Returns everything before the last "/" of the file name.

        Returns None if the file name contains no "/".
        """
        limit = file.filename.rfind("/")
        if limit == -1:
            return None
        return file.filename[:limit]

    @abstractmethod
    def match(self, file):
        """Returns whether the given file passes this filter."""
        pass


class InclusionFilter(Filter):
    """Filter that matches every file."""
    def match(self, file):
        return True


class ExclusionFilter(Filter):
    """Filter that matches no file."""
    def match(self, file):
        return False


class IncludeExcludeFilter(Filter):
    """Base class for filters configured by an include list or exclude list.

    The constructor stores the list found in the JSON data; subclasses
    decide, in _should_include(), how a file stem is tested against it.
    """

    def __init__(self, json_data):
        # (JSON key, is it an include list?) in lookup priority order.
        # The first key present in json_data wins.
        recognized_keys = (
            ("whitelist", True),
            ("includelist", True),
            ("blacklist", False),
            ("excludelist", False),
        )
        for key, is_include in recognized_keys:
            if key not in json_data:
                continue
            self.is_includelist = is_include
            if is_include:
                self.includelist = json_data[key]
            else:
                self.excludelist = json_data[key]
            return
        raise AssertionError("Need either includelist or excludelist: %s" % str(json_data))

    def match(self, file):
        """Matches the file based on its file stem."""
        return self._should_include(self._file_to_file_stem(file))

    @abstractmethod
    def _should_include(self, file_stem):
        """Returns whether a file with the given stem should be included."""
        pass


class FileStemFilter(IncludeExcludeFilter):
    """Filter that looks up the whole file stem in the configured list."""

    def _should_include(self, file_stem):
        if not self.is_includelist:
            # Exclude-list mode: keep everything that is not listed.
            return file_stem not in self.excludelist
        return file_stem in self.includelist


class LanguageFilter(IncludeExcludeFilter):
    """Filter that looks up the part of the file stem before the first "_"."""

    def _should_include(self, file_stem):
        language = file_stem.partition("_")[0]
        # Always include root.txt
        if language == "root":
            return True
        if not self.is_includelist:
            return language not in self.excludelist
        return language in self.includelist


class RegexFilter(IncludeExcludeFilter):
    """Filter whose list entries are regular expressions.

    Patterns are applied with re.match(), i.e. anchored at the start of the
    file stem only.
    """

    def __init__(self, *args):
        # TODO(ICU-20301): Change this to: super().__init__(*args)
        super(RegexFilter, self).__init__(*args)
        # Compile the patterns once, replacing the raw strings in place.
        if self.is_includelist:
            self.includelist = list(map(re.compile, self.includelist))
        else:
            self.excludelist = list(map(re.compile, self.excludelist))

    def _should_include(self, file_stem):
        if self.is_includelist:
            # Included iff at least one pattern matches.
            return any(regex.match(file_stem) for regex in self.includelist)
        # Included iff no pattern matches.
        return not any(regex.match(file_stem) for regex in self.excludelist)


class UnionFilter(Filter):
    """Filter combining the sub-filters listed under the "unionOf" key."""

    def __init__(self, json_data, io):
        # Build one sub-filter per entry of "unionOf".
        self.sub_filters = [
            Filter.create_from_json(sub_json, io)
            for sub_json in json_data["unionOf"]
        ]

    def match(self, file):
        """Match iff any of the sub-filters match."""
        return any(sub_filter.match(file) for sub_filter in self.sub_filters)


# Matches a locale made of a 2-3 letter lowercase language code, an
# underscore, and a 4-letter titlecase script code (e.g. "sr_Latn").
# Group 1 captures the language code.
LANGUAGE_SCRIPT_REGEX = re.compile(r"^([a-z]{2,3})_[A-Z][a-z]{3}$")
# Matches a locale that is only a 2-3 letter lowercase language code.
LANGUAGE_ONLY_REGEX = re.compile(r"^[a-z]{2,3}$")

class LocaleFilter(Filter):
    """Filter that selects locale files based on a list of requested locales.

    A file matches if its locale is requested, is an ancestor of a requested
    locale, or is reached from a requested locale through the
    "includeChildren" / "includeScripts" options.
    """

    def __init__(self, json_data, io):
        # The requested locales come from "whitelist" or, failing that,
        # from "includelist".
        for key in ("whitelist", "includelist"):
            if key in json_data:
                self.locales_requested = list(json_data[key])
                break
        else:
            raise AssertionError("You must have an includelist in a locale filter")
        self.include_children = json_data.get("includeChildren", True)
        self.include_scripts = json_data.get("includeScripts", False)

        # Load the dependency graph from disk
        dependency_data = {}
        for tree in utils.ALL_TREES:
            dependency_data[tree] = io.read_locale_deps(tree)
        self.dependency_data_by_tree = dependency_data

    def match(self, file):
        """Returns whether the file's locale is selected by this filter.

        The file's subdirectory is used as the tree name and its file stem
        as the locale name.
        """
        tree = self._file_to_subdir(file)
        assert tree is not None
        locale = self._file_to_file_stem(file)

        # A locale is *required* if it is *requested* or an ancestor of a
        # *requested* locale. Otherwise, resolve include_scripts and
        # include_children.
        return (
            locale in self._locales_required(tree)
            or self._match_recursive(locale, tree)
        )

    def _match_recursive(self, locale, tree):
        """Walks up from locale looking for a *requested* locale."""
        # Base cases: we ascended out of the locale tree, or we reached a
        # *requested* locale.
        if locale is None:
            return False
        if locale in self.locales_requested:
            return True

        # Check for alternative scripts.
        # This causes sr_Latn to check sr instead of going directly to root.
        if self.include_scripts:
            script_match = LANGUAGE_SCRIPT_REGEX.match(locale)
            if script_match and self._match_recursive(script_match.group(1), tree):
                return True

        # Without include_children there is nothing left to try.
        if not self.include_children:
            return False

        # Check if we are a descendant of a *requested* locale.
        return self._match_recursive(self._get_parent_locale(locale, tree), tree)

    def _get_parent_locale(self, locale, tree):
        """Gets the parent locale in the given tree, according to dependency data."""
        dependency_data = self.dependency_data_by_tree[tree]
        # Explicit entries take precedence: first "parents", then "aliases".
        for section in ("parents", "aliases"):
            if section in dependency_data and locale in dependency_data[section]:
                return dependency_data[section][locale]
        if LANGUAGE_ONLY_REGEX.match(locale):
            return "root"
        # Otherwise truncate at the last underscore, if there is one.
        prefix, separator, _ = locale.rpartition("_")
        if not separator:
            assert locale == "root", "Invalid locale: %s/%s" % (tree, locale)
            return None
        return prefix

    def _locales_required(self, tree):
        """Returns a generator of all required locales in the given tree."""
        for requested in self.locales_requested:
            # Yield the requested locale followed by each of its ancestors.
            current = requested
            while current is not None:
                yield current
                current = self._get_parent_locale(current, tree)


def apply_filters(requests, config, io):
    """Runs the filters and returns a new list of requests.

    File filters are applied first; resource filters are then applied to
    the result.
    """
    file_filtered = _apply_file_filters(requests, config, io)
    return _apply_resource_filters(file_filtered, config, io)


def _apply_file_filters(old_requests, config, io):
    """Filters out entire files.

    Requests whose category has no filter are passed through unchanged;
    the others are replaced by whatever their category's filter returns.
    """
    filters = _preprocess_file_filters(old_requests, config, io)
    new_requests = []
    for request in old_requests:
        category = request.category
        if category not in filters:
            new_requests.append(request)
            continue
        new_requests.extend(filters[category].filter(request))
    return new_requests


def _preprocess_file_filters(requests, config, io):
    all_categories = set(
        request.category
        for request in requests
    )
    all_categories.remove(None)
    all_categories = list(sorted(all_categories))
    json_data = config.filters_json_data
    filters = {}
    default_filter_json = "exclude" if config.strategy == "additive" else "include"
    for category in all_categories:
        filter_json = default_filter_json
        # Figure out the correct filter to create
        if "featureFilters" in json_data and category in json_data["featureFilters"]:
            filter_json = json_data["featureFilters"][category]
        if filter_json == "include" and "localeFilter" in json_data and category.endswith("_tree"):
            filter_json = json_data["localeFilter"]
        # Resolve the filter JSON into a filter object
        if filter_json == "exclude":
            filters[category] = ExclusionFilter()
        elif filter_json == "include":
            pass  # no-op
        else:
            filters[category] = Filter.create_from_json(filter_json, io)
    if "featureFilters" in json_data:
        for category in json_data["featureFilters"]:
            if category not in all_categories:
                print("Warning: category %s is not known" % category, file=sys.stderr)
    return filters


class ResourceFilterInfo(object):
    """Collects the resource filter rules for one category.

    Tracks the input files of the category's genrb requests, the filter
    file generated for each of them, and the list of rules that goes into
    each filter file.
    """

    def __init__(self, category, strategy):
        self.category = category
        self.strategy = strategy
        self.filter_tmp_dir = "filters/%s" % category
        # The following three stay None until apply_to_requests() runs;
        # afterwards they are parallel lists.
        self.input_files = None
        self.filter_files = None
        self.rules_by_file = None

    def apply_to_requests(self, all_requests):
        """Hooks the filter files into the matching genrb requests.

        Each matching request gets the filter files as an extra dependency
        and a "--filterDir" argument prepended to its arguments.
        """
        # Call this method only once per list of requests.
        assert self.input_files is None
        for request in all_requests:
            # Only non-empty genrb execution requests of our own category
            # are of interest.
            if (request.category != self.category
                    or not isinstance(request, AbstractExecutionRequest)
                    or request.tool != IcuTool("genrb")
                    or not request.input_files):
                continue
            self._set_files(request.input_files)
            request.dep_targets += [list(self.filter_files)]
            arg_str = "--filterDir {TMP_DIR}/%s" % self.filter_tmp_dir
            request.args = "%s %s" % (arg_str, request.args)

        # Make sure we found the target request
        if self.input_files is not None:
            return
        print("WARNING: Category not found: %s" % self.category, file=sys.stderr)
        self.input_files = []
        self.filter_files = []
        self.rules_by_file = []

    def _set_files(self, files):
        """Records the input files and sets up their filter files and rules."""
        # Note: The input files to genrb for a certain category should always
        # be the same. For example, there are often two genrb calls: one for
        # --writePoolBundle, and the other for --usePoolBundle. They are both
        # expected to have the same list of input files.
        if self.input_files is not None:
            assert self.input_files == files
            return
        self.input_files = list(files)

        # One filter file per input file, named after the input's base name.
        filter_files = []
        for input_file in files:
            basename = input_file.filename.rsplit("/", 1)[-1]
            filter_files.append(TmpFile("%s/%s" % (self.filter_tmp_dir, basename)))
        self.filter_files = filter_files

        # Every file starts from the same rules, chosen by strategy.
        if self.strategy == "additive":
            initial_rules = [r"-/", r"+/%%ALIAS", r"+/%%Parent"]
        else:
            initial_rules = [r"+/"]
        # Each file gets its own copy, since add_rules() extends in place.
        self.rules_by_file = [list(initial_rules) for _ in files]

    def add_rules(self, file_filter, rules):
        """Appends rules to every input file that file_filter matches."""
        for input_file, file_rules in zip(self.input_files, self.rules_by_file):
            if file_filter.match(input_file):
                file_rules.extend(rules)

    def make_requests(self):
        """Returns the requests that generate the filter files.

        Filter files with identical rules are written once and then copied.
        """
        # Map from rule list to filter files with that rule list
        files_by_rules = defaultdict(list)
        for filter_file, file_rules in zip(self.filter_files, self.rules_by_file):
            files_by_rules[tuple(file_rules)].append(filter_file)

        # The numeric suffix of each request name is the number of requests
        # created before it.
        new_requests = []
        for rule_key, group in files_by_rules.items():
            primary_file = group[0]
            new_requests.append(PrintFileRequest(
                name = "%s_print_%d" % (self.category, len(new_requests)),
                output_file = primary_file,
                content = self._generate_resource_filter_txt(rule_key)
            ))
            for duplicate_file in group[1:]:
                new_requests.append(CopyRequest(
                    name = "%s_copy_%d" % (self.category, len(new_requests)),
                    input_file = primary_file,
                    output_file = duplicate_file
                ))
        return new_requests

    @staticmethod
    def _generate_resource_filter_txt(rules):
        """Returns the filter file text: a header, then one rule per line."""
        header = "# Caution: This file is automatically generated\n\n"
        return header + "\n".join(rules)


def _apply_resource_filters(all_requests, config, io):
    """Creates filters for looking within resource bundle files."""
    json_data = config.filters_json_data
    if "resourceFilters" not in json_data:
        return all_requests

    collected = {}
    for entry in json_data["resourceFilters"]:
        if "files" in entry:
            file_filter = Filter.create_from_json(entry["files"], io)
        else:
            file_filter = InclusionFilter()
        for category in entry["categories"]:
            # not defaultdict because we need to pass arguments to the constructor
            if category not in collected:
                filter_info = ResourceFilterInfo(category, config.strategy)
                filter_info.apply_to_requests(all_requests)
                collected[category] = filter_info
            else:
                filter_info = collected[category]
            filter_info.add_rules(file_filter, entry["rules"])

    # Add the filter generation requests to the beginning so that by default
    # they are made before genrb gets run (order is required by windirect)
    new_requests = []
    for filter_info in collected.values():
        new_requests += filter_info.make_requests()
    new_requests += all_requests
    return new_requests