You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
180 lines
4.4 KiB
180 lines
4.4 KiB
#!/usr/bin/env python
|
|
|
|
# Copyright 2020 The Amber Authors. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""
|
|
Script to check files for inclusive language. The script will scan all files
|
|
and flag non-inclusive terminology which is identified.
|
|
|
|
Usage: run the script from a folder and it will scan that folder and
everything below it.
|
|
"""
|
|
|
|
import fnmatch
|
|
import os
|
|
import re
|
|
import sys
|
|
|
|
# Patterns that flag non-inclusive terminology. Every pattern carries the
# inline (?i) flag, so matching is case-insensitive.
#
# The bare pronoun "he" and its contractions are only matched when delimited
# by whitespace or by the start/end of the scanned text (\s, ^, $), unlike the
# other pronouns, which use \b word boundaries.
#
# In the contraction patterns the apostrophe class is ['\u2019]: a plain ASCII
# apostrophe or U+2019 RIGHT SINGLE QUOTATION MARK. A '|' must not appear
# inside the brackets -- within a character class it is a literal pipe, not
# alternation, and would make text such as "he|s" count as a hit.
REGEXES = [
    r"(?i)black[-_]?list",
    r"(?i)white[-_]?list",
    r"(?i)gr[ea]y[-_]?list",
    r"(?i)(first class citizen)",
    r"(?i)black[-_]?hat",
    r"(?i)white[-_]?hat",
    r"(?i)gr[ea]y[-_]?hat",
    r"(?i)master",
    r"(?i)slave",
    r"(?i)\bhim\b",
    r"(?i)\bhis\b",
    r"(?i)\bshe\b",
    r"(?i)\bher\b",
    r"(?i)\bhers\b",
    r"(?i)\bman\b",
    r"(?i)\bwoman\b",
    r"(?i)\she\s",
    r"(?i)\she$",
    r"(?i)^he\s",
    r"(?i)^he$",
    r"(?i)\she['\u2019]d\s",
    r"(?i)\she['\u2019]d$",
    r"(?i)^he['\u2019]d\s",
    r"(?i)^he['\u2019]d$",
    r"(?i)\she['\u2019]s\s",
    r"(?i)\she['\u2019]s$",
    r"(?i)^he['\u2019]s\s",
    r"(?i)^he['\u2019]s$",
    r"(?i)\she['\u2019]ll\s",
    r"(?i)\she['\u2019]ll$",
    r"(?i)^he['\u2019]ll\s",
    r"(?i)^he['\u2019]ll$",
    r"(?i)grandfather",
    r"(?i)\bmitm\b",
    r"(?i)\bcrazy\b",
    r"(?i)\binsane\b",
    r"(?i)\bblind\sto\b",
    r"(?i)\bflying\sblind\b",
    r"(?i)\bblind\seye\b",
    r"(?i)\bcripple\b",
    r"(?i)\bcrippled\b",
    r"(?i)\bdumb\b",
    r"(?i)\bdummy\b",
    r"(?i)\bparanoid\b",
    r"(?i)\bsane\b",
    r"(?i)\bsanity\b",
    r"(?i)red[-_]?line",
]
|
|
|
|
# Patterns that cancel a hit from REGEXES when they match at (or just before)
# the position of that hit.
#
# In the "man page" pattern the separator class is written [ _-] with the dash
# last so that it is a literal dash. Written as [ -_] it would be a *range*
# from space to underscore, which covers digits, most punctuation and (under
# the case-insensitive flag) every letter.
SUPPRESSIONS = [
    r"(?i)MS_SLAVE",
    r"(?i)man[ _-]?page",
]
|
|
|
|
|
|
# Compile each flag pattern once at import time so that scanning many files
# does not repeatedly go through the re module's pattern lookup.
REGEX_LIST = [re.compile(pattern) for pattern in REGEXES]
|
|
|
|
# Compiled form of SUPPRESSIONS, built once at import time.
SUPPRESSION_LIST = [re.compile(pattern) for pattern in SUPPRESSIONS]
|
|
|
|
def find(top, filename_glob, skip_glob_list):
    """Collect the paths of files under `top` whose names match a glob.

    Directories whose names match any pattern in `skip_glob_list` are pruned
    from the walk, so nothing beneath them is visited. A file whose name is
    identical to this script's own file name is never returned.

    Returns a list of paths, each formed by joining the walked directory with
    the matching file name.
    """
    own_name = os.path.basename(__file__)
    collected = []

    for dirpath, dirnames, filenames in os.walk(top):
        # os.walk only descends into entries still present in `dirnames`, so
        # the list has to be edited in place (slice assignment) to prune.
        dirnames[:] = [
            name for name in dirnames
            if not any(fnmatch.fnmatch(name, pattern)
                       for pattern in skip_glob_list)
        ]

        collected.extend(
            os.path.join(dirpath, name)
            for name in fnmatch.filter(filenames, filename_glob)
            if name != own_name
        )

    return collected
|
|
|
|
|
|
def filtered_descendants(glob):
    """Return files below the current directory whose names match `glob`.

    The search starts at '.' and does not descend into directories whose
    names match one of the skip patterns listed here.
    """
    skipped_dir_globs = [
        'third_party',
        'external',
        'build*',
        'out*',
        'CompilerIdCXX',
        '.git',
    ]
    return find('.', glob, skipped_dir_globs)
|
|
|
|
def _is_suppressed(contents, start, suppressions):
    """Return True if the hit starting at `start` is covered by a suppression.

    A suppression covers a hit when it matches at the hit position itself or
    three characters before it.
    """
    for supp in suppressions:
        # Pattern.match(string, pos) anchors the match at `pos` without
        # copying the remainder of `contents`, unlike slicing.
        if supp.match(contents, start):
            return True

        # This is a hack to handle the MS_ prefix that is needed
        # to check for. Find a better way if we get more suppressions
        # which modify the prefix of the string
        if start >= 3 and supp.match(contents, start - 3):
            return True

    return False


def check_match(filename, contents, regexes=None, suppressions=None):
    """Check if contents contains any matching entries.

    For each flag pattern, the first occurrence in `contents` that is not
    covered by a suppression is printed together with `filename`. Later
    occurrences are examined when earlier ones are suppressed, so a
    suppressed hit (for example "man page") cannot hide a genuine one
    further down the same text.

    Args:
        filename: Name used in the printed report line.
        contents: Full text to scan.
        regexes: Optional iterable of compiled flag patterns; defaults to
            the module-level REGEX_LIST.
        suppressions: Optional iterable of compiled suppression patterns;
            defaults to the module-level SUPPRESSION_LIST.

    Returns:
        True if at least one unsuppressed hit was printed, otherwise False.
    """
    if regexes is None:
        regexes = REGEX_LIST
    if suppressions is None:
        suppressions = SUPPRESSION_LIST

    ret = False
    for reg in regexes:
        for match in reg.finditer(contents):
            if _is_suppressed(contents, match.start(), suppressions):
                continue

            # No matching suppression. Report once per pattern and move on.
            print("{}: found non-inclusive language: {}".format(
                filename, match.group(0)))
            ret = True
            break

    return ret
|
|
|
|
|
|
def alert_if_lang_matches(glob, verbose=False):
    """Prints names of all files matching non-inclusive language.

    Finds all glob-matching files under the current directory and checks if
    they contain the language pattern. The reporting itself is done by
    check_match, which prints a line for each flagged term it finds.

    Args:
        glob: File-name glob selecting which files to scan.
        verbose: When True, print a "skipping ..." line for every file that
            could not be read as UTF-8 text.

    Returns:
        The number of files in which at least one match was reported.
    """
    printed_count = 0
    for path in filtered_descendants(glob):
        try:
            with open(path, 'r', encoding='utf8') as handle:
                contents = handle.read()
        except (OSError, UnicodeError):
            # Best effort: files that cannot be opened or are not valid UTF-8
            # are skipped. Only these errors are caught, so Ctrl-C and
            # programming errors still surface.
            if verbose:
                print("skipping {}".format(path))
            continue

        if check_match(path, contents):
            printed_count += 1

    return printed_count
|
|
|
|
|
|
def main():
    """Scan all files below the current directory and exit with the result.

    The process exit status is 1 when at least one file contained flagged
    language and 0 otherwise.
    """
    patterns = ['*']
    total = sum(alert_if_lang_matches(pattern) for pattern in patterns)

    found_any = total > 0
    sys.exit(found_any)


if __name__ == '__main__':
    main()
|