You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
75 lines
2.0 KiB
75 lines
2.0 KiB
import itertools
|
|
import os
|
|
import re
|
|
import sys
|
|
|
|
|
|
def get_characters():
    """Yield each Unicode character that may appear in a Python
    `identifier`_ but that the regex ``\\w`` class does not match.

    Every code point from 0 through :data:`sys.maxunicode` is tried, in
    ascending order, so the characters come out sorted and unique.

    Only the "continue" position is tested (the candidate is appended to
    ``"a"``), because every valid start character is also a valid continue
    character.

    ``\\w`` also matches some characters that are *not* valid in
    identifiers; those are not handled here and are left for
    :meth:`str.isidentifier` to reject later during lexing.

    .. _identifier: https://docs.python.org/3/reference/lexical_analysis.html#identifiers
    """
    # Compile once instead of going through the module-level cache for each
    # of the ~1.1 million code points.
    word_char = re.compile(r"\w")

    for char in map(chr, range(sys.maxunicode + 1)):
        if word_char.match(char):
            # Already covered by ``\w``; nothing to add to the pattern.
            continue

        if ("a" + char).isidentifier():
            yield char
|
|
|
|
|
|
def collapse_ranges(data):
    """Collapse an iterable of characters into runs of consecutive code
    points.

    ``data`` is expected to be sorted and free of duplicates. For every
    maximal run in which each character's code point is exactly one more
    than the previous character's, a ``(first, last)`` tuple is yielded.
    A character with no neighbours yields ``(char, char)``. Empty input
    yields nothing.
    """
    run_start = run_end = None
    # Code point that would extend the current run; ``None`` until the
    # first character has been seen.
    expected = None

    for char in data:
        code = ord(char)

        if code != expected:
            # The run is broken (or this is the very first character):
            # flush the finished run, if any, and start a new one.
            if expected is not None:
                yield run_start, run_end

            run_start = char

        run_end = char
        expected = code + 1

    # Flush the trailing run.
    if expected is not None:
        yield run_start, run_end
|
|
|
|
|
|
def build_pattern(ranges):
    """Render ``(first, last)`` character ranges as the inside of a regex
    character class.

    * A range of one character is written as that character.
    * A range of two adjacent characters is written as both characters,
      since ``a-b`` would be no shorter than ``ab``.
    * Anything longer is written as ``first-last``.

    The pieces are concatenated in the order given and returned as a single
    string. No escaping is applied to the characters.
    """

    def render(first, last):
        if first == last:
            return first

        if ord(last) - ord(first) == 1:
            return first + last

        return f"{first}-{last}"

    return "".join(render(first, last) for first, last in ranges)
|
|
|
|
|
|
def main():
    """Generate the identifier regex and write it out as a Python module.

    The character class is built from :func:`get_characters`, collapsed
    into ranges and rendered with :func:`build_pattern`. The result is
    written to ``src/jinja2/_identifier.py``, resolved relative to the
    parent of this script's directory; any existing file is overwritten.
    """
    pattern = build_pattern(collapse_ranges(get_characters()))

    script_dir = os.path.dirname(__file__)
    filename = os.path.abspath(
        os.path.join(script_dir, "..", "src", "jinja2", "_identifier.py")
    )

    # The generated module, one entry per output line.
    lines = [
        "import re\n\n",
        "# generated by scripts/generate_identifier_pattern.py\n",
        "pattern = re.compile(\n",
        f' r"[\\w{pattern}]+" # noqa: B950\n',
        ")\n",
    ]

    with open(filename, "w", encoding="utf8") as f:
        f.writelines(lines)
|
|
|
|
|
|
# Regenerate the pattern module only when this file is executed as a script;
# importing it must not write any files.
if __name__ == "__main__":
    main()
|