You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
150 lines
3.5 KiB
150 lines
3.5 KiB
#!/bin/sh

# Regenerates the DFAs stored under src/unicode/fsm/ by compiling regexes
# with ucd-generate. Run with --help for usage or --list-commands for the
# names of the individual generators.

# Abort as soon as any command fails.
set -e

# Directory containing this script; the regex/*.sh helper scripts are looked
# up relative to it.
D="$(dirname "$0")"

# Convenience function for checking that a command exists.
#
# Arguments: $1 - name of the command to look up with `command -v`.
# Outputs:   an error message on stderr when the command is not found.
# Exits the shell with status 1 when the command is missing; otherwise
# falls through with status 0.
#
# The argument is read straight from $1 (no global variable is assigned) so
# that callers using a variable named `cmd` are not clobbered.
requires() {
  if ! command -v "$1" > /dev/null 2>&1; then
    echo "DEPENDENCY MISSING: $1 must be installed" >&2
    exit 1
  fi
}

# Test if a list of words ($2 onward) contains a particular element ($1).
#
# Arguments: $1 - the element to search for.
#            $2... - the candidate elements.
# Returns:   0 if some candidate is string-equal to $1, 1 otherwise
#            (including when called with no arguments at all).
array_exists() {
  # `shift` with nothing to shift is an error in POSIX shells, so bail out
  # early when there is no needle.
  if [ $# -eq 0 ]; then
    return 1
  fi

  needle="$1"
  shift

  for el in "$@"; do
    if [ "$el" = "$needle" ]; then
      return 0
    fi
  done
  return 1
}

# Generate the forward and reverse grapheme DFAs.
#
# Globals:  D (read) - directory holding regex/grapheme.sh.
#           regex (written) - stdout of regex/grapheme.sh.
# Outputs:  progress messages on stdout.
# Invokes `ucd-generate dfa` twice with src/unicode/fsm/ and the regex as
# positional arguments (presumably the output directory and the pattern to
# compile -- confirm against the ucd-generate documentation).
graphemes() {
  regex="$(sh "$D/regex/grapheme.sh")"

  echo "generating forward grapheme DFA"
  ucd-generate dfa \
    --name GRAPHEME_BREAK_FWD \
    --sparse --minimize --anchored --state-size 2 \
    src/unicode/fsm/ \
    "$regex"

  echo "generating reverse grapheme DFA"
  ucd-generate dfa \
    --name GRAPHEME_BREAK_REV \
    --reverse --longest \
    --sparse --minimize --anchored --state-size 2 \
    src/unicode/fsm/ \
    "$regex"
}

# Generate the forward word DFA.
#
# Globals:  D (read) - directory holding regex/word.sh.
#           regex (written) - stdout of regex/word.sh.
# Outputs:  a progress message on stdout.
# Invokes `ucd-generate dfa` with src/unicode/fsm/ and the regex as
# positional arguments (presumably the output directory and the pattern to
# compile -- confirm against the ucd-generate documentation).
words() {
  regex="$(sh "$D/regex/word.sh")"

  echo "generating forward word DFA (this can take a while)"
  ucd-generate dfa \
    --name WORD_BREAK_FWD \
    --sparse --minimize --anchored --state-size 4 \
    src/unicode/fsm/ \
    "$regex"
}

# Generate the forward sentence DFA.
#
# Globals:  D (read) - directory holding regex/sentence.sh.
#           regex (written) - stdout of regex/sentence.sh.
# Outputs:  a progress message on stdout.
# Invokes `ucd-generate dfa` with src/unicode/fsm/ and the regex as
# positional arguments (presumably the output directory and the pattern to
# compile -- confirm against the ucd-generate documentation).
sentences() {
  regex="$(sh "$D/regex/sentence.sh")"

  echo "generating forward sentence DFA (this can take a while)"
  ucd-generate dfa \
    --name SENTENCE_BREAK_FWD \
    --minimize \
    --sparse --anchored --state-size 4 \
    src/unicode/fsm/ \
    "$regex"
}

# Generate the reverse regional indicator DFA.
#
# Outputs: a progress message on stdout.
regional_indicator() {
  # For finding all occurrences of region indicators. This is used to handle
  # regional indicators as a special case for the reverse grapheme iterator
  # and the reverse word iterator.
  echo "generating regional indicator DFA"
  ucd-generate dfa \
    --name REGIONAL_INDICATOR_REV \
    --reverse \
    --classes --minimize --anchored --premultiply --state-size 1 \
    src/unicode/fsm/ \
    "\p{gcb=Regional_Indicator}"
}

# Generate the forward "simple word" DFA from the pattern \w.
#
# Outputs: a progress message on stdout.
simple_word() {
  echo "generating forward simple word DFA"
  ucd-generate dfa \
    --name SIMPLE_WORD_FWD \
    --sparse --minimize --state-size 2 \
    src/unicode/fsm/ \
    "\w"
}

# Generate the forward and reverse anchored whitespace DFAs from the
# pattern \s+.
#
# Outputs: progress messages on stdout.
whitespace() {
  echo "generating forward whitespace DFA"
  ucd-generate dfa \
    --name WHITESPACE_ANCHORED_FWD \
    --anchored --classes --premultiply --minimize --state-size 1 \
    src/unicode/fsm/ \
    "\s+"

  echo "generating reverse whitespace DFA"
  ucd-generate dfa \
    --name WHITESPACE_ANCHORED_REV \
    --reverse \
    --anchored --classes --premultiply --minimize --state-size 1 \
    src/unicode/fsm/ \
    "\s+"
}

# Parse the command line and run the requested generators.
#
# Arguments: zero or more command names (see $commands below), or "all".
#            With no arguments, or when "all" is present, every command is
#            run in the order listed in $commands.
#            -h / --help       print usage to stderr and exit.
#            --list-commands   print the command names, one per line, and
#                              exit.
# Outputs:   "unrecognized command: <name>" on stderr for each unknown name.
# Returns:   0 if every requested command was recognized; 1 if at least one
#            was not (the remaining commands are still run).
main() {
  if array_exists "-h" "$@" || array_exists "--help" "$@"; then
    echo "Usage: $(basename "$0") [--list-commands] [<command>] ..." >&2
    exit
  fi

  commands="
    graphemes
    sentences
    words
    regional-indicator
    simple-word
    whitespace
  "
  if array_exists "--list-commands" "$@"; then
    for cmd in $commands; do
      echo "$cmd"
    done
    exit
  fi

  # ucd-generate is used to compile regexes into DFAs.
  requires ucd-generate

  mkdir -p src/unicode/fsm/

  cmds=$*
  if [ $# -eq 0 ] || array_exists "all" "$@"; then
    cmds=$commands
  fi

  # Remember whether any requested command was unknown so the failure is
  # visible in the exit status, not only on stderr.
  rc=0
  for cmd in $cmds; do
    # $commands is deliberately unquoted: it is a whitespace-separated list.
    if array_exists "$cmd" $commands; then
      # Command names use dashes; the functions implementing them use
      # underscores. The name was validated above, so it can be invoked
      # directly without eval.
      fun="$(echo "$cmd" | sed 's/-/_/g')"
      "$fun"
    else
      echo "unrecognized command: $cmd" >&2
      rc=1
    fi
  done
  return "$rc"
}

# Run the script, forwarding all command-line arguments unchanged.
main "$@"