v811_spc009/external/rust/crates/bstr/scripts/regex/sentence.sh

#!/bin/sh

# vim: indentexpr= nosmartindent autoindent
# vim: tabstop=2 shiftwidth=2 softtabstop=2

# This is a regex that I reverse engineered from the sentence boundary chain
# rules in UAX #29. Unlike the grapheme regex, which is essentially provided
# for us in UAX #29, no such sentence regex exists.
#
# I looked into how ICU achieves this, since UAX #29 hints that producing
# finite state machines for grapheme/sentence/word/line breaking is possible,
# but only easy to do for graphemes. ICU does this by implementing their own
# DSL for describing the break algorithms in terms of the chaining rules
# directly. You can see an example for sentences in
# icu4c/source/data/brkitr/rules/sent.txt. ICU then builds a finite state
# machine from those rules in a mostly standard way, but implements the
# "chaining" aspect of the rules by connecting overlapping end and start
# states. For example, given SB7:
#
#     (Upper | Lower) ATerm x Upper
#
# Then the naive way to convert this into a regex would be something like
#
#     [\p{sb=Upper}\p{sb=Lower}]\p{sb=ATerm}\p{sb=Upper}
#
# Unfortunately, this is incorrect. Why? Well, consider an example like so:
#
#     U.S.A.
#
# A correct implementation of the sentence breaking algorithm should not insert
# any breaks here, exactly in accordance with repeatedly applying rule SB7 as
# given above. Our regex fails to do this because it will first match `U.S`
# without breaking them---which is correct---but will then start looking for
# its next rule beginning with a full stop (in ATerm) and followed by an
# uppercase letter (A). This will wind up triggering rule SB11 (without
# matching `A`), which inserts a break.
#
# The reason why this happens is because our initial application of rule SB7
# "consumes" the next uppercase letter (S), which we want to reuse as a prefix
# in the next rule application. A natural way to express this would be with
# look-around, although it's not clear that works in every case since you
# ultimately might want to consume that ending uppercase letter. In any case,
# we can't use look-around in our truly regular regexes, so we must fix this.
# The approach we take is to explicitly repeat rules when a suffix of a rule
# is a prefix of another rule. In the case of SB7, the end of the rule, an
# uppercase letter, also happens to match the beginning of the rule. This can
# in turn be repeated indefinitely. Thus, our actual translation to a regex is:
#
#     [\p{sb=Upper}\p{sb=Lower}]\p{sb=ATerm}\p{sb=Upper}(\p{sb=ATerm}\p{sb=Upper}*
#
# It turns out that this is exactly what ICU does, but in their case, they do
# it automatically. In our case, we connect the chaining rules manually. It's
# tedious. With that said, we do no implement Unicode line breaking with this
# approach, which is a far scarier beast. In that case, it would probably be
# worth writing the code to do what ICU does.
#
# In the case of sentence breaks, there aren't *too* many overlaps of this
# nature. We list them out exhaustively to make this clear, because it's
# essentially impossible to easily observe this in the regex. (It took me a
# full day to figure all of this out.) Rules marked with N/A mean that they
# specify a break, and this strategy only really applies to stringing together
# non-breaks.
#
#     SB1   - N/A
#     SB2   - N/A
#     SB3   - None
#     SB4   - N/A
#     SB5   - None
#     SB6   - None
#     SB7   - End overlaps with beginning of SB7
#     SB8   - End overlaps with beginning of SB7
#     SB8a  - End overlaps with beginning of SB6, SB8, SB8a, SB9, SB10, SB11
#     SB9   - None
#     SB10  - None
#     SB11  - None
#     SB998 - N/A
#
# SB8a is in particular quite tricky to get right without look-ahead, since it
# allows ping-ponging between match rules SB8a and SB9-11, where SB9-11
# otherwise indicate that a break has been found. In the regex below, we tackle
# this by only permitting part of SB8a to match inside our core non-breaking
# repetition. In particular, we only allow the parts of SB8a to match that
# permit the non-breaking components to continue. If a part of SB8a matches
# that guarantees a pop out to SB9-11, (like `STerm STerm`), then we let it
# happen. This still isn't correct because an SContinue might be seen which
# would allow moving back into SB998 and thus the non-breaking repetition, so
# we handle that case as well.
#
# Finally, the last complication here is the sprinkling of $Ex* everywhere.
# This essentially corresponds to the implementation of SB5 by following
# UAX #29's recommendation in S6.2. Essentially, we use it avoid ever breaking
# in the middle of a grapheme cluster.

CR="\p{sb=CR}"
LF="\p{sb=LF}"
Sep="\p{sb=Sep}"
Close="\p{sb=Close}"
Sp="\p{sb=Sp}"
STerm="\p{sb=STerm}"
ATerm="\p{sb=ATerm}"
SContinue="\p{sb=SContinue}"
Numeric="\p{sb=Numeric}"
Upper="\p{sb=Upper}"
Lower="\p{sb=Lower}"
OLetter="\p{sb=OLetter}"

Ex="[\p{sb=Extend}\p{sb=Format}]"
ParaSep="[$Sep $CR $LF]"
SATerm="[$STerm $ATerm]"

LetterSepTerm="[$OLetter $Upper $Lower $ParaSep $SATerm]"

echo "(?x)
(
  # SB6
  $ATerm $Ex*
    $Numeric
  |
  # SB7
  [$Upper $Lower] $Ex* $ATerm $Ex*
    $Upper $Ex*
    # overlap with SB7
    ($ATerm $Ex* $Upper $Ex*)*
  |
  # SB8
  $ATerm $Ex* $Close* $Ex* $Sp* $Ex*
    ([^$LetterSepTerm] $Ex*)* $Lower $Ex*
    # overlap with SB7
    ($ATerm $Ex* $Upper $Ex*)*
  |
  # SB8a
  $SATerm $Ex* $Close* $Ex* $Sp* $Ex*
  (
    $SContinue
    |
    $ATerm $Ex*
      # Permit repetition of SB8a
      (($Close $Ex*)* ($Sp $Ex*)* $SATerm)*
      # In order to continue non-breaking matching, we now must observe
      # a match with a rule that keeps us in SB6-8a. Otherwise, we've entered
      # one of SB9-11 and know that a break must follow.
      (
        # overlap with SB6
        $Numeric
        |
        # overlap with SB8
        ($Close $Ex*)* ($Sp $Ex*)*
          ([^$LetterSepTerm] $Ex*)* $Lower $Ex*
          # overlap with SB7
          ($ATerm $Ex* $Upper $Ex*)*
        |
        # overlap with SB8a
        ($Close $Ex*)* ($Sp $Ex*)* $SContinue
      )
    |
    $STerm $Ex*
      # Permit repetition of SB8a
      (($Close $Ex*)* ($Sp $Ex*)* $SATerm)*
      # As with ATerm above, in order to continue non-breaking matching, we
      # must now observe a match with a rule that keeps us out of SB9-11.
      # For STerm, the only such possibility is to see an SContinue. Anything
      # else will result in a break.
      ($Close $Ex*)* ($Sp $Ex*)* $SContinue
  )
  |
  # SB998
  # The logic behind this catch-all is that if we get to this point and
  # see a Sep, CR, LF, STerm or ATerm, then it has to fall into one of
  # SB9, SB10 or SB11. In the cases of SB9-11, we always find a break since
  # SB11 acts as a catch-all to induce a break following a SATerm that isn't
  # handled by rules SB6-SB8a.
  [^$ParaSep $SATerm]
)*
# The following collapses rules SB3, SB4, part of SB8a, SB9, SB10 and SB11.
($SATerm $Ex* ($Close $Ex*)* ($Sp $Ex*)*)* ($CR $LF | $ParaSep)?
"
v811_spc009_project 7 months ago			`#!/bin/sh`

			`# vim: indentexpr= nosmartindent autoindent`
			`# vim: tabstop=2 shiftwidth=2 softtabstop=2`

			`# This is a regex that I reverse engineered from the sentence boundary chain`
			`# rules in UAX #29. Unlike the grapheme regex, which is essentially provided`
			`# for us in UAX #29, no such sentence regex exists.`
			`#`
			`# I looked into how ICU achieves this, since UAX #29 hints that producing`
			`# finite state machines for grapheme/sentence/word/line breaking is possible,`
			`# but only easy to do for graphemes. ICU does this by implementing their own`
			`# DSL for describing the break algorithms in terms of the chaining rules`
			`# directly. You can see an example for sentences in`
			`# icu4c/source/data/brkitr/rules/sent.txt. ICU then builds a finite state`
			`# machine from those rules in a mostly standard way, but implements the`
			`# "chaining" aspect of the rules by connecting overlapping end and start`
			`# states. For example, given SB7:`
			`#`
			`# (Upper \| Lower) ATerm x Upper`
			`#`
			`# Then the naive way to convert this into a regex would be something like`
			`#`
			`# [\p{sb=Upper}\p{sb=Lower}]\p{sb=ATerm}\p{sb=Upper}`
			`#`
			`# Unfortunately, this is incorrect. Why? Well, consider an example like so:`
			`#`
			`# U.S.A.`
			`#`
			`# A correct implementation of the sentence breaking algorithm should not insert`
			`# any breaks here, exactly in accordance with repeatedly applying rule SB7 as`
			# given above. Our regex fails to do this because it will first match `U.S`
			`# without breaking them---which is correct---but will then start looking for`
			`# its next rule beginning with a full stop (in ATerm) and followed by an`
			`# uppercase letter (A). This will wind up triggering rule SB11 (without`
			# matching `A`), which inserts a break.
			`#`
			`# The reason why this happens is because our initial application of rule SB7`
			`# "consumes" the next uppercase letter (S), which we want to reuse as a prefix`
			`# in the next rule application. A natural way to express this would be with`
			`# look-around, although it's not clear that works in every case since you`
			`# ultimately might want to consume that ending uppercase letter. In any case,`
			`# we can't use look-around in our truly regular regexes, so we must fix this.`
			`# The approach we take is to explicitly repeat rules when a suffix of a rule`
			`# is a prefix of another rule. In the case of SB7, the end of the rule, an`
			`# uppercase letter, also happens to match the beginning of the rule. This can`
			`# in turn be repeated indefinitely. Thus, our actual translation to a regex is:`
			`#`
			`# [\p{sb=Upper}\p{sb=Lower}]\p{sb=ATerm}\p{sb=Upper}(\p{sb=ATerm}\p{sb=Upper}*`
			`#`
			`# It turns out that this is exactly what ICU does, but in their case, they do`
			`# it automatically. In our case, we connect the chaining rules manually. It's`
			`# tedious. With that said, we do no implement Unicode line breaking with this`
			`# approach, which is a far scarier beast. In that case, it would probably be`
			`# worth writing the code to do what ICU does.`
			`#`
			`# In the case of sentence breaks, there aren't too many overlaps of this`
			`# nature. We list them out exhaustively to make this clear, because it's`
			`# essentially impossible to easily observe this in the regex. (It took me a`
			`# full day to figure all of this out.) Rules marked with N/A mean that they`
			`# specify a break, and this strategy only really applies to stringing together`
			`# non-breaks.`
			`#`
			`# SB1 - N/A`
			`# SB2 - N/A`
			`# SB3 - None`
			`# SB4 - N/A`
			`# SB5 - None`
			`# SB6 - None`
			`# SB7 - End overlaps with beginning of SB7`
			`# SB8 - End overlaps with beginning of SB7`
			`# SB8a - End overlaps with beginning of SB6, SB8, SB8a, SB9, SB10, SB11`
			`# SB9 - None`
			`# SB10 - None`
			`# SB11 - None`
			`# SB998 - N/A`
			`#`
			`# SB8a is in particular quite tricky to get right without look-ahead, since it`
			`# allows ping-ponging between match rules SB8a and SB9-11, where SB9-11`
			`# otherwise indicate that a break has been found. In the regex below, we tackle`
			`# this by only permitting part of SB8a to match inside our core non-breaking`
			`# repetition. In particular, we only allow the parts of SB8a to match that`
			`# permit the non-breaking components to continue. If a part of SB8a matches`
			# that guarantees a pop out to SB9-11, (like `STerm STerm`), then we let it
			`# happen. This still isn't correct because an SContinue might be seen which`
			`# would allow moving back into SB998 and thus the non-breaking repetition, so`
			`# we handle that case as well.`
			`#`
			`# Finally, the last complication here is the sprinkling of $Ex* everywhere.`
			`# This essentially corresponds to the implementation of SB5 by following`
			`# UAX #29's recommendation in S6.2. Essentially, we use it avoid ever breaking`
			`# in the middle of a grapheme cluster.`

			`CR="\p{sb=CR}"`
			`LF="\p{sb=LF}"`
			`Sep="\p{sb=Sep}"`
			`Close="\p{sb=Close}"`
			`Sp="\p{sb=Sp}"`
			`STerm="\p{sb=STerm}"`
			`ATerm="\p{sb=ATerm}"`
			`SContinue="\p{sb=SContinue}"`
			`Numeric="\p{sb=Numeric}"`
			`Upper="\p{sb=Upper}"`
			`Lower="\p{sb=Lower}"`
			`OLetter="\p{sb=OLetter}"`

			`Ex="[\p{sb=Extend}\p{sb=Format}]"`
			`ParaSep="[$Sep $CR $LF]"`
			`SATerm="[$STerm $ATerm]"`

			`LetterSepTerm="[$OLetter $Upper $Lower $ParaSep $SATerm]"`

			`echo "(?x)`
			`(`
			`# SB6`
			`$ATerm $Ex*`
			`$Numeric`
			`\|`
			`# SB7`
			`[$Upper $Lower] $Ex* $ATerm $Ex*`
			`$Upper $Ex*`
			`# overlap with SB7`
			`($ATerm $Ex* $Upper $Ex)`
			`\|`
			`# SB8`
			`$ATerm $Ex* $Close* $Ex* $Sp* $Ex*`
			`([^$LetterSepTerm] $Ex) $Lower $Ex*`
			`# overlap with SB7`
			`($ATerm $Ex* $Upper $Ex)`
			`\|`
			`# SB8a`
			`$SATerm $Ex* $Close* $Ex* $Sp* $Ex*`
			`(`
			`$SContinue`
			`\|`
			`$ATerm $Ex*`
			`# Permit repetition of SB8a`
			`(($Close $Ex) ($Sp $Ex) $SATerm)*`
			`# In order to continue non-breaking matching, we now must observe`
			`# a match with a rule that keeps us in SB6-8a. Otherwise, we've entered`
			`# one of SB9-11 and know that a break must follow.`
			`(`
			`# overlap with SB6`
			`$Numeric`
			`\|`
			`# overlap with SB8`
			`($Close $Ex) ($Sp $Ex)`
			`([^$LetterSepTerm] $Ex) $Lower $Ex*`
			`# overlap with SB7`
			`($ATerm $Ex* $Upper $Ex)`
			`\|`
			`# overlap with SB8a`
			`($Close $Ex) ($Sp $Ex) $SContinue`
			`)`
			`\|`
			`$STerm $Ex*`
			`# Permit repetition of SB8a`
			`(($Close $Ex) ($Sp $Ex) $SATerm)*`
			`# As with ATerm above, in order to continue non-breaking matching, we`
			`# must now observe a match with a rule that keeps us out of SB9-11.`
			`# For STerm, the only such possibility is to see an SContinue. Anything`
			`# else will result in a break.`
			`($Close $Ex) ($Sp $Ex) $SContinue`
			`)`
			`\|`
			`# SB998`
			`# The logic behind this catch-all is that if we get to this point and`
			`# see a Sep, CR, LF, STerm or ATerm, then it has to fall into one of`
			`# SB9, SB10 or SB11. In the cases of SB9-11, we always find a break since`
			`# SB11 acts as a catch-all to induce a break following a SATerm that isn't`
			`# handled by rules SB6-SB8a.`
			`[^$ParaSep $SATerm]`
			`)*`
			`# The following collapses rules SB3, SB4, part of SB8a, SB9, SB10 and SB11.`
			`($SATerm $Ex* ($Close $Ex) ($Sp $Ex))* ($CR $LF \| $ParaSep)?`
			`"`