You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
481 lines
15 KiB
481 lines
15 KiB
/* uwildmat.c is reused from libinn - https://launchpad.net/ubuntu/+source/inn2/2.5.4-1
|
|
|
|
This provides wild card matching originally used in InterNetNews and is
|
|
described in https://tools.ietf.org/html/rfc3977#section-4
|
|
|
|
INN licence:
|
|
INN as a whole and all code contained in it not otherwise marked with
|
|
different licenses and/or copyrights is covered by the following copyright
|
|
and license:
|
|
|
|
Copyright (c) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
|
|
2013, 2014 by Internet Systems Consortium, Inc. ("ISC")
|
|
Copyright (c) 1991, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
|
|
2002, 2003 by The Internet Software Consortium and Rich Salz
|
|
|
|
This code is derived from software contributed to the Internet Software
|
|
Consortium by Rich Salz.
|
|
|
|
Permission to use, copy, modify, and distribute this software for any
|
|
purpose with or without fee is hereby granted, provided that the above
|
|
copyright notice and this permission notice appear in all copies.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
|
|
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
|
|
SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
|
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
|
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
|
|
*/
|
|
|
|
/* $Id: uwildmat.c 8918 2010-01-22 23:28:28Z iulius $
|
|
**
|
|
** wildmat pattern matching with Unicode UTF-8 extensions.
|
|
**
|
|
** Do shell-style pattern matching for ?, \, [], and * characters. Might not
|
|
** be robust in face of malformed patterns; e.g., "foo[a-" could cause a
|
|
** segmentation violation. It is 8-bit clean. (Robustness hopefully fixed
|
|
** July 2000; all malformed patterns should now just fail to match anything.)
|
|
**
|
|
** Original by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.
|
|
** Rich $alz is now <rsalz@osf.org>.
|
|
**
|
|
** April, 1991: Replaced mutually-recursive calls with in-line code for the
|
|
** star character.
|
|
**
|
|
** Special thanks to Lars Mathiesen <thorinn@diku.dk> for the ABORT code.
|
|
** This can greatly speed up failing wildcard patterns. For example:
|
|
**
|
|
** pattern: -*-*-*-*-*-*-12-*-*-*-m-*-*-*
|
|
** text 1: -adobe-courier-bold-o-normal--12-120-75-75-m-70-iso8859-1
|
|
** text 2: -adobe-courier-bold-o-normal--12-120-75-75-X-70-iso8859-1
|
|
**
|
|
** Text 1 matches with 51 calls, while text 2 fails with 54 calls. Without
|
|
** the ABORT code, it takes 22310 calls to fail. Ugh. The following
|
|
** explanation is from Lars:
|
|
**
|
|
** The precondition that must be fulfilled is that DoMatch will consume at
|
|
** least one character in text. This is true if *p is neither '*' nor '\0'.)
|
|
** The last return has ABORT instead of false to avoid quadratic behaviour in
|
|
** cases like pattern "*a*b*c*d" with text "abcxxxxx". With false, each
|
|
** star-loop has to run to the end of the text; with ABORT only the last one
|
|
** does.
|
|
**
|
|
** Once the control of one instance of DoMatch enters the star-loop, that
|
|
** instance will return either true or ABORT, and any calling instance will
|
|
** therefore return immediately after (without calling recursively again).
|
|
** In effect, only one star-loop is ever active. It would be possible to
|
|
** modify the code to maintain this context explicitly, eliminating all
|
|
** recursive calls at the cost of some complication and loss of clarity (and
|
|
** the ABORT stuff seems to be unclear enough by itself). I think it would
|
|
** be unwise to try to get this into a released version unless you have a
|
|
** good test data base to try it out on.
|
|
**
|
|
** June, 1991: Robert Elz <kre@munnari.oz.au> added minus and close bracket
|
|
** handling for character sets.
|
|
**
|
|
** July, 2000: Largely rewritten by Russ Allbery <rra@stanford.edu> to add
|
|
** support for ',', '!', and optionally '@' to the core wildmat routine.
|
|
** Broke the character class matching into a separate function for clarity
|
|
** since it's infrequently used in practice, and added some simple lookahead
|
|
** to significantly decrease the recursive calls in the '*' matching code.
|
|
** Added support for UTF-8 as the default character set for any high-bit
|
|
** characters.
|
|
**
|
|
** For more information on UTF-8, see RFC 3629.
|
|
**
|
|
** Please note that this file is intentionally written so that conditionally
|
|
** executed expressions are on separate lines from the condition to
|
|
** facilitate analysis of the coverage of the test suite using purecov.
|
|
** Please preserve this. As of March 11, 2001, purecov reports that the
|
|
** accompanying test suite achieves 100% coverage of this file.
|
|
*/
|
|
|
|
#include <ctype.h>
|
|
#include <string.h>
|
|
#include <stdint.h>
|
|
#include "uwildmat/uwildmat.h"
|
|
|
|
#define ABORT -1
|
|
|
|
/* Whether or not an octet looks like the start of a UTF-8 character. */
|
|
#define ISUTF8(c) (((c) & 0xc0) == 0xc0)
|
|
|
|
|
|
/*
|
|
** Determine the length of a non-ASCII character in octets (for advancing
|
|
** pointers when skipping over characters). Takes a pointer to the start of
|
|
** the character and to the last octet of the string. If end is NULL, expect
|
|
** the string pointed to by start to be nul-terminated. If the character is
|
|
** malformed UTF-8, return 1 to treat it like an eight-bit local character.
|
|
*/
|
|
static int
|
|
utf8_length(const unsigned char *start, const unsigned char *end)
|
|
{
|
|
unsigned char mask = 0x80;
|
|
const unsigned char *p;
|
|
int length = 0;
|
|
int left;
|
|
|
|
for (; mask > 0 && (*start & mask) == mask; mask >>= 1)
|
|
length++;
|
|
if (length < 2 || length > 6)
|
|
return 1;
|
|
if (end != NULL && (end - start + 1) < length)
|
|
return 1;
|
|
left = length - 1;
|
|
for (p = start + 1; left > 0 && (*p & 0xc0) == 0x80; p++)
|
|
left--;
|
|
return (left == 0) ? length : 1;
|
|
}
|
|
|
|
|
|
/*
|
|
** Check whether a string contains only valid UTF-8 characters.
|
|
*/
|
|
bool
|
|
is_valid_utf8(const char *text)
|
|
{
|
|
unsigned char mask;
|
|
const unsigned char *p;
|
|
int length;
|
|
int left;
|
|
|
|
for (p = (const unsigned char *)text; *p != '\0';) {
|
|
mask = 0x80;
|
|
length = 0;
|
|
|
|
/* Find out the expected length of the character. */
|
|
for (; mask > 0 && (*p & mask) == mask; mask >>= 1)
|
|
length++;
|
|
|
|
p++;
|
|
|
|
/* Valid ASCII. */
|
|
if (length == 0)
|
|
continue;
|
|
|
|
/* Invalid length. */
|
|
if (length < 2 || length > 6)
|
|
return false;
|
|
|
|
/* Check that each byte looks like 10xxxxxx, except for the first. */
|
|
left = length - 1;
|
|
for (; left > 0 && (*p & 0xc0) == 0x80; p++)
|
|
left--;
|
|
|
|
if (left > 0)
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
/*
|
|
** Convert a UTF-8 character to UCS-4. Takes a pointer to the start of the
|
|
** character and to the last octet of the string, and to a uint32_t into
|
|
** which to put the decoded UCS-4 value. If end is NULL, expect the string
|
|
** pointed to by start to be nul-terminated. Returns the number of octets in
|
|
** the UTF-8 encoding. If the UTF-8 character is malformed, set result to
|
|
** the decimal value of the first octet; this is wrong, but it will generally
|
|
** cause the rest of the wildmat matching to do the right thing for non-UTF-8
|
|
** input.
|
|
*/
|
|
static int
|
|
utf8_decode(const unsigned char *start, const unsigned char *end,
|
|
uint32_t *result)
|
|
{
|
|
uint32_t value = 0;
|
|
int length, i;
|
|
const unsigned char *p = start;
|
|
unsigned char mask;
|
|
|
|
length = utf8_length(start, end);
|
|
if (length < 2) {
|
|
*result = *start;
|
|
return 1;
|
|
}
|
|
mask = (1 << (7 - length)) - 1;
|
|
value = *p & mask;
|
|
p++;
|
|
for (i = length - 1; i > 0; i--) {
|
|
value = (value << 6) | (*p & 0x3f);
|
|
p++;
|
|
}
|
|
*result = value;
|
|
return length;
|
|
}
|
|
|
|
|
|
/*
|
|
** Match a character class against text, a UCS-4 character. start is a
|
|
** pointer to the first character of the character class, end a pointer to
|
|
** the last. Returns whether the class matches that character.
|
|
*/
|
|
static bool
|
|
match_class(uint32_t text, const unsigned char *start,
|
|
const unsigned char *end)
|
|
{
|
|
bool reversed, allowrange;
|
|
const unsigned char *p = start;
|
|
uint32_t first = 0;
|
|
uint32_t last;
|
|
uint32_t lc = tolower(text);
|
|
|
|
/* Check for an inverted character class (starting with ^). If the
|
|
character matches the character class, we return !reversed; that way,
|
|
we return true if it's a regular character class and false if it's a
|
|
reversed one. If the character doesn't match, we return reversed. */
|
|
reversed = (*p == '^');
|
|
if (reversed)
|
|
p++;
|
|
|
|
/* Walk through the character class until we reach the end or find a
|
|
match, handling character ranges as we go. Only permit a range to
|
|
start when allowrange is true; this allows - to be treated like a
|
|
normal character as the first character of the class and catches
|
|
malformed ranges like a-e-n. We treat the character at the beginning
|
|
of a range as both a regular member of the class and the beginning of
|
|
the range; this is harmless (although it means that malformed ranges
|
|
like m-a will match m and nothing else). */
|
|
allowrange = false;
|
|
while (p <= end) {
|
|
if (allowrange && *p == '-' && p < end) {
|
|
p++;
|
|
p += utf8_decode(p, end, &last);
|
|
if ((text >= first && text <= last) ||
|
|
(lc >= first && lc <= last))
|
|
return !reversed;
|
|
allowrange = false;
|
|
} else {
|
|
p += utf8_decode(p, end, &first);
|
|
if (text == first || lc == first)
|
|
return !reversed;
|
|
allowrange = true;
|
|
}
|
|
}
|
|
return reversed;
|
|
}
|
|
|
|
|
|
/*
|
|
** Match the text against the pattern between start and end. This is a
|
|
** single pattern; a leading ! or @ must already be taken care of, and
|
|
** commas must be dealt with outside of this routine.
|
|
*/
|
|
static int
|
|
match_pattern(const unsigned char *text, const unsigned char *start,
|
|
const unsigned char *end)
|
|
{
|
|
const unsigned char *q, *endclass;
|
|
const unsigned char *p = start;
|
|
bool ismeta;
|
|
int matched, width;
|
|
uint32_t c;
|
|
unsigned char lc;
|
|
|
|
for (; p <= end; p++) {
|
|
if (!*text && *p != '*')
|
|
return ABORT;
|
|
|
|
switch (*p) {
|
|
case '\\':
|
|
if (!*++p)
|
|
return ABORT;
|
|
/* Fall through. */
|
|
|
|
default:
|
|
lc = tolower(*text);
|
|
if (*text++ != *p && lc != *p)
|
|
return false;
|
|
break;
|
|
|
|
case '?':
|
|
text += ISUTF8(*text) ? utf8_length(text, NULL) : 1;
|
|
break;
|
|
|
|
case '*':
|
|
/* Consecutive stars are equivalent to one. Advance pattern to
|
|
the character after the star. */
|
|
for (++p; *p == '*'; p++)
|
|
;
|
|
|
|
/* A trailing star will match anything. */
|
|
if (p > end)
|
|
return true;
|
|
|
|
/* Basic algorithm: Recurse at each point where the * could
|
|
possibly match. If the match succeeds or aborts, return
|
|
immediately; otherwise, try the next position.
|
|
|
|
Optimization: If the character after the * in the pattern
|
|
isn't a metacharacter (the common case), then the * has to
|
|
consume characters at least up to the next occurrence of that
|
|
character in the text. Scan forward for those points rather
|
|
than recursing at every possible point to save the extra
|
|
function call overhead. */
|
|
ismeta = (*p == '[' || *p == '?' || *p == '\\');
|
|
while (*text) {
|
|
width = ISUTF8(*text) ? utf8_length(text, NULL) : 1;
|
|
if (ismeta) {
|
|
matched = match_pattern(text, p, end);
|
|
text += width;
|
|
} else {
|
|
while (*text && *text != *p) {
|
|
text += width;
|
|
width = ISUTF8(*text) ? utf8_length(text, NULL) : 1;
|
|
}
|
|
if (!*text)
|
|
return ABORT;
|
|
matched = match_pattern(++text, p + 1, end);
|
|
}
|
|
if (matched != false)
|
|
return matched;
|
|
}
|
|
return ABORT;
|
|
|
|
case '[':
|
|
/* Find the end of the character class, making sure not to pick
|
|
up a close bracket at the beginning of the class. */
|
|
p++;
|
|
q = p + (*p == '^') + 1;
|
|
if (q > end)
|
|
return ABORT;
|
|
endclass = memchr(q, ']', (size_t) (end - q + 1));
|
|
if (!endclass)
|
|
return ABORT;
|
|
|
|
/* Do the heavy lifting in another function for clarity, since
|
|
character classes are an uncommon case. */
|
|
text += utf8_decode(text, NULL, &c);
|
|
if (!match_class(c, p, endclass - 1))
|
|
return false;
|
|
p = endclass;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return (*text == '\0');
|
|
}
|
|
|
|
|
|
/*
|
|
** Takes text and a wildmat expression; a wildmat expression is a
|
|
** comma-separated list of wildmat patterns, optionally preceded by ! to
|
|
** invert the sense of the expression. Returns UWILDMAT_MATCH if that
|
|
** expression matches the text, UWILDMAT_FAIL otherwise. If allowpoison is
|
|
** set, allow @ to introduce a poison expression (the same as !, but if it
|
|
** triggers the failed match the routine returns UWILDMAT_POISON instead).
|
|
*/
|
|
static enum uwildmat
|
|
match_expression(const unsigned char *text, const unsigned char *start,
|
|
bool allowpoison)
|
|
{
|
|
const unsigned char *end, *split;
|
|
const unsigned char *p = start;
|
|
bool reverse, escaped;
|
|
bool match = false;
|
|
bool poison = false;
|
|
bool poisoned = false;
|
|
|
|
/* Handle the empty expression separately, since otherwise end will be
|
|
set to an invalid pointer. */
|
|
if (!*p)
|
|
return !*text ? UWILDMAT_MATCH : UWILDMAT_FAIL;
|
|
end = start + strlen((const char *) start) - 1;
|
|
|
|
/* Main match loop. Find each comma that separates patterns, and attempt
|
|
to match the text with each pattern in order. The last matching
|
|
pattern determines whether the whole expression matches. */
|
|
for (; p <= end + 1; p = split + 1) {
|
|
if (allowpoison)
|
|
poison = (*p == '@');
|
|
reverse = (*p == '!') || poison;
|
|
if (reverse)
|
|
p++;
|
|
|
|
/* Find the first unescaped comma, if any. If there is none, split
|
|
will be one greater than end and point at the nul at the end of
|
|
the string. */
|
|
for (escaped = false, split = p; split <= end; split++) {
|
|
if (*split == '[') {
|
|
split++;
|
|
if (*split == ']')
|
|
split++;
|
|
while (split <= end && *split != ']')
|
|
split++;
|
|
}
|
|
if (*split == ',' && !escaped)
|
|
break;
|
|
escaped = (*split == '\\') ? !escaped : false;
|
|
}
|
|
|
|
/* Optimization: If match == !reverse and poison == poisoned, this
|
|
pattern can't change the result, so don't do any work. */
|
|
if (match == !reverse && poison == poisoned)
|
|
continue;
|
|
if (match_pattern(text, p, split - 1) == true) {
|
|
poisoned = poison;
|
|
match = !reverse;
|
|
}
|
|
}
|
|
if (poisoned)
|
|
return UWILDMAT_POISON;
|
|
return match ? UWILDMAT_MATCH : UWILDMAT_FAIL;
|
|
}
|
|
|
|
|
|
/*
|
|
** User-level routine used for wildmats where @ should be treated as a
|
|
** regular character.
|
|
*/
|
|
bool
|
|
uwildmat(const char *text, const char *pat)
|
|
{
|
|
const unsigned char *utext = (const unsigned char *) text;
|
|
const unsigned char *upat = (const unsigned char *) pat;
|
|
|
|
if (upat[0] == '*' && upat[1] == '\0')
|
|
return true;
|
|
else
|
|
return (match_expression(utext, upat, false) == UWILDMAT_MATCH);
|
|
}
|
|
|
|
|
|
/*
|
|
** User-level routine used for wildmats that support poison matches.
|
|
*/
|
|
enum uwildmat
|
|
uwildmat_poison(const char *text, const char *pat)
|
|
{
|
|
const unsigned char *utext = (const unsigned char *) text;
|
|
const unsigned char *upat = (const unsigned char *) pat;
|
|
|
|
if (upat[0] == '*' && upat[1] == '\0')
|
|
return UWILDMAT_MATCH;
|
|
else
|
|
return match_expression(utext, upat, true);
|
|
}
|
|
|
|
|
|
/*
|
|
** User-level routine for simple expressions (neither , nor ! are special).
|
|
*/
|
|
bool
|
|
uwildmat_simple(const char *text, const char *pat)
|
|
{
|
|
const unsigned char *utext = (const unsigned char *) text;
|
|
const unsigned char *upat = (const unsigned char *) pat;
|
|
size_t length;
|
|
|
|
if (upat[0] == '*' && upat[1] == '\0')
|
|
return true;
|
|
else {
|
|
length = strlen(pat);
|
|
return (match_pattern(utext, upat, upat + length - 1) == true);
|
|
}
|
|
}
|