You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

462 lines
13 KiB

use std::collections::BTreeMap;
use std::env;
use std::fmt::{self, Write};
use std::thread;
use regex;
use regex_automata::{DenseDFA, ErrorKind, Regex, RegexBuilder, StateID, DFA};
use serde_bytes;
use toml;
macro_rules! load {
($col:ident, $path:expr) => {
$col.extend(RegexTests::load(
concat!("../data/tests/", $path),
include_bytes!(concat!("../data/tests/", $path)),
));
};
}
lazy_static! {
pub static ref SUITE: RegexTestCollection = {
let mut col = RegexTestCollection::new();
load!(col, "fowler/basic.toml");
load!(col, "fowler/nullsubexpr.toml");
load!(col, "fowler/repetition.toml");
load!(col, "fowler/repetition-long.toml");
load!(col, "crazy.toml");
load!(col, "flags.toml");
load!(col, "iter.toml");
load!(col, "no-unicode.toml");
load!(col, "unicode.toml");
col
};
}
#[derive(Clone, Debug)]
pub struct RegexTestCollection {
pub by_name: BTreeMap<String, RegexTest>,
}
#[derive(Clone, Debug, Deserialize)]
pub struct RegexTests {
pub tests: Vec<RegexTest>,
}
#[derive(Clone, Debug, Deserialize)]
pub struct RegexTest {
pub name: String,
#[serde(default)]
pub options: Vec<RegexTestOption>,
pub pattern: String,
#[serde(with = "serde_bytes")]
pub input: Vec<u8>,
#[serde(rename = "matches")]
pub matches: Vec<Match>,
#[serde(default)]
pub captures: Vec<Option<Match>>,
#[serde(default)]
pub fowler_line_number: Option<u64>,
}
#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)]
#[serde(rename_all = "kebab-case")]
pub enum RegexTestOption {
Anchored,
CaseInsensitive,
NoUnicode,
Escaped,
#[serde(rename = "invalid-utf8")]
InvalidUTF8,
}
#[derive(Clone, Copy, Deserialize, Eq, PartialEq)]
pub struct Match {
pub start: usize,
pub end: usize,
}
impl RegexTestCollection {
fn new() -> RegexTestCollection {
RegexTestCollection { by_name: BTreeMap::new() }
}
fn extend(&mut self, tests: RegexTests) {
for test in tests.tests {
let name = test.name.clone();
if self.by_name.contains_key(&name) {
panic!("found duplicate test {}", name);
}
self.by_name.insert(name, test);
}
}
pub fn tests(&self) -> Vec<&RegexTest> {
self.by_name.values().collect()
}
}
impl RegexTests {
fn load(path: &str, slice: &[u8]) -> RegexTests {
let mut data: RegexTests = toml::from_slice(slice)
.expect(&format!("failed to load {}", path));
for test in &mut data.tests {
if test.options.contains(&RegexTestOption::Escaped) {
test.input = unescape_bytes(&test.input);
}
}
data
}
}
#[derive(Debug)]
pub struct RegexTester {
asserted: bool,
results: RegexTestResults,
skip_expensive: bool,
whitelist: Vec<regex::Regex>,
blacklist: Vec<regex::Regex>,
}
impl Drop for RegexTester {
fn drop(&mut self) {
// If we haven't asserted yet, then the test is probably buggy, so
// fail it. But if we're already panicking (e.g., a bug in the regex
// engine), then don't double-panic, which causes an immediate abort.
if !thread::panicking() && !self.asserted {
panic!("must call RegexTester::assert at end of test");
}
}
}
impl RegexTester {
pub fn new() -> RegexTester {
let mut tester = RegexTester {
asserted: false,
results: RegexTestResults::default(),
skip_expensive: false,
whitelist: vec![],
blacklist: vec![],
};
for x in env::var("REGEX_TEST").unwrap_or("".to_string()).split(",") {
let x = x.trim();
if x.is_empty() {
continue;
}
if x.starts_with("-") {
tester = tester.blacklist(&x[1..]);
} else {
tester = tester.whitelist(x);
}
}
tester
}
pub fn skip_expensive(mut self) -> RegexTester {
self.skip_expensive = true;
self
}
pub fn whitelist(mut self, name: &str) -> RegexTester {
self.whitelist.push(regex::Regex::new(name).unwrap());
self
}
pub fn blacklist(mut self, name: &str) -> RegexTester {
self.blacklist.push(regex::Regex::new(name).unwrap());
self
}
pub fn assert(&mut self) {
self.asserted = true;
self.results.assert();
}
pub fn build_regex<S: StateID>(
&self,
mut builder: RegexBuilder,
test: &RegexTest,
) -> Option<Regex<DenseDFA<Vec<S>, S>>> {
if self.skip(test) {
return None;
}
self.apply_options(test, &mut builder);
match builder.build_with_size::<S>(&test.pattern) {
Ok(re) => Some(re),
Err(err) => {
if let ErrorKind::Unsupported(_) = *err.kind() {
None
} else {
panic!(
"failed to build {:?} with pattern '{:?}': {}",
test.name, test.pattern, err
);
}
}
}
}
pub fn test_all<'a, I, T>(&mut self, builder: RegexBuilder, tests: I)
where
I: IntoIterator<IntoIter = T, Item = &'a RegexTest>,
T: Iterator<Item = &'a RegexTest>,
{
for test in tests {
let builder = builder.clone();
let re: Regex = match self.build_regex(builder, test) {
None => continue,
Some(re) => re,
};
self.test(test, &re);
}
}
pub fn test<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) {
self.test_is_match(test, re);
self.test_find(test, re);
// Some tests (namely, fowler) are designed only to detect the
// first match even if there are more subsequent matches. To that
// end, we only test match iteration when the number of matches
// expected is not 1, or if the test name has 'iter' in it.
if test.name.contains("iter") || test.matches.len() != 1 {
self.test_find_iter(test, re);
}
}
pub fn test_is_match<'a, D: DFA>(
&mut self,
test: &RegexTest,
re: &Regex<D>,
) {
self.asserted = false;
let got = re.is_match(&test.input);
let expected = test.matches.len() >= 1;
if got == expected {
self.results.succeeded.push(test.clone());
return;
}
self.results.failed.push(RegexTestFailure {
test: test.clone(),
kind: RegexTestFailureKind::IsMatch,
});
}
pub fn test_find<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) {
self.asserted = false;
let got =
re.find(&test.input).map(|(start, end)| Match { start, end });
if got == test.matches.get(0).map(|&m| m) {
self.results.succeeded.push(test.clone());
return;
}
self.results.failed.push(RegexTestFailure {
test: test.clone(),
kind: RegexTestFailureKind::Find { got },
});
}
pub fn test_find_iter<'a, D: DFA>(
&mut self,
test: &RegexTest,
re: &Regex<D>,
) {
self.asserted = false;
let got: Vec<Match> = re
.find_iter(&test.input)
.map(|(start, end)| Match { start, end })
.collect();
if got == test.matches {
self.results.succeeded.push(test.clone());
return;
}
self.results.failed.push(RegexTestFailure {
test: test.clone(),
kind: RegexTestFailureKind::FindIter { got },
});
}
fn skip(&self, test: &RegexTest) -> bool {
if self.skip_expensive {
if test.name.starts_with("repetition-long") {
return true;
}
}
if !self.blacklist.is_empty() {
if self.blacklist.iter().any(|re| re.is_match(&test.name)) {
return true;
}
}
if !self.whitelist.is_empty() {
if !self.whitelist.iter().any(|re| re.is_match(&test.name)) {
return true;
}
}
false
}
fn apply_options(&self, test: &RegexTest, builder: &mut RegexBuilder) {
for opt in &test.options {
match *opt {
RegexTestOption::Anchored => {
builder.anchored(true);
}
RegexTestOption::CaseInsensitive => {
builder.case_insensitive(true);
}
RegexTestOption::NoUnicode => {
builder.unicode(false);
}
RegexTestOption::Escaped => {}
RegexTestOption::InvalidUTF8 => {
builder.allow_invalid_utf8(true);
}
}
}
}
}
#[derive(Clone, Debug, Default)]
pub struct RegexTestResults {
/// Tests that succeeded.
pub succeeded: Vec<RegexTest>,
/// Failed tests, indexed by group name.
pub failed: Vec<RegexTestFailure>,
}
#[derive(Clone, Debug)]
pub struct RegexTestFailure {
test: RegexTest,
kind: RegexTestFailureKind,
}
#[derive(Clone, Debug)]
pub enum RegexTestFailureKind {
IsMatch,
Find { got: Option<Match> },
FindIter { got: Vec<Match> },
}
impl RegexTestResults {
pub fn assert(&self) {
if self.failed.is_empty() {
return;
}
let failures = self
.failed
.iter()
.map(|f| f.to_string())
.collect::<Vec<String>>()
.join("\n\n");
panic!(
"found {} failures:\n{}\n{}\n{}\n\n\
Set the REGEX_TEST environment variable to filter tests, \n\
e.g., REGEX_TEST=crazy-misc,-crazy-misc2 runs every test \n\
whose name contains crazy-misc but not crazy-misc2\n\n",
self.failed.len(),
"~".repeat(79),
failures.trim(),
"~".repeat(79)
)
}
}
impl fmt::Display for RegexTestFailure {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
"{}: {}\n \
options: {:?}\n \
pattern: {}\n \
pattern (escape): {}\n \
input: {}\n \
input (escape): {}\n \
input (hex): {}",
self.test.name,
self.kind.fmt(&self.test)?,
self.test.options,
self.test.pattern,
escape_default(&self.test.pattern),
nice_raw_bytes(&self.test.input),
escape_bytes(&self.test.input),
hex_bytes(&self.test.input)
)
}
}
impl RegexTestFailureKind {
fn fmt(&self, test: &RegexTest) -> Result<String, fmt::Error> {
let mut buf = String::new();
match *self {
RegexTestFailureKind::IsMatch => {
if let Some(&m) = test.matches.get(0) {
write!(buf, "expected match (at {}), but none found", m)?
} else {
write!(buf, "expected no match, but found a match")?
}
}
RegexTestFailureKind::Find { got } => write!(
buf,
"expected {:?}, but found {:?}",
test.matches.get(0),
got
)?,
RegexTestFailureKind::FindIter { ref got } => write!(
buf,
"expected {:?}, but found {:?}",
test.matches, got
)?,
}
Ok(buf)
}
}
impl fmt::Display for Match {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "({}, {})", self.start, self.end)
}
}
impl fmt::Debug for Match {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "({}, {})", self.start, self.end)
}
}
fn nice_raw_bytes(bytes: &[u8]) -> String {
use std::str;
match str::from_utf8(bytes) {
Ok(s) => s.to_string(),
Err(_) => escape_bytes(bytes),
}
}
fn escape_bytes(bytes: &[u8]) -> String {
use std::ascii;
let escaped = bytes
.iter()
.flat_map(|&b| ascii::escape_default(b))
.collect::<Vec<u8>>();
String::from_utf8(escaped).unwrap()
}
fn hex_bytes(bytes: &[u8]) -> String {
bytes.iter().map(|&b| format!(r"\x{:02X}", b)).collect()
}
fn escape_default(s: &str) -> String {
s.chars().flat_map(|c| c.escape_default()).collect()
}
fn unescape_bytes(bytes: &[u8]) -> Vec<u8> {
use std::str;
use unescape::unescape;
unescape(&str::from_utf8(bytes).expect("all input must be valid UTF-8"))
}