From 850077d9c2ee4985b7cedddc2625f6c678e50b8b Mon Sep 17 00:00:00 2001 From: Ethan Pailes Date: Sun, 22 Apr 2018 14:26:38 -0400 Subject: [PATCH 1/8] Add function to determine if a regex is onepass. This patch adds the analysis function, `is_onepass` found in `analysis.rs`, which is required in order to determine if a particular regex can be executed using the onepass DFA. A regex is said to be onepass iff there are no non-deterministic splits in it. An example of a non-determinism in a regex is `/alex|apple/`. Here we can't know which branch to take because both of them start with `a`. A more subtle example is `/(?:alex)*apple/`. After every iteration of the Kleene star, we might branch back to `alex` or continue on to `apple`. --- regex-syntax/src/hir/interval.rs | 5 + regex-syntax/src/hir/mod.rs | 20 ++ src/analysis.rs | 547 +++++++++++++++++++++++++++++++ src/lib.rs | 1 + 4 files changed, 573 insertions(+) create mode 100644 src/analysis.rs diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index a7e70ef596..8c2f97ef14 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -309,6 +309,11 @@ impl IntervalSet { } true } + + /// Returns true iff this class is empty. + pub fn is_empty(&self) -> bool { + self.ranges.is_empty() + } } /// An iterator over intervals. diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 903e6085be..fb82dd9425 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -797,6 +797,16 @@ impl ClassUnicode { pub fn symmetric_difference(&mut self, other: &ClassUnicode) { self.set.symmetric_difference(&other.set); } + + /// Returns true iff this character class contains no characters. + /// + /// This should never be true for a character class which was + /// constructed by the regex parser, but a notion of character + /// class emptiness can be useful for code that wants to + /// programmatically generate character classes. + pub fn is_empty(&self) -> bool { + self.set.is_empty() + } } /// An iterator over all ranges in a Unicode character class. @@ -998,6 +1008,16 @@ impl ClassBytes { pub fn is_all_ascii(&self) -> bool { self.set.intervals().last().map_or(true, |r| r.end <= 0x7F) } + + /// Returns true iff this character class contains no characters. + /// + /// This should never be true for a character class which was + /// constructed by the regex parser, but a notion of character + /// class emptiness can be useful for code that wants to + /// programmatically generate character classes. + pub fn is_empty(&self) -> bool { + self.set.is_empty() + } } /// An iterator over all ranges in a byte character class. 
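As a quick illustration of the programmatic use case the new `is_empty` helpers are aimed at (a minimal sketch against the regex-syntax hir API used elsewhere in this patch; the parser itself never produces an empty class):

    use regex_syntax::hir::{ClassBytes, ClassBytesRange};

    fn main() {
        // A class built by hand starts out with no ranges at all.
        let mut class = ClassBytes::empty();
        assert!(class.is_empty());

        // Once a range is pushed, the class is no longer empty.
        class.push(ClassBytesRange::new(b'a', b'z'));
        assert!(!class.is_empty());
    }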
diff --git a/src/analysis.rs b/src/analysis.rs new file mode 100644 index 0000000000..2c7b5959ef --- /dev/null +++ b/src/analysis.rs @@ -0,0 +1,547 @@ +use syntax::hir::{ + Hir, HirKind, Literal, ClassBytes, ClassBytesRange, + Class, Visitor, RepetitionRange, RepetitionKind +}; +use syntax::hir; +use utf8_ranges::Utf8Sequences; + +/// True iff the given expression is one-pass +pub fn is_onepass(expr: &Hir) -> bool { + hir::visit(expr, IsOnePassVisitor::new()).unwrap() +} + +struct IsOnePassVisitor(bool); + +impl Visitor for IsOnePassVisitor { + type Output = bool; + type Err = (); + + fn finish(self) -> Result { + Ok(self.0) + } + + fn visit_pre(&mut self, hir: &Hir) -> Result<(), ()> { + if !self.0 { + return Ok(()) + } + + match hir.kind() { + &HirKind::Concat(ref es) => self.check_concat(&es), + &HirKind::Alternation(ref es) => self.check_alternation(&es), + &HirKind::Repetition(ref rep) => { + if fset_of(&*rep.hir).is_empty() { + self.0 = false; + } + } + &HirKind::Class(ref cls) => self.check_cls(cls), + _ => () + } + + Ok(()) + } +} + +impl IsOnePassVisitor { + fn new() -> Self { + IsOnePassVisitor(true) + } + + fn check_concat(&mut self, es: &[Hir]) { + let mut empty_run = vec![]; + + for e in NestedConcat::new(es) { + // TODO(ethan):yakshaving factor the determination of when + // a regex accepts_empty out into a separate function, + // so that we don't compute the whole first set when we + // don't need to. + let fset = fset_of(e); + let is_rep = match e.kind() { + &HirKind::Repetition(_) => true, + _ => false, + }; + + empty_run.push(e); + if !(fset.accepts_empty || is_rep) { + // this is the last one in the run + break; + } + } + + if empty_run.len() > 0 { + self.0 = self.0 && !fsets_clash(&empty_run); + } + } + + fn check_alternation(&mut self, es: &[Hir]) { + self.0 = self.0 && !fsets_clash(&es.iter().collect::>()); + } + + // Unicode classes are really big alternatives from the byte + // oriented point of view. + // + // This function translates a unicode class into the + // byte space and checks for intersecting first sets. + fn check_cls(&mut self, cls: &Class) { + match cls { + &Class::Unicode(ref ucls) => { + let mut seen_char: [bool; 256] = [false; 256]; + + for cr in ucls.iter() { + for br in Utf8Sequences::new(cr.start(), cr.end()) { + let first = br.as_slice()[0]; + for b in first.start..(first.end+1) { + if seen_char[b as usize] { + self.0 = false; + return; + } + seen_char[b as usize] = true; + } + } + } + } + _ => {} // FALLTHROUGH + } + } + +} + +/// Check if a list of first sets is incompatable. +/// +/// O(n^2), but n will usually be quite small. +fn fsets_clash(es: &[&Hir]) -> bool { + for (i, e1) in es.iter().enumerate() { + for (j, e2) in es.iter().enumerate() { + if i != j { + let mut fset = fset_of(e1); + let fset2 = fset_of(e2); + + // For the regex /a|()+/, we don't have a way to + // differentiate the branches, so we are not onepass. + // + // We might be able to loosen this restriction by + // considering the expression after the alternative + // if there is one. + if fset.is_empty() || fset2.is_empty() { + return true; + } + + fset.intersect(&fset2); + if ! fset.is_empty() { + return true; + } + } + } + } + false +} + + +/// Compute the first set of a given regular expression. +/// +/// The first set of a regular expression is the set of all characters +/// which might begin it. This is a less general version of the +/// notion of a regular expression preview (the first set can be +/// thought of as the 1-preview of a regular expression). 
+/// +/// Note that first sets are byte-oriented because the DFA is +/// byte oriented. This means an expression like /Δ|δ/ is actually not +/// one-pass, even though there is clearly no non-determinism inherent +/// to the regex at a unicode code point level (big delta and little +/// delta start with the same byte). +fn fset_of(expr: &Hir) -> FirstSet { + fn singleton(b: u8) -> FirstSet { + let mut f = FirstSet::empty(); + f.push_bytes(ClassBytesRange::new(b, b)); + f + } + + match expr.kind() { + &HirKind::Empty => FirstSet::epsilon(), + &HirKind::Literal(ref lit) => { + match lit { + &Literal::Unicode(c) => singleton(first_byte(c)), + &Literal::Byte(b) => singleton(b), + } + } + &HirKind::Class(ref class) => { + match class { + &Class::Unicode(ref c) => { + // Get all the bytes which might begin this unicode + // class. + let mut cb = FirstSet::empty(); + for cr in c.iter() { + for br in Utf8Sequences::new(cr.start(), cr.end()) { + let first = br.as_slice()[0]; + cb.push_bytes( + ClassBytesRange::new(first.start, first.end)); + } + } + cb + } + &Class::Bytes(ref b) => + FirstSet::new(b.iter().map(|x| *x), false), + } + } + + // When an empty look (Anchor or WordBoundary) is at the start of + // a concatenation, we conservatively assume that the assertion + // will pass, so we just drop it. Then we can only get to this + // point if we are dealing with some sort of naked empty look. + // For now we just do the most conservative thing and say + // that such an emptylook could potentially match on any character. + &HirKind::Anchor(_) | &HirKind::WordBoundary(_) => FirstSet::anychar(), + + &HirKind::Repetition(ref rep) => { + let mut f = fset_of(&*rep.hir); + match rep.kind { + RepetitionKind::ZeroOrOne => f.accepts_empty = true, + RepetitionKind::ZeroOrMore => f.accepts_empty = true, + RepetitionKind::OneOrMore => {}, + RepetitionKind::Range(ref range) => { + match range { + &RepetitionRange::Exactly(0) + | &RepetitionRange::AtLeast(0) + | &RepetitionRange::Bounded(0, _) => + f.accepts_empty = true, + _ => {} + } + } + } + f + }, + &HirKind::Group(ref group) => fset_of(&group.hir), + + // The most involved case. We need to strip leading empty-looks + // as well as take the union of the first sets of the first n+1 + // expressions where n is the number of leading repetitions. + &HirKind::Concat(ref es) => { + let mut fset = FirstSet::empty(); + for (i, e) in es.iter().enumerate() { + match e.kind() { + &HirKind::Anchor(_) | &HirKind::WordBoundary(_) => { + // Ignore any leading emptylooks, but any in tail + // position have to be accounted for. + if i == es.len() - 1 { + fset.union(&FirstSet::anychar()); + } + } + _ => { + let inner_fset = fset_of(e); + fset.union(&inner_fset); + + if !inner_fset.accepts_empty() { + // We can stop accumulating after we stop seeing + // first sets which contain epsilon. + // Also, a contatination which terminated by + // one or more expressions which do not accept + // epsilon itself does not acceept epsilon. + fset.accepts_empty = false; + break; + } + } + } + } + fset + } + &HirKind::Alternation(ref es) => { + let mut fset = FirstSet::empty(); + for e in es { + fset.union(&fset_of(e)); + } + fset + } + } +} + +/// The first byte of a unicode code point. +/// +/// We only ever care about the first byte of a particular character, +/// because the onepass DFA is implemented in the byte space, not the +/// character space. This means, for example, that a branch between +/// lowercase delta and uppercase delta is actually non-deterministic. 
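// Illustrative aside, not part of the patch: as a concrete instance of the
// byte-level view described above, 'Δ' (U+0394) encodes to the UTF-8 bytes
// [0xCE, 0x94] and 'δ' (U+03B4) to [0xCE, 0xB4]. Both share the lead byte
// 0xCE, so first_byte returns 0xCE for either character and /Δ|δ/ is
// rejected as onepass even though the two code points differ.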
+fn first_byte(c: char) -> u8 { + let mut b: [u8; 4] = [0; 4]; + c.encode_utf8(&mut b); + b[0] +} + +/// A representation of all the possible ways a word in the language +/// of a regex could begin. ClassBytes has no way to express the empty +/// string, so we add an extra flag to indicate if a FirstSet includes +/// epsilon. Put in a more theoretical way all firstsets are subsets of +/// SIGMA `union` { epsilon }. +#[derive(Debug, PartialEq, Eq)] +struct FirstSet { + bytes: ClassBytes, + pub accepts_empty: bool, +} + +impl FirstSet { + fn empty() -> Self { + FirstSet { + bytes: ClassBytes::empty(), + accepts_empty: false, + } + } + + pub fn new(ranges: I, accepts_empty: bool) -> Self + where I: IntoIterator + { + FirstSet { + bytes: ClassBytes::new(ranges), + accepts_empty: accepts_empty, + } + } + + fn anychar() -> FirstSet { + let mut f = FirstSet::empty(); + f.push_bytes(ClassBytesRange::new(b'\0', b'\xFF')); + f + } + + fn epsilon() -> FirstSet { + FirstSet { + bytes: ClassBytes::empty(), + accepts_empty: true, + } + } + + fn push_bytes(&mut self, byte_range: ClassBytesRange) { + self.bytes.push(byte_range); + } + + fn union(&mut self, other: &FirstSet) { + self.bytes.union(&other.bytes); + self.accepts_empty = self.accepts_empty || other.accepts_empty; + } + + fn intersect(&mut self, other: &FirstSet) { + self.bytes.intersect(&other.bytes); + self.accepts_empty = self.accepts_empty && other.accepts_empty; + } + + fn is_empty(&self) -> bool { + self.bytes.is_empty() && !self.accepts_empty + } + + fn accepts_empty(&self) -> bool { + self.accepts_empty + } +} + +/// An iterator over a concatenation of expressions which +/// drills down into other embedded concatenations. +struct NestedConcat<'a>(Vec<(&'a [Hir], usize)>); +impl<'a> NestedConcat<'a> { + fn new(es: &'a [Hir]) -> Self { + NestedConcat(vec![(es, 0)]) + } +} +impl<'a> Iterator for NestedConcat<'a> { + type Item = &'a Hir; + + fn next(&mut self) -> Option<&'a Hir> { + if self.0.len() == 0 { + return None; + } + + let tip = self.0.len() - 1; + let (es, idx) = self.0[tip]; + + if idx >= es.len() { + self.0.pop(); + return self.next(); + } + + self.0[tip].1 += 1; + + match es[idx].kind() { + &HirKind::Concat(ref es) => { + self.0.push((es, 0)); + self.next() + } + _ => Some(&es[idx]), + } + } +} + +#[cfg(test)] +mod tests { + use syntax::Parser; + use syntax::hir::Hir; + use super::*; + + fn is_intersecting_fset(e1: &Hir, e2: &Hir) -> bool { + let mut fset = fset_of(e1); + fset.intersect(&fset_of(e2)); + ! 
fset.is_empty() + } + + // + // First Set intersection smoke tests + // + + #[test] + fn fset_lit() { + let e1 = Parser::new().parse("a").unwrap(); + let e2 = Parser::new().parse("a").unwrap(); + let e3 = Parser::new().parse("b").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_class() { + let e1 = Parser::new().parse("[a]").unwrap(); + let e2 = Parser::new().parse("[a]").unwrap(); + let e3 = Parser::new().parse("[b]").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_class_n() { + let e1 = Parser::new().parse("[xamn]").unwrap(); + let e2 = Parser::new().parse("[rlwa]").unwrap(); + let e3 = Parser::new().parse("[bcq]").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_alt() { + let e1 = Parser::new().parse("ab|bc|ad").unwrap(); + let e2 = Parser::new().parse("yyyy|am|zz").unwrap(); + let e3 = Parser::new().parse("cc|ww").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_group() { + let e1 = Parser::new().parse("(?:ab)").unwrap(); + let e2 = Parser::new().parse("(?:aq)").unwrap(); + let e3 = Parser::new().parse("(?:m)").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_concat() { + let e1 = Parser::new().parse("aa(?:nb)").unwrap(); + let e2 = Parser::new().parse("aa(?:rq)").unwrap(); + let e3 = Parser::new().parse("bb(?:m)").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_word_boundary_dropped() { + let e1 = Parser::new().parse(r"aa").unwrap(); + let e2 = Parser::new().parse(r"\baa").unwrap(); + let e3 = Parser::new().parse(r"\bbb").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_word_boundary_all() { + let e1 = Parser::new().parse(r"aa").unwrap(); + let e2 = Parser::new().parse(r"\b").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + } + + #[test] + fn fset_not_word_boundary_dropped() { + let e1 = Parser::new().parse(r"aa").unwrap(); + let e2 = Parser::new().parse(r"\Baa").unwrap(); + let e3 = Parser::new().parse(r"\Bbb").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_not_word_boundary_all() { + let e1 = Parser::new().parse(r"aa").unwrap(); + let e2 = Parser::new().parse(r"\B").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + } + + #[test] + fn fset_start_anchor_dropped() { + let e1 = Parser::new().parse(r"aa").unwrap(); + let e2 = Parser::new().parse(r"^aa").unwrap(); + let e3 = Parser::new().parse(r"^bb").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_terminal_emptylook_all() { + let e = Parser::new().parse(r"a*\b").unwrap(); + + let mut total_accept = FirstSet::anychar(); + total_accept.accepts_empty = true; + + assert_eq!(total_accept, fset_of(&e)); + } + + #[test] + fn fset_empty_alt() { + let e1 = Parser::new().parse(r"(?:a|())b").unwrap(); + let e2 = Parser::new().parse(r"b").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + } + + // + // onepass smoke tests + // + + // This test is pulled right from some of Russ Cox's + // comments on onepass regex. 
+ // + // Note that Russ Cox's other example of a onepass regex + // (r"(\d+)-(\d+)") is actually not onepass for us because + // there is byte-level nondeterminism in the \d character + // class, and we care about things in the byte space rather + // than the character space. If you do a onepass engine at + // the character level, Cox's example is indeed onepass. + #[test] + fn is_onepass_smoke_test1() { + let e1 = Parser::new().parse(r"([^x]*)x(.*)").unwrap(); + let e2 = Parser::new().parse(r"(.*)x(.*)").unwrap(); + + assert!(is_onepass(&e1)); + assert!(!is_onepass(&e2)); + } + + #[test] + fn is_onepass_empty_alt() { + let e1 = Parser::new().parse(r"(a|())b").unwrap(); + let e2 = Parser::new().parse(r"(a|())a").unwrap(); + + assert!(is_onepass(&e1)); + assert!(!is_onepass(&e2)); + } + + #[test] + fn is_onepass_rep() { + let e1 = Parser::new().parse(r"a+a").unwrap(); + let e2 = Parser::new().parse(r"a*a").unwrap(); + + assert!(!is_onepass(&e1)); + assert!(!is_onepass(&e2)); + } +} diff --git a/src/lib.rs b/src/lib.rs index 9ca156084a..7d179dc66a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -668,6 +668,7 @@ mod re_unicode; mod sparse; #[cfg(any(regex_runtime_teddy_ssse3, regex_runtime_teddy_avx2))] mod vector; +mod analysis; /// The `internal` module exists to support suspicious activity, such as /// testing different matching engines and supporting the `regex-debug` CLI From 36698811e89c013b0b4459e8b021383c1f3d9c89 Mon Sep 17 00:00:00 2001 From: Ethan Pailes Date: Sun, 22 Apr 2018 19:50:52 -0400 Subject: [PATCH 2/8] Add a onepass DFA. This patch adds a onepass matcher, which is a DFA that has all the abilities of an NFA! There are lots of expressions that a onepass matcher can't handle, namely those cases where a regex contains non-determinism. The general approach we take is as follows: 1. Check if a regex is onepass using `src/onepass.rs::is_onepass`. 2. Compile a new regex program using the compiler with the bytes flag set. 3. Compile a onepass DFA from the program produced in step 2. We will roughly map each instruction to a state in the DFA, though instructions like `split` don't get states. a. Make a new transition table for the first instruction. b. For each child of the first instruction: - If it is a bytes instruction, add a transition to the table for every byte class in the instruction. - If it is an instruction which consumes zero input (like `EmptyLook` or `Save`), emit a job to a DAG asking to forward the first instruction state to the state for the non-consuming instruction. - Push the child instruction to a queue of instructions to process. c. Peel off an instruction from the queue and go back to step a, processing the instruction as if it was the first instruction. If the queue is empty, continue with step d. d. Topologically sort the forwarding jobs, and shuffle the transitions from the forwarding targets to the forwarding sources in topological order. e. Bake the intermediary transition tables down into a single flat vector. States which require some action (`EmptyLook` and `Save`) get an extra entry in the baked transition table that contains metadata instructing them on how to perform their actions. 4. Wait for the user to give us some input. 5. Execute the DFA: - The inner loop is basically: while at < text.len(): state_ptr = baked_table[text[at]] at += 1 - There is a lot of window dressing to handle special states. The idea of a onepass matcher comes from Russ Cox and his RE2 library. 
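To make step 5 a little more concrete, here is a minimal sketch of that inner loop with the byte-class indirection written out (names are illustrative, not the actual implementation; the real loop in src/onepass.rs also checks the special-state flags on every transition):

    type StatePtr = u32;

    fn run(table: &[StatePtr], byte_classes: &[u8; 256],
           start: StatePtr, text: &[u8]) -> StatePtr {
        // A state pointer is just the offset of that state's row of
        // byte-class transitions in the flat table.
        let mut state = start;
        for &byte in text {
            let class = byte_classes[byte as usize] as usize;
            state = table[state as usize + class];
            // The real loop masks out STATE_SPECIAL here to handle saves,
            // empty-look assertions, match states, and dead states.
        }
        state
    }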
I haven't been as good about reading the RE2 source as I should have, but I've gotten the impression that the RE2 onepass matcher is more in the spirit of an NFA simulation without threads than a DFA. --- Cargo.toml | 5 + src/analysis.rs | 130 +++-- src/backtrack.rs | 2 +- src/exec.rs | 125 ++++- src/input.rs | 16 +- src/lib.rs | 1 + src/onepass.rs | 1188 +++++++++++++++++++++++++++++++++++++++++ src/pikevm.rs | 2 +- tests/onepass_unit.rs | 26 + tests/test_onepass.rs | 64 +++ 10 files changed, 1488 insertions(+), 71 deletions(-) create mode 100644 src/onepass.rs create mode 100644 tests/onepass_unit.rs create mode 100644 tests/test_onepass.rs diff --git a/Cargo.toml b/Cargo.toml index 2a3376e39a..134c04e532 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -114,6 +114,11 @@ name = "backtrack-bytes" path = "tests/test_crates_regex.rs" name = "crates-regex" +# Run the test suite on the onepass engine. +[[test]] +path = "tests/test_onepass.rs" +name = "onepass" + [profile.release] debug = true diff --git a/src/analysis.rs b/src/analysis.rs index 2c7b5959ef..8e5f05bb99 100644 --- a/src/analysis.rs +++ b/src/analysis.rs @@ -50,20 +50,15 @@ impl IsOnePassVisitor { let mut empty_run = vec![]; for e in NestedConcat::new(es) { - // TODO(ethan):yakshaving factor the determination of when - // a regex accepts_empty out into a separate function, - // so that we don't compute the whole first set when we - // don't need to. - let fset = fset_of(e); let is_rep = match e.kind() { &HirKind::Repetition(_) => true, _ => false, }; empty_run.push(e); - if !(fset.accepts_empty || is_rep) { - // this is the last one in the run - break; + if !(accepts_empty(e) || is_rep) { + self.0 = self.0 && !fsets_clash(&empty_run); + empty_run.clear(); } } @@ -76,7 +71,7 @@ impl IsOnePassVisitor { self.0 = self.0 && !fsets_clash(&es.iter().collect::>()); } - // Unicode classes are really big alternatives from the byte + // Unicode classes are really just big alternatives from the byte // oriented point of view. // // This function translates a unicode class into the @@ -99,7 +94,7 @@ impl IsOnePassVisitor { } } } - _ => {} // FALLTHROUGH + _ => {} } } @@ -115,16 +110,6 @@ fn fsets_clash(es: &[&Hir]) -> bool { let mut fset = fset_of(e1); let fset2 = fset_of(e2); - // For the regex /a|()+/, we don't have a way to - // differentiate the branches, so we are not onepass. - // - // We might be able to loosen this restriction by - // considering the expression after the alternative - // if there is one. - if fset.is_empty() || fset2.is_empty() { - return true; - } - fset.intersect(&fset2); if ! fset.is_empty() { return true; @@ -138,14 +123,14 @@ fn fsets_clash(es: &[&Hir]) -> bool { /// Compute the first set of a given regular expression. /// -/// The first set of a regular expression is the set of all characters +/// The first set of a regular expression is the set of all bytes /// which might begin it. This is a less general version of the /// notion of a regular expression preview (the first set can be /// thought of as the 1-preview of a regular expression). /// /// Note that first sets are byte-oriented because the DFA is /// byte oriented. This means an expression like /Δ|δ/ is actually not -/// one-pass, even though there is clearly no non-determinism inherent +/// onepass, even though there is clearly no non-determinism inherent /// to the regex at a unicode code point level (big delta and little /// delta start with the same byte). 
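// A couple of concrete first sets, for orientation (illustrative aside,
// not part of the patch): fset_of(/ab|cd/) is {a, c}; fset_of(/a*b/) is
// {a, b}, because the leading repetition may match empty and so `b` can
// also begin a match; and fset_of(/a*/) is {a} with accepts_empty set.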
fn fset_of(expr: &Hir) -> FirstSet { @@ -155,7 +140,9 @@ fn fset_of(expr: &Hir) -> FirstSet { f } - match expr.kind() { + // First compute the set of characters that might begin + // the expression (ignoring epsilon for now). + let mut f_char_set = match expr.kind() { &HirKind::Empty => FirstSet::epsilon(), &HirKind::Literal(ref lit) => { match lit { @@ -191,29 +178,13 @@ fn fset_of(expr: &Hir) -> FirstSet { // that such an emptylook could potentially match on any character. &HirKind::Anchor(_) | &HirKind::WordBoundary(_) => FirstSet::anychar(), - &HirKind::Repetition(ref rep) => { - let mut f = fset_of(&*rep.hir); - match rep.kind { - RepetitionKind::ZeroOrOne => f.accepts_empty = true, - RepetitionKind::ZeroOrMore => f.accepts_empty = true, - RepetitionKind::OneOrMore => {}, - RepetitionKind::Range(ref range) => { - match range { - &RepetitionRange::Exactly(0) - | &RepetitionRange::AtLeast(0) - | &RepetitionRange::Bounded(0, _) => - f.accepts_empty = true, - _ => {} - } - } - } - f - }, + &HirKind::Repetition(ref rep) => fset_of(&rep.hir), &HirKind::Group(ref group) => fset_of(&group.hir), // The most involved case. We need to strip leading empty-looks // as well as take the union of the first sets of the first n+1 - // expressions where n is the number of leading repetitions. + // expressions where n is the number of leading expressions which + // accept the empty string. &HirKind::Concat(ref es) => { let mut fset = FirstSet::empty(); for (i, e) in es.iter().enumerate() { @@ -229,13 +200,9 @@ fn fset_of(expr: &Hir) -> FirstSet { let inner_fset = fset_of(e); fset.union(&inner_fset); - if !inner_fset.accepts_empty() { + if !accepts_empty(e) { // We can stop accumulating after we stop seeing // first sets which contain epsilon. - // Also, a contatination which terminated by - // one or more expressions which do not accept - // epsilon itself does not acceept epsilon. - fset.accepts_empty = false; break; } } @@ -250,13 +217,68 @@ fn fset_of(expr: &Hir) -> FirstSet { } fset } + }; + + f_char_set.accepts_empty = accepts_empty(expr); + f_char_set +} + +fn accepts_empty(expr: &Hir) -> bool { + match expr.kind() { + &HirKind::Empty => true, + &HirKind::Literal(_) => false, + &HirKind::Class(_) => false, + + // A naked empty look is a pretty weird thing because we + // normally strip them from the beginning of concatinations. + // We are just going to treat them like `.` + &HirKind::Anchor(_) | &HirKind::WordBoundary(_) => false, + + &HirKind::Repetition(ref rep) => { + match rep.kind { + RepetitionKind::ZeroOrOne => true, + RepetitionKind::ZeroOrMore => true, + RepetitionKind::OneOrMore => accepts_empty(&rep.hir), + RepetitionKind::Range(ref range) => { + match range { + &RepetitionRange::Exactly(0) + | &RepetitionRange::AtLeast(0) + | &RepetitionRange::Bounded(0, _) => true, + _ => accepts_empty(&rep.hir), + } + } + } + } + + &HirKind::Group(ref group) => accepts_empty(&group.hir), + + &HirKind::Concat(ref es) => { + let mut accepts: bool = true; + for e in es.iter() { + match e.kind() { + &HirKind::Anchor(_) | &HirKind::WordBoundary(_) => { + // Ignore any leading emptylooks. + } + _ => { + accepts = accepts && accepts_empty(&e); + } + } + + if !accepts { + break; + } + } + accepts + } + + &HirKind::Alternation(ref es) => es.iter().any(accepts_empty) } } /// The first byte of a unicode code point. 
/// -/// We only ever care about the first byte of a particular character, -/// because the onepass DFA is implemented in the byte space, not the +/// We only ever care about the first byte of a particular character +/// because the onepass DFA is implemented in the byte space not the /// character space. This means, for example, that a branch between /// lowercase delta and uppercase delta is actually non-deterministic. fn first_byte(c: char) -> u8 { @@ -323,10 +345,6 @@ impl FirstSet { fn is_empty(&self) -> bool { self.bytes.is_empty() && !self.accepts_empty } - - fn accepts_empty(&self) -> bool { - self.accepts_empty - } } /// An iterator over a concatenation of expressions which @@ -544,4 +562,10 @@ mod tests { assert!(!is_onepass(&e1)); assert!(!is_onepass(&e2)); } + + #[test] + fn is_onepass_clash_in_middle_of_concat() { + let e = Parser::new().parse(r"ab?b").unwrap(); + assert!(!is_onepass(&e)); + } } diff --git a/src/backtrack.rs b/src/backtrack.rs index 6e71e2c2f3..e49d724ed0 100644 --- a/src/backtrack.rs +++ b/src/backtrack.rs @@ -245,7 +245,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { ip = inst.goto1; } EmptyLook(ref inst) => { - if self.input.is_empty_match(at, inst) { + if self.input.is_empty_match(at, inst.look) { ip = inst.goto; } else { return false; diff --git a/src/exec.rs b/src/exec.rs index 578289aa5c..ef2cc3db41 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -32,6 +32,7 @@ use re_set; use re_trait::{RegularExpression, Slot, Locations}; use re_unicode; use utf8::next_utf8; +use onepass::{OnePass, OnePassCompiler}; /// `Exec` manages the execution of a regular expression. /// @@ -81,6 +82,8 @@ struct ExecReadOnly { /// preceding `.*?`). This is used by the DFA to find the starting location /// of matches. dfa_reverse: Program, + /// A compiled onepass DFA. Always byte based. + onepass: Option, /// A set of suffix literals extracted from the regex. /// /// Prefix literals are stored on the `Program`, since they are used inside @@ -177,6 +180,16 @@ impl ExecBuilder { self } + /// Asks the matching engine to use a onepass DFA if possible. + /// + /// This overrides whatever was previously set via the `automatic`, + /// `nfa`, or `bounded_backtracking` methods. + pub fn onepass(mut self) -> Self { + self.match_type = Some(MatchType::OnePassDfa(Box::new(None))); + self + } + + /// Compiles byte based programs for use with the NFA matching engines. /// /// By default, the NFA engines match on Unicode scalar values. 
They can @@ -286,6 +299,7 @@ impl ExecBuilder { nfa: Program::new(), dfa: Program::new(), dfa_reverse: Program::new(), + onepass: None, suffixes: LiteralSearcher::empty(), match_type: MatchType::Nothing, }); @@ -320,10 +334,14 @@ impl ExecBuilder { dfa_reverse.dfa_size_limit = self.options.dfa_size_limit; let mut ro = ExecReadOnly { - res: self.options.pats, nfa: nfa, dfa: dfa, dfa_reverse: dfa_reverse, + onepass: OnePassCompiler::new( + &parsed.exprs, + &self.options, + self.only_utf8).and_then(|c| c.compile()).ok(), + res: self.options.pats, suffixes: LiteralSearcher::suffixes(suffixes), match_type: MatchType::Nothing, }; @@ -421,6 +439,21 @@ impl<'c> RegularExpression for ExecNoSync<'c> { dfa::Result::Quit => self.shortest_nfa(text, start), } } + MatchType::OnePassDfa(_) => { + debug_assert!(self.ro.onepass.is_some()); + + match self.ro.onepass { + Some(ref op) => { + let mut slots = vec![None; self.slots_len()]; + if op.exec(&mut slots, text, start) { + slots[0] + } else { + None + } + } + None => unreachable!(), + } + } MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start), MatchType::Nothing => None, } @@ -469,6 +502,17 @@ impl<'c> RegularExpression for ExecNoSync<'c> { dfa::Result::Quit => self.match_nfa(text, start), } } + MatchType::OnePassDfa(_) => { + debug_assert!(self.ro.onepass.is_some()); + + match self.ro.onepass { + Some(ref op) => { + let mut slots = vec![None; self.slots_len()]; + op.exec(&mut slots, text, start) + } + None => unreachable!(), + } + } MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start), MatchType::Nothing => false, } @@ -512,6 +556,21 @@ impl<'c> RegularExpression for ExecNoSync<'c> { } } } + MatchType::OnePassDfa(_) => { + debug_assert!(self.ro.onepass.is_some()); + + match self.ro.onepass { + Some(ref op) => { + let mut slots = vec![None; self.slots_len()]; + if op.exec(&mut slots, text, start) { + slots[0].and_then(|s1| slots[1].map(|s2| (s1, s2))) + } else { + None + } + } + None => unreachable!(), + } + } MatchType::Nfa(ty) => self.find_nfa(ty, text, start), MatchType::Nothing => None, MatchType::DfaMany => { @@ -534,7 +593,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> { text: &[u8], start: usize, ) -> Option<(usize, usize)> { - let slots = locs.as_slots(); + let mut slots = locs.as_slots(); for slot in slots.iter_mut() { *slot = None; } @@ -591,6 +650,20 @@ impl<'c> RegularExpression for ExecNoSync<'c> { dfa::Result::Quit => self.captures_nfa(slots, text, start), } } + MatchType::OnePassDfa(_) => { + debug_assert!(self.ro.onepass.is_some()); + + match self.ro.onepass { + Some(ref op) => { + if op.exec(&mut slots, text, start) { + slots[0].and_then(|s1| slots[1].map(|s2| (s1, s2))) + } else { + None + } + } + None => unreachable!(), + } + } MatchType::Nfa(ty) => { self.captures_nfa_type(ty, slots, text, start) } @@ -1012,18 +1085,31 @@ impl<'c> ExecNoSync<'c> { matches: &mut [bool], text: &[u8], start: usize, + ) -> bool { + self.many_matches_at_match_type( + matches, text, start, &self.ro.match_type) + } + + /// Finds which regular expressions match the given text with a + /// specific match type. 
+ fn many_matches_at_match_type( + &self, + matches: &mut [bool], + text: &[u8], + start: usize, + match_type: &MatchType, ) -> bool { use self::MatchType::*; if !self.is_anchor_end_match(text) { return false; } - match self.ro.match_type { - Literal(ty) => { + match match_type { + &Literal(ty) => { debug_assert_eq!(matches.len(), 1); matches[0] = self.find_literals(ty, text, start).is_some(); matches[0] } - Dfa | DfaAnchoredReverse | DfaSuffix | DfaMany => { + &Dfa | &DfaAnchoredReverse | &DfaSuffix | &DfaMany => { match dfa::Fsm::forward_many( &self.ro.dfa, self.cache, @@ -1044,8 +1130,17 @@ impl<'c> ExecNoSync<'c> { } } } - Nfa(ty) => self.exec_nfa(ty, matches, &mut [], false, text, start), - Nothing => false, + &OnePassDfa(ref fallback) => { + match **fallback { + Some(ref fb) => + self.many_matches_at_match_type( + matches, text, start, fb), + None => unreachable!( + "BUG: we must have a real fallback by now."), + } + } + &Nfa(ty) => self.exec_nfa(ty, matches, &mut [], false, text, start), + &Nothing => false, } } @@ -1141,6 +1236,17 @@ impl Clone for Exec { impl ExecReadOnly { fn choose_match_type(&self, hint: Option) -> MatchType { use self::MatchType::*; + // If we have been asked to use the onepass DFA, we still need + // to choose a fallback in the usual way. + if let Some(OnePassDfa(_)) = hint { + let fallback = self.choose_match_type(None); + if self.onepass.is_some() { + return OnePassDfa(Box::new(Some(fallback))); + } else { + return fallback; + } + } + if let Some(Nfa(_)) = hint { return hint.unwrap(); } @@ -1222,11 +1328,14 @@ impl ExecReadOnly { } } -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Debug)] enum MatchType { /// A single or multiple literal search. This is only used when the regex /// can be decomposed into unambiguous literal search. Literal(MatchLiteralType), + /// A onepass DFA search. If a onepass search is impossible, we just + /// fall back to an automatically chosen search. + OnePassDfa(Box>), /// A normal DFA search. Dfa, /// A reverse DFA search starting from the end of a haystack. diff --git a/src/input.rs b/src/input.rs index 56097bd562..158e26e930 100644 --- a/src/input.rs +++ b/src/input.rs @@ -17,7 +17,7 @@ use std::u32; use syntax; use literal::LiteralSearcher; -use prog::InstEmptyLook; +use prog::EmptyLook; use utf8::{decode_utf8, decode_last_utf8}; /// Represents a location in the input. @@ -92,7 +92,7 @@ pub trait Input { /// Return true if the given empty width instruction matches at the /// input position given. - fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool; + fn is_empty_match(&self, at: InputAt, look: EmptyLook) -> bool; /// Scan the input for a matching prefix. 
fn prefix_at( @@ -118,8 +118,8 @@ impl<'a, T: Input> Input for &'a T { fn previous_char(&self, at: InputAt) -> Char { (**self).previous_char(at) } - fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { - (**self).is_empty_match(at, empty) + fn is_empty_match(&self, at: InputAt, look: EmptyLook) -> bool { + (**self).is_empty_match(at, look) } fn prefix_at( @@ -173,9 +173,9 @@ impl<'t> Input for CharInput<'t> { decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() } - fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + fn is_empty_match(&self, at: InputAt, look: EmptyLook) -> bool { use prog::EmptyLook::*; - match empty.look { + match look { StartLine => { let c = self.previous_char(at); at.pos() == 0 || c == '\n' @@ -265,9 +265,9 @@ impl<'t> Input for ByteInput<'t> { decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() } - fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + fn is_empty_match(&self, at: InputAt, look: EmptyLook) -> bool { use prog::EmptyLook::*; - match empty.look { + match look { StartLine => { let c = self.previous_char(at); at.pos() == 0 || c == '\n' diff --git a/src/lib.rs b/src/lib.rs index 7d179dc66a..4a901f983b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -669,6 +669,7 @@ mod sparse; #[cfg(any(regex_runtime_teddy_ssse3, regex_runtime_teddy_avx2))] mod vector; mod analysis; +mod onepass; /// The `internal` module exists to support suspicious activity, such as /// testing different matching engines and supporting the `regex-debug` CLI diff --git a/src/onepass.rs b/src/onepass.rs new file mode 100644 index 0000000000..f6a7c68a70 --- /dev/null +++ b/src/onepass.rs @@ -0,0 +1,1188 @@ +/*! +A onepass regex tells us that there are no non-deterministic branches +in the regex which means that we can use a DFA to implement capture +groups without resorting to magic too deep! The main advantage of +a onepass DFA are: + +1. The potential exponential blowup from converting an NFA to a DFA + via the powerset construction goes away. The exponential blowup + comes from compound states, which are a result of non-determinism. + This means that we don't need to muck about with dynamic DFA + construction or caching. + +2. There are no compound states so, we can implement captures with + a DFA. The reason that a general DFA can't handle captures is + that you don't know what to do when the DFA reaches a compound + state which includes one capturing state, but also other states. + This means that the DFA is potentially in either a capturing + NFA state or some other NFA state. For a onepass regex there + will never be a compound state for the DFA, so captures can + be implemented right in the DFA. +*/ + +use std::fmt; +use std::collections::{HashMap, HashSet}; + +use prog::{Program, Inst, EmptyLook}; +use literal::LiteralSearcher; +use re_trait::Slot; +use input::{ByteInput, Input}; +use analysis; +use compile::Compiler; +use syntax::hir::Hir; +use re_builder::RegexOptions; +use Error; + +// flip to true for debugging +const TRACE: bool = false; +macro_rules! trace { + ($($tts:tt)*) => { + if TRACE { + println!($($tts)*); + } + } +} + +/// A OnePass DFA. +#[derive(Debug, Clone)] +pub struct OnePass { + /// The flattened transition table of all of the different + /// DFA states. + table: Vec, + /// The prefixes. + prefixes: LiteralSearcher, + /// The stride. + num_byte_classes: usize, + /// The byte classes of this regex. + byte_classes: Vec, + /// The starting state. 
+ start_state: StatePtr, + /// True if the regex is anchored at the start. + is_anchored_start: bool, + /// True if the regex is anchored at the end. + is_anchored_end: bool, + /// True if this regex ought to only accept utf8 strings. + only_utf8: bool, +} + +impl fmt::Display for OnePass { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + writeln!(f, "is_anchored_start: {}", self.is_anchored_start)?; + writeln!(f, "is_anchored_end: {}", self.is_anchored_end)?; + writeln!(f, "START: {}\n", st_str(self.start_state))?; + + for i in 0..(self.table.len() / self.num_byte_classes) { + let addr = format!("{:x}", i * self.num_byte_classes); + let trans_start = i * self.num_byte_classes; + let trans_end = (i+1) * self.num_byte_classes; + let trans = &self.table[trans_start..trans_end]; + Self::fmt_line(addr, trans, f)?; + } + + Ok(()) + } +} + +impl OnePass { + /// Execute the one-pass DFA, populating the list of capture slots + /// as you go. + pub fn exec(&self, slots: &mut [Slot], text: &[u8], mut at: usize) -> bool { + trace!("execing on '{:?}'\n{}", text, self); + + if self.is_anchored_start { + at == 0 && self.exec_(text, at, slots) + } else { + // We are forced to just try every starting index. + // This is noticeably more painful than it is for a + // standard DFA because we must clear the capture slots. + // + // To try to cut down on the cost of zeroing the capture + // groups, we implement a very simple FSM that just + // repeatedly tests to see if the very first DFA + // state could make progress. + loop { + trace!("OnePass::exec Trying to match at={} text.len()={}", + at, text.len()); + if self.exec_(text, at, slots) { + return true; + } + + for s in slots.iter_mut() { + *s = None; + } + + at = self.exec_prefix(text, at + 1); + if at > text.len() { + return false; + } + } + } + } + + /// Given the input and a position in the input, return next + /// position where a match will actually make one character + /// of progress. + fn exec_prefix(&self, text: &[u8], mut at: usize) -> usize { + trace!("::exec_prefix at={}", at); + if at < text.len() && !self.prefixes.is_empty() { + at = at + self.prefixes + .find(&text[at..]) + .map(|(s, _)| s) + .unwrap_or(text.len()); + } else { + while at < text.len() { + let byte_class = self.byte_classes[text[at] as usize] as usize; + if self.table[byte_class] != STATE_DEAD { + break; + } + at += 1; + } + } + + trace!("::exec_prefix next-chance={}", at); + + at + } + + /// Execute the one-pass DFA, populating the list of capture slots + /// as you go. + /// + /// Returns true if there is a match and false otherwise. + #[inline] + fn exec_( + &self, + text: &[u8], + mut at: usize, + slots: &mut [Slot] + ) -> bool { + // We re-use the NFA input machinery for empty looks. We are + // really going to work directly on the byte slice though. + let input = ByteInput::new(text, self.only_utf8); + + let mut state_ptr = self.start_state; + let mut last_match: Slot = None; + + // The inner loop of the onepass DFA. + // + // We bend over backwards to make sure that the inner loop + // logically looks like: + // + // while at < text.len(): + // state_ptr = self.transitions[state_ptr + text[at]] + // at += 1 + // + // As usual, this is a horrible lie. The onepass DFA steals + // the byteclass compression trick from the lazy DFA, so there + // is an extra layer of indirection. Any special flags need to + // be handled, so we also need to check the STATE_SPECIAL mask + // at every step. 
Finally, we use a backstop instead of the + // actual text.len() to check when it is time to break out of + // the loop to facilitate loop unrolling, and to avoid an + // extra branch around when it is time to increment at. + // + // Note that the only difference between this loop and + // the drain loop below is where `at` gets incremented + // and loop unrolling. For bugs that are not related to + // either of those things, it is often easier to just comment + // this loop out and work on the drain loop. Once you've come + // up with the fix, you can transfer your work here. + let step_size = 1; + let backstop = text.len().checked_sub(step_size).unwrap_or(0); + while at < backstop { + if state_ptr & STATE_SPECIAL == 0 { + // This is a weird looking place to increment at. + // The reason we do so has to do with the odd + // representation of a DFA that we've chosen. + // Let's dump the simplest possible regex to unpack + // that. + // + // ```text + // > cd regex-debug + // > cargo run -- --onepass compile 'a' + // is_anchored_start: false + // is_anchored_end: false + // START: (0) + // + // 0: 0/D | 1/8 | 2/D | 3/D + // 4: 0/0 | 1/0 | 2/P | 3/P + // 8: 0/(c) | 1/(c) | 2/(c) | 3/(c) + // c: 0/M | 1/M | 2/M | 3/M + // 10: 0/1 | 1/1 | 2/P | 3/P + // ``` + // + // Our initial state is denoted (0) because it's transition + // table lives at self.table[0] and because it is a + // saving state. This means that it does not correspond + // to the consumption of any input, yet its transition + // table is derived from its child states. In this + // case its only child state is 8. When we transition + // to state 8, the assertion that the first byte be + // 97 has already passed. Then we can't just increment + // at after every input consuming state, as you might + // think at first. The assertions associated with a state + // really get checked right before we enter it, so the + // right thing to do is to increment at only when we + // enter an input consuming state. + // + // One might be concerned that this will cause us to + // skip over the very first byte, but we are saved by + // the fact that the first instruction is always a save + // instruction. + at += 1; + + let byte_class = self.byte_class(text, at); + trace!("::exec_ loop-byte st={} at={} bc={} byte={}", + st_str(state_ptr), at, byte_class, text[at]); + + // No need to mask because no flags are set. + state_ptr = self.follow(state_ptr as usize, byte_class); + } else { + // STATE_HALT and STATE_DEAD must always be checked + // first because they have STATE_ACTION and STATE_MATCH + // set, even though those flags don't apply. It would + // probably be better for performance to check them last, + // so it may be worthwhile to try to rejigger the + // representation of StatePtrs. 
+ if state_ptr == STATE_HALT { + trace!("::exec_ loop-halt"); + break; + } else if state_ptr == STATE_DEAD { + trace!("::exec_ loop-dead"); + slots[FULL_MATCH_CAPTURE_END] = last_match; + return last_match.is_some(); + } + + if state_ptr & STATE_ACTION != 0 { + let byte_class = self.byte_class(text, at); + trace!("::exec_ loop-act st={} at={} bc={} byte={}", + st_str(state_ptr), at, byte_class, text[at]); + let match_state = state_ptr & STATE_MATCH != 0; + state_ptr = + self.act(input, at, slots, state_ptr, byte_class); + // only record a match if the action does not cause death + if state_ptr != STATE_DEAD && match_state { + trace!("::exec_ loop-act-match at={}", at); + last_match = Some(at); + } + } else { + debug_assert!(state_ptr & STATE_MATCH != 0); + at += 1; + let byte_class = self.byte_class(text, at); + trace!("::exec_ loop-match st={} at={} bc={} byte={}", + st_str(state_ptr), at, byte_class, text[at]); + last_match = Some(at); + state_ptr = self.follow( + (state_ptr & STATE_MAX) as usize, byte_class); + + } + } + } + + // + // Drain the input after the backstop. + // + + // First, bump the at pointer if we just passed a byte test. + if state_ptr & STATE_ACTION == 0 { + at += 1; + } + + while at < text.len() { + let byte_class = self.byte_class(text, at); + + if state_ptr & STATE_SPECIAL == 0 { + // NOTE: In the main loop we increment `at` and + // recompute the byte class here. It is not + // safe to do so in the drain loop because we + // might fly off the end of the input and + // get an out of bounds error. + trace!("::exec_ drain-byte st={} at={} bc={} byte={}", + st_str(state_ptr), at, byte_class, text[at]); + + // No need to mask because no flags are set. + state_ptr = self.follow(state_ptr as usize, byte_class); + } else { + // STATE_HALT and STATE_DEAD must always be checked + // first because they have STATE_ACTION and STATE_MATCH + // set, even though those flags don't apply. It would + // probably be better for performance to check them last, + // so it may be worthwhile to try to rejigger the + // representation of StatePtrs. + if state_ptr == STATE_HALT { + trace!("::exec_ drain-halt"); + break; + } else if state_ptr == STATE_DEAD { + trace!("::exec_ drain-dead"); + slots[FULL_MATCH_CAPTURE_END] = last_match; + return last_match.is_some(); + } + + if state_ptr & STATE_ACTION != 0 { + trace!("::exec_ drain-act st={} at={} bc={} byte={}", + st_str(state_ptr), at, byte_class, text[at]); + let match_state = state_ptr & STATE_MATCH != 0; + state_ptr = + self.act(input, at, slots, state_ptr, byte_class); + // only record a match if the action does not cause death + if state_ptr != STATE_DEAD && match_state { + trace!("::exec_ drain-act-match at={}", at); + last_match = Some(at); + } + } else { + debug_assert!(state_ptr & STATE_MATCH != 0); + trace!("::exec_ drain-match st={} at={} bc={} byte={}", + st_str(state_ptr), at, byte_class, text[at]); + last_match = Some(at); + state_ptr = self.follow( + (state_ptr & STATE_MAX) as usize, byte_class); + + } + } + + // We incur the cost of this extra branch in the drain + // loop because we need to make sure that we won't fly + // off the end of the string. 
+ if state_ptr & STATE_ACTION == 0 { + at += 1; + } + } + + // + // Execute one last step in the magic EOF byte class + // + + // Set the byte class to be EOF + let byte_class = self.num_byte_classes - 1; + trace!("::exec eof st={} at={} bc={}", + st_str(state_ptr), at, byte_class); + + // One EOF step + if state_ptr & STATE_ACTION == 0 && state_ptr != STATE_DEAD { + if state_ptr & STATE_MATCH != 0 { + trace!("::exec_ eof-match st={} at={} last_match={:?}", + st_str(state_ptr), at, last_match); + last_match = Some(at); + } + state_ptr = self.table[ + (state_ptr & STATE_MAX) as usize + byte_class]; + } + + // Finally, drain any actions. + while state_ptr & STATE_ACTION != 0 && state_ptr != STATE_HALT { + trace!("::exec eof act st={}", st_str(state_ptr)); + let match_state = state_ptr & STATE_MATCH != 0; + state_ptr = self.act(input, at, slots, state_ptr, byte_class); + // only record a match if the action does not cause death + if state_ptr != STATE_DEAD && match_state { + trace!("::exec_ eof-act-match at={}", at); + last_match = Some(at); + } + } + + // + // Finally, we can figure out if we actually got a match. + // + + trace!("::exec_ determine-match st={} at={} last_match={:?} slots={:?}", + st_str(state_ptr), at, last_match, slots); + slots[FULL_MATCH_CAPTURE_END] = last_match; + return last_match.is_some(); + } + + #[inline] + fn act( + &self, + input: I, + at: usize, + slots: &mut [Slot], + state_ptr: StatePtr, + byte_class: usize, + ) -> StatePtr { + // We had better have been called with a state that actually + // needs to be acted on. + debug_assert!(state_ptr & STATE_ACTION != 0); + + let state_idx = (state_ptr & STATE_MAX) as usize; + let action_type = self.table[state_idx + self.num_byte_classes]; + + if action_type == Action::Save as StatePtr { + let slot_idx = self.table[state_idx + self.num_byte_classes + 1]; + trace!("::act saving slot {}", slot_idx); + slots[slot_idx as usize] = Some(at); + + self.follow(state_idx, byte_class) + } else { + let iat = input.at(at); + let look = match action_type { + x if x == Action::StartLine as StatePtr => EmptyLook::StartLine, + x if x == Action::EndLine as StatePtr => EmptyLook::EndLine, + x if x == Action::StartText as StatePtr => EmptyLook::StartText, + x if x == Action::EndText as StatePtr => EmptyLook::EndText, + x if x == Action::WordBoundary as StatePtr => + EmptyLook::WordBoundary, + x if x == Action::NotWordBoundary as StatePtr => + EmptyLook::NotWordBoundary, + x if x == Action::WordBoundaryAscii as StatePtr => + EmptyLook::WordBoundaryAscii, + x if x == Action::NotWordBoundaryAscii as StatePtr => + EmptyLook::NotWordBoundaryAscii, + _ => unreachable!("Bad action flag."), + }; + + trace!("::act look={:?}", look); + + if input.is_empty_match(iat, look) { + self.follow(state_idx, byte_class) + } else { + STATE_DEAD + } + } + + } + + #[inline] + fn byte_class(&self, text: &[u8], at: usize) -> usize { + self.byte_classes[text[at] as usize] as usize + } + + #[inline] + fn follow(&self, state_idx: usize, byte_class: usize) -> StatePtr { + self.table[state_idx + byte_class] + } + + fn fmt_line( + addr: String, + trans: &[StatePtr], + f: &mut fmt::Formatter, + ) -> Result<(), fmt::Error> { + writeln!(f, "{}: {}", addr, + trans.iter().enumerate() + .map(|(i, x)| format!("{}/{}", i, st_str(*x))) + .collect::>() + .join(" | "))?; + Ok(()) + } +} + +////////////////////////////////////////////////////////////////////////// +// // +// Compiler // +// // +////////////////////////////////////////////////////////////////////////// + +/// 
Compiler for a OnePass DFA +pub struct OnePassCompiler { + /// The flattened transition table AKA the baked form of the DFA. + table: Vec, + + num_byte_classes: usize, + only_utf8: bool, + + /// The program to be compiled. + prog: Program, + + /// A mapping from instruction indices to their transitions + transitions: Vec>, + + /// A mapping from instruction indices to flags indicating + /// if they should have the STATE_MATCH flag set. + accepting_states: Vec, +} + +#[derive(Debug)] +pub enum OnePassError { + /// This program can't be executed as a one-pass regex. + HasNondeterminism, + /// This program contains a cycle of instructions that consume + /// no input. Right now we can't handle that, but this restriction + /// may be lifted in the future. + ForwardingCycle, + /// There are too many instructions to deal with. + TooBig, + /// An error happened when we tried to compile the regex. + CompileError(Error), + /// We don't support multiple regex at once. + RegexSetUnsupported, + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +impl From for OnePassError { + fn from(e: Error) -> Self { + OnePassError::CompileError(e) + } +} + +impl OnePassCompiler { + /// Create a new OnePassCompiler for a given Hir. + /// Collect some metadata from the compiled program. + pub fn new( + es: &[Hir], + options: &RegexOptions, + only_utf8: bool, + ) -> Result { + if es.len() != 1 { + return Err(OnePassError::RegexSetUnsupported); + } + + if ! analysis::is_onepass(&es[0]) { + return Err(OnePassError::HasNondeterminism); + } + + let prog = Compiler::new() + .size_limit(options.size_limit) + .bytes(true) + .only_utf8(only_utf8) + .compile(es)?; + + let num_byte_classes = (prog.byte_classes[255] as usize) + 2; + + // We don't allow STATE_MAX to actually be used so that STATE_POISON + // remains a valid poison value. + let max_table_size = (STATE_MAX - 1) as usize; + let mut table_size: usize = 0; + for inst in prog.iter() { + table_size += num_byte_classes; + match inst { + &Inst::EmptyLook(_) | &Inst::Save(_) => + table_size += num_byte_classes, + _ => {} + } + if table_size > max_table_size { + return Err(OnePassError::TooBig); + } + } + + trace!("new compiler for:\n{:?}", prog); + Ok(OnePassCompiler { + table: vec![], + num_byte_classes: num_byte_classes, + only_utf8: only_utf8, + + transitions: { + let mut x = Vec::new(); + for _ in 0..prog.len() { + x.push(None); + } + x + }, + accepting_states: vec![false; prog.len()], + prog: prog, + }) + } + + /// Attempt to compile the regex to a OnePass DFA + pub fn compile(mut self) -> Result { + // A DAG of forwarding relationships indicating when + // a state needs to be forwarded to an Action state + // once that Action state has been fully constructed. + let mut forwards = Forwards::new(); + + // Compute the prioritized transition tables for all of the + // instructions which get states. + let mut state_edge = vec![0]; + while let Some(i) = state_edge.pop() { + state_edge.extend(self.inst_trans(i, &mut forwards)?); + } + + // Solve the dependency relationships between all the + // forwarding directives that were emitted by inst_trans. + for fwd in forwards.into_iter_topo() { + self.perform_forward(fwd?); + } + + // Now emit the transitions in a form that we can actually + // execute. 
+ self.bake_transitions(); + + Ok(OnePass { + table: self.table, + prefixes: self.prog.prefixes, + num_byte_classes: self.num_byte_classes, + byte_classes: self.prog.byte_classes, + start_state: 0 | STATE_ACTION, + is_anchored_start: self.prog.is_anchored_start, + is_anchored_end: self.prog.is_anchored_end, + only_utf8: self.only_utf8, + }) + } + + /// Compile the stage 1 transition table for the state corresponding + /// to the given instruction. + /// + /// The result of `inst_trans` will end up in `self.transitions`. + /// + /// Returns a list of child instructions which must be compiled + /// via `inst_trans`. + fn inst_trans( + &mut self, + inst_idx: usize, + forwards: &mut Forwards, + ) -> Result, OnePassError> { + trace!("::inst_trans inst_idx={}", inst_idx); + + if self.transitions[inst_idx].is_some() { + return Ok(vec![]); + } + + // Iterate over the children, visiting lower priority + // children first. + let mut resume = match &self.prog[inst_idx] { + &Inst::Save(ref inst) => vec![inst.goto], + &Inst::EmptyLook(ref inst) => vec![inst.goto], + &Inst::Bytes(ref inst) => vec![inst.goto], + &Inst::Split(ref inst) => vec![inst.goto1, inst.goto2], + &Inst::Match(_) => return Ok(vec![]), // no kids + &Inst::Ranges(_) | &Inst::Char(_) => unreachable!(), + }; + + let mut trans = TransitionTable( + vec![Transition { tgt: TransitionTarget::Die, priority: 0 }; + self.num_byte_classes]); + + // Start at priority 1 because everything is higher priority than + // the initial list of `TransitionTarget::Die` pointers. + let mut priority = 1; + + let mut children = vec![]; + while let Some(child_idx) = resume.pop() { + match &self.prog[child_idx] { + &Inst::EmptyLook(_) | &Inst::Save(_) => { + forwards.forward(inst_idx, child_idx, priority); + children.push(child_idx); + } + &Inst::Bytes(ref inst) => { + // Weird usize casting shenanigans because a Bytes + // instruction has inclusive ranges, but rust uses + // closed-open ranges. + for byte in (inst.start as usize)..(inst.end as usize + 1) { + let byte = byte as u8; + let bc = self.prog.byte_classes[byte as usize]; + trans.0[bc as usize] = Transition { + tgt: TransitionTarget::BytesInst(child_idx), + priority: priority + }; + } + children.push(child_idx); + } + &Inst::Split(ref inst) => { + resume.push(inst.goto1); + resume.push(inst.goto2); + } + &Inst::Match(_) => { + self.accepting_states[inst_idx] = true; + for t in trans.0.iter_mut() { + // Note that we go from lowest to highest + // priority, so we don't have to worry about + // clobbering higher priority transitions here. + *t = Transition { + tgt: TransitionTarget::Match, + priority: priority + }; + } + } + &Inst::Ranges(_) | &Inst::Char(_) => unreachable!(), + } + priority += 1; + } + + self.transitions[inst_idx] = Some(trans); + + Ok(children) + } + + /// Execute a forwarding job. + /// + /// To make that a little more concrete, consider the program snippet: + /// + /// 0000: Bytes(a, a) + /// 0001: Save(2) + /// 0002: Bytes(b, b) + /// + /// Here the state for `Bytes(a, a)` needs to transition to + /// the state for `Save(2)`, but it does not know when to do + /// so. The right answer is that it should transition to + /// the `Save(2)` state when it sees a `b`, but it is hard + /// to know what children `Save(2)` has from where `Bytes(a, a)` + /// stands. To handle this we just emit a forwarding job + /// that says "when you know enough about the `Save(2)` state, + /// please forward `Bytes(a, a)` to `Save(2)`.". 
We need to use + /// a full DAG for this because there could be multiple forwarding + /// states in a row: + /// + /// 0000: Bytes(a, a) + /// 0001: Save(2) + /// 0002: Save(3) + /// 0003: Bytes(b, b) + /// + /// Here we will end up with two forwarding jobs: + /// + /// 1. Forward from `Bytes(a, a)` to `Save(2)`. + /// 2. Forward from `Save(2)` to `Save(3)`. + /// + /// Which we structure as a dag that looks like: + /// + /// (2) --> (1) + /// + /// The arrow flows in a funny direction because we want the jobs + /// with no dependencies to live at the roots of the DAG so that + /// we can process them first. + fn perform_forward(&mut self, fwd: Forward) { + debug_assert!(fwd.copy_to != fwd.copy_from); + + let tgt = match &self.prog[fwd.copy_from] { + &Inst::EmptyLook(_) | &Inst::Save(_) => + TransitionTarget::ActionInst(fwd.copy_from), + _ => TransitionTarget::BytesInst(fwd.copy_from), + }; + + // Get a pair of mutable references to the two different + // transition tables in borrow checker approved fashion. + let (copy_to_ts, copy_from_ts) = if fwd.copy_to < fwd.copy_from { + let (stub, tail) = self.transitions.split_at_mut(fwd.copy_from); + (&mut stub[fwd.copy_to], &mut tail[0]) + } else { + let (stub, tail) = self.transitions.split_at_mut(fwd.copy_to); + (&mut tail[0], &mut stub[fwd.copy_from]) + }; + let (copy_to_ts, copy_from_ts) = match (copy_to_ts, copy_from_ts) { + (&mut Some(ref mut copy_to_ts), &mut Some(ref copy_from_ts)) => { + (copy_to_ts, copy_from_ts) + } + _ => unreachable!("forwards must be between real nodes."), + }; + + // now shuffle the transitions + for (from_t, to_t) in copy_from_ts.0.iter().zip(copy_to_ts.0.iter_mut()) { + if from_t.tgt == TransitionTarget::Die { + continue; + } + if to_t.priority > fwd.priority { + continue; + } + + // we should never encounter equal priorities + debug_assert!(to_t.priority != fwd.priority); + + *to_t = Transition { + tgt: tgt.clone(), + priority: fwd.priority, + }; + } + + // Finally, if a match instruction is reachable through + // a save fwd (which can never fail), the from state is accepting. + match &self.prog[fwd.copy_from] { + &Inst::Save(_) => { + self.accepting_states[fwd.copy_to] = + self.accepting_states[fwd.copy_from]; + } + _ => {} + } + } + + /// Once all the per-instruction transition tables have been worked + /// out, we can bake them into the single flat transition table we + /// are going to use for the actual DFA. This function creates the + /// baked form, storing it in `self.table`. 
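    ///
    /// As an illustrative sketch (made-up values, not a real
    /// compilation), with `num_byte_classes == 2` a program with one
    /// ordinary state followed by one `Save` state would bake to:
    ///
    ///     index 0..2: [ptr, ptr]                    // row for state 0
    ///     index 2..4: [ptr, ptr]                    // row for the Save state
    ///     index 4..6: [Action::Save, capture slot]  // its argument row
    ///
    /// A `StatePtr` is then just the starting index of a row, possibly
    /// with flag bits such as `STATE_ACTION` set above `STATE_MAX`.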
+ fn bake_transitions(&mut self) { + // pre-compute the state indices + let mut state_starts = Vec::with_capacity(self.prog.len()); + let mut off = 0; + for inst_idx in 0..self.prog.len() { + state_starts.push(off); + if self.transitions[inst_idx].is_some() { + off += self.num_byte_classes; + + match &self.prog[inst_idx] { + &Inst::EmptyLook(_) | &Inst::Save(_) => { + off += self.num_byte_classes; + } + _ => {} + } + } + } + + let ptr_of = |c: &OnePassCompiler, i: usize| { + let mut p = state_starts[i] as StatePtr; + if c.accepting_states[i] { + p |= STATE_MATCH; + } + p + }; + + self.table.reserve(state_starts[state_starts.len() - 1] + + self.num_byte_classes); + for inst_idx in 0..self.prog.len() { + let mut trans = Vec::with_capacity(self.num_byte_classes * 2); + + match &self.transitions[inst_idx] { + &None => continue, + &Some(ref ttab) => { + for t in ttab.0.iter() { + trans.push(match t.tgt { + TransitionTarget::Match => STATE_HALT, + + TransitionTarget::Die => STATE_DEAD, + TransitionTarget::BytesInst(i) => ptr_of(self, i), + TransitionTarget::ActionInst(i) => + ptr_of(self, i) | STATE_ACTION, + }); + } + } + } + + self.table.extend(trans); + + // emit all the right window dressing for the action, if + // there is one. + match &self.prog[inst_idx] { + &Inst::Save(ref inst) => { + debug_assert!(self.num_byte_classes >= 2); + + let mut save_args = vec![ + Action::Save as StatePtr, + inst.slot as StatePtr]; + save_args.extend(vec![STATE_POISON; + self.num_byte_classes - 2]); + self.table.extend(save_args); + } + &Inst::EmptyLook(ref inst) => { + let mut el_args = vec![self.empty_look_action(inst.look)]; + el_args.extend(vec![STATE_POISON; + self.num_byte_classes - 1]); + self.table.extend(el_args); + } + _ => {} + } + } + } + + fn empty_look_action(&self, el: EmptyLook) -> StatePtr { + match el { + EmptyLook::StartLine => Action::StartLine as StatePtr, + EmptyLook::EndLine => Action::EndLine as StatePtr, + EmptyLook::StartText => Action::StartText as StatePtr, + EmptyLook::EndText => Action::EndText as StatePtr, + EmptyLook::WordBoundary => Action::WordBoundary as StatePtr, + EmptyLook::NotWordBoundary => Action::NotWordBoundary as StatePtr, + EmptyLook::WordBoundaryAscii => + Action::WordBoundaryAscii as StatePtr, + EmptyLook::NotWordBoundaryAscii => + Action::NotWordBoundaryAscii as StatePtr, + } + } +} + +/// A mapping from byte classes to target states annotated +/// with transition priority. An intermediary representation. +struct TransitionTable(Vec); + +#[derive(Debug, Clone)] +struct Transition { + tgt: TransitionTarget, + priority: usize, +} + +#[derive(Debug, Clone, Eq, PartialEq)] +enum TransitionTarget { + Die, + Match, + BytesInst(usize), + ActionInst(usize), +} + + +/// A (hopefully) DAG of forwarding jobs. +#[derive(Debug, Clone)] +struct Forwards { + jobs: Vec, + + // the edges of the DAG + e_out: HashMap>, + e_in: HashMap>, + + /// A mapping from instructions to forwarding jobs which + /// want to copy to them. + inst_copy_tos: HashMap>, + /// A mapping from instructions to forwarding jobs which + /// want to copy from them. + inst_copy_froms: HashMap>, + + /// We really care about the root set, but it is much easier to + /// keep track of its inverse in an online way. 
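    /// (For example, with three jobs {0, 1, 2} and dependency edges
    /// 0 -> 1 and 0 -> 2, `not_root_set` would be {1, 2}, leaving {0}
    /// as the root set that the topological iteration starts from.)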
+ not_root_set: HashSet, +} + +impl Forwards { + pub fn new() -> Self { + Forwards { + jobs: vec![], + + e_out: HashMap::new(), + e_in: HashMap::new(), + + inst_copy_tos: HashMap::new(), + inst_copy_froms: HashMap::new(), + + not_root_set: HashSet::new(), + } + } + + /// Forward the state indexed by `forward_from` to the state + /// indexed by `forward_to` once we have enough info to do so. + pub fn forward( + &mut self, + forward_from: usize, + forward_to: usize, + priority: usize + ) { + trace!("::forward from={} to={}", forward_from, forward_to); + + let fidx = self.jobs.len(); + self.jobs.push(Forward { + copy_to: forward_from, + copy_from: forward_to, + priority: priority, + }); + + // Note the endpoints of this forward + self.inst_copy_tos.entry(forward_from).or_insert(vec![]).push(fidx); + self.inst_copy_froms.entry(forward_to).or_insert(vec![]).push(fidx); + + // For every forwarding job that we depend on completing + // before this job, add an edge flowing from the dependency + // to this job. + match self.inst_copy_tos.get(&forward_to) { + Some(dependencies) => { + trace!("dependencies = {:?}", dependencies); + for dep in dependencies.iter() { + Self::edge( + &mut self.e_out, &mut self.e_in, + &mut self.not_root_set, *dep, fidx); + } + } + None => {} + } + + // For every job which depends on this job, + // add an edge which flows from this job to the dependant + // job. + match self.inst_copy_froms.get(&forward_from) { + Some(dependants) => { + for dep in dependants.iter() { + Self::edge( + &mut self.e_out, &mut self.e_in, + &mut self.not_root_set, fidx, *dep); + } + } + None => {} + } + } + + // An associated function to please the borrow checker. gross. + fn edge( + e_out: &mut HashMap>, + e_in: &mut HashMap>, + not_root_set: &mut HashSet, + out_node: usize, + in_node: usize + ) { + e_out.entry(out_node).or_insert(vec![]).push(in_node); + e_in.entry(in_node).or_insert(vec![]).push(out_node); + not_root_set.insert(in_node); + } + + pub fn into_iter_topo(self) -> Topo { + let mut root_set = vec![]; + for n in 0..self.jobs.len() { + if ! self.not_root_set.contains(&n) { + root_set.push(n); + } + } + + trace!("::into_iter_topo jobs={:?}", self.jobs); + trace!("::into_iter_topo e_out={:?}", self.e_out); + trace!("::into_iter_topo e_in={:?}", self.e_in); + trace!("::into_iter_topo root_set={:?}", root_set); + + Topo { + jobs: self.jobs, + e_out: self.e_out, + e_in: self.e_in, + root_set: root_set, + } + } +} + + +/// A job asking the state indicated by `copy_to` to be rewritten +/// to point to the state indicated by `copy_from` whenever the +/// `copy_from` state could make progress. +#[derive(Debug, Clone)] +struct Forward { + copy_to: usize, + copy_from: usize, + priority: usize, +} + +/// An iterator that returns forwarding directives in topological order +/// using Kahn's Algorithm. +struct Topo { + jobs: Vec, + e_out: HashMap>, + e_in: HashMap>, + root_set: Vec, +} + +impl Iterator for Topo { + type Item = Result; + fn next(&mut self) -> Option> { + if let Some(next_job) = self.root_set.pop() { + let tgts = self.e_out.get(&next_job).unwrap_or(&vec![]).clone(); + for tgt in tgts.iter() { + self.rm_edge(next_job, *tgt); + + // If tgt has no incoming edges, add it to the root set. + if ! 
self.e_in.get(tgt).is_some() { + self.root_set.push(*tgt); + } + } + + Some(Ok(self.jobs[next_job].clone())) + } else { + if self.e_out.len() != 0 || self.e_in.len() != 0 { + Some(Err(OnePassError::ForwardingCycle)) + } else { + None + } + } + } +} + +impl Topo { + fn rm_edge(&mut self, node_out: usize, node_in: usize) { + let mut rm = false; + match self.e_out.get_mut(&node_out) { + Some(tgts) => { + let in_pos = tgts.iter().position(|t| *t == node_in); + match in_pos { + Some(p) => { tgts.remove(p); }, + None => debug_assert!(false), + } + + if tgts.len() == 0 { + rm = true; + } + } + None => debug_assert!(false), + } + if rm { + self.e_out.remove(&node_out); + } + + rm = false; + match self.e_in.get_mut(&node_in) { + Some(tgts) => { + let out_pos = tgts.iter().position(|t| *t == node_out); + match out_pos { + Some(p) => { tgts.remove(p); }, + None => debug_assert!(false), + } + + if tgts.len() == 0 { + rm = true; + } + } + None => debug_assert!(false), + } + if rm { + self.e_in.remove(&node_in); + } + } +} + +////////////////////////////////////////////////////////////////////////// +// // +// State Encoding // +// // +// This is mostly stolen from the lazy DFA. STATE_ACTION is a onepass // +// thing. // +// // +////////////////////////////////////////////////////////////////////////// + +type StatePtr = u32; + +fn st_str(st: StatePtr) -> String { + if st == STATE_DEAD { + "D".to_string() + } else if st == STATE_POISON { + "P".to_string() + } else if st == STATE_HALT { + "H".to_string() + } else if st & STATE_ACTION != 0 && st & STATE_MATCH != 0 { + format!("(M{:x})", st & STATE_MAX) + } else if st & STATE_ACTION != 0 { + format!("({:x})", st & STATE_MAX) + } else if st & STATE_MATCH != 0 { + format!("M{:x}", st & STATE_MAX) + } else { + format!("{:x}", st & STATE_MAX) + } +} + +/// The ACTION state means that the DFA needs to take some +/// action that will be specified by the first two StatePtrs +/// in a special transition table entry just below the transition +/// table for the ACTION state. An ACTION might include checking +/// some zero-width assertion about the input, or it might include +/// saving a value to a capture slots. +const STATE_ACTION: StatePtr = 1 << 31; + +/// An action which might need to be taken for a special state. +enum Action { + Save, + StartLine, + EndLine, + StartText, + EndText, + WordBoundary, + NotWordBoundary, + WordBoundaryAscii, + NotWordBoundaryAscii, +} + +/// A match state means that the regex has successfully matched. +const STATE_MATCH: StatePtr = 1 << 30; + +/// POISON is a state pointer that should never be touched. +/// We use it to pad invalid argument slots to ACTION states. +const STATE_POISON: StatePtr = !0; + +/// A dead state means that the state has been computed and it is known that +/// once it is entered, no future match can ever occur. +/// +/// It is not valid to dereference STATE_DEAD. +const STATE_DEAD: StatePtr = STATE_MATCH + 1; + +/// HALT indicates that the machine ought to halt execution. It differs +/// from DEAD only in that an accepting state that transitions to HALT +/// still accepts, while an accepting state which transitions to DEAD +/// does not. +const STATE_HALT: StatePtr = STATE_ACTION + 1; + +/// The maximum state pointer. This is useful to mask out the "valid" state +/// pointer from a state with the "start" or "match" bits set. 
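/// For example (illustrative value only): a pointer equal to
/// `STATE_ACTION | 12` names the state whose transition row begins at
/// table index 12, and masking with `STATE_MAX` recovers that index,
/// which is exactly what `st_str` above does when tracing.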
+const STATE_MAX: StatePtr = STATE_MATCH - 1; + +/// STATE_SPECIAL is a bitmask useful for checking if we are dealing +/// with a special case, or if we can keep chugging away at the inner +/// loop. +const STATE_SPECIAL: StatePtr = STATE_MATCH | STATE_ACTION; + +const FULL_MATCH_CAPTURE_END: usize = 1; diff --git a/src/pikevm.rs b/src/pikevm.rs index 80d44717ae..a82bb959fb 100644 --- a/src/pikevm.rs +++ b/src/pikevm.rs @@ -322,7 +322,7 @@ impl<'r, I: Input> Fsm<'r, I> { nlist.set.insert(ip); match self.prog[ip] { EmptyLook(ref inst) => { - if self.input.is_empty_match(at, inst) { + if self.input.is_empty_match(at, inst.look) { ip = inst.goto; } } diff --git a/tests/onepass_unit.rs b/tests/onepass_unit.rs new file mode 100644 index 0000000000..b69197a251 --- /dev/null +++ b/tests/onepass_unit.rs @@ -0,0 +1,26 @@ + +// +// Just some unit tests that I found it useful to focus on while +// debugging the onepass DFA. Mostly these are simplifications +// of existing tests, so their value is not that huge, but +// why throw out tests that have been useful in the past. +// This is definitely not an appropriate permanent home +// for them. I should ask @burntsushi about where a better place +// for them would be (maybe in misc.rs?). Alternatively I could +// not be lazy and just actually try to grok each of the test +// modules. +// + + +mat!(trailing_repeat, "ab(?:ab)?", "abac", Some((0, 2))); + +// Currently fail to compile because empty branches are not allowed! +// Yay! In the future we might have to worry about this though. +// +// mat!(trailing_alt_with_empty_branch, "ab(?:ab|)", "abac", Some((0, 2))); +// mat!(trailing_lazy_alt_with_empty_branch, "ab(?:|ab)", "abab", Some((0, 2))); + +matiter!(match_multi_rep_4, r"(?m)(?:^a)+", "aaa\naaa\naaa", + (0, 1), (4, 5), (8, 9)); + +mat!(startline_a_rep, r"(?m)(?:^a)+", "aaa", Some((0, 1))); diff --git a/tests/test_onepass.rs b/tests/test_onepass.rs new file mode 100644 index 0000000000..5d2d90cbdb --- /dev/null +++ b/tests/test_onepass.rs @@ -0,0 +1,64 @@ +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![cfg_attr(feature = "pattern", feature(pattern))] + +extern crate rand; +extern crate regex; + +macro_rules! regex_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new($re).onepass().build().map(|e| e.into_regex()) + }} +} + +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + } +} + +// Even though we don't support regex sets, we should still provide a +// constructor for them that sets the onepass flag in order to make +// sure that we properly fall back to a different impl. +macro_rules! regex_set_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new_many($re).onepass().build().map(|e| e.into_regex_set()) + }} +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + } +} + +// Must come before other module definitions. 
+include!("macros_str.rs"); +include!("macros.rs"); + +mod api; +mod api_str; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod searcher; +mod set; +mod suffix_reverse; +mod unicode; +mod word_boundary; +mod word_boundary_unicode; +mod onepass_unit; From b6fab619ba7ce427a1a27df87f588d27e308a958 Mon Sep 17 00:00:00 2001 From: Ethan Pailes Date: Thu, 20 Sep 2018 22:08:31 -0400 Subject: [PATCH 3/8] Add doc comment to is_onepass --- src/analysis.rs | 46 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/src/analysis.rs b/src/analysis.rs index 8e5f05bb99..1ad89c654a 100644 --- a/src/analysis.rs +++ b/src/analysis.rs @@ -5,7 +5,49 @@ use syntax::hir::{ use syntax::hir; use utf8_ranges::Utf8Sequences; -/// True iff the given expression is one-pass +/// True iff the given expression is onepass +/// +/// The general approach here is to find all the places in +/// the given Hir where any sort of branching occurs, +/// and examine the start of each expression at the branch +/// to see if there is an ambiguity. +/// +/// For example, given the regex `a|b`, we would examine +/// both branches of the alternation `a` and `b` and +/// notice that they don't clash, so the regex is onepass. +/// On the other hand the branches of `a|a` do clash, +/// so that regex is not onepass. +/// +/// Alternations are not the only branch points in a regex. +/// We also have to make sure to consider repetitions like +/// `a*a`, which is not onepass because there is no way +/// to tell whether we have to loop back to the repeated +/// expression or continue on by looking at just one byte. +/// `a*b` is onepass because you can figure out what to do. +/// If you see an `a`, go back to the start of the loop, +/// and if you see a `b` continue onward. +/// +/// A third, more subtle case is the case of concatenations +/// of expressions where some of the expressions can +/// accept the empty string. Consider `a(b|)ba`. This +/// regex is not onepass because it is not clear what to +/// do upon seeing the input `ab`. The problem is that `(b|)` +/// and `ba` clash with one other. +/// +/// To get a bit more specific about what it means for two +/// expressions to clash, we introduce the concept of first +/// sets. The first set of an expression is the set of +/// bytes which might begin a word in the language of that +/// expression. If the expression can accept the empty string, +/// the first set takes note of that as well. +/// +/// To handle these three cases, we use a visitor to +/// find the alternations, repetitions, and concatenations. +/// Whenever we find one of the above cases, we compute +/// the first set of the various branches involved, +/// then check to see if the first sets intersect. If +/// we ever find a non-empty intersection, the regex +/// is not onepass. pub fn is_onepass(expr: &Hir) -> bool { hir::visit(expr, IsOnePassVisitor::new()).unwrap() } @@ -100,7 +142,7 @@ impl IsOnePassVisitor { } -/// Check if a list of first sets is incompatable. +/// Check if a list of first sets is incompatible. /// /// O(n^2), but n will usually be quite small. fn fsets_clash(es: &[&Hir]) -> bool { From 5ff7c80e40cfbcde8255a917db4b51db47775ba8 Mon Sep 17 00:00:00 2001 From: Ethan Pailes Date: Fri, 21 Sep 2018 10:53:55 -0400 Subject: [PATCH 4/8] rewrite NestedConcat to avoid stack space usage liniear in the size of the input. 
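
The rewrite is the usual tail-recursion-to-loop transformation: instead
of `next()` calling itself after popping an exhausted frame (or after
pushing a frame for a nested concatenation), the body is wrapped in a
`loop`, so the explicit `Vec` of frames is the only thing that grows.
A minimal sketch of the pattern on a toy stack-driven iterator
(illustration only; the names and element type are made up, this is
not the NestedConcat code itself):

    struct Flatten<'a> {
        // A stack of (slice, next index) frames.
        stack: Vec<(&'a [i32], usize)>,
    }

    impl<'a> Iterator for Flatten<'a> {
        type Item = &'a i32;

        fn next(&mut self) -> Option<&'a i32> {
            // Loop instead of recursing into `self.next()`, so the call
            // stack stays at a constant depth however many frames there are.
            loop {
                let tip = self.stack.len().checked_sub(1)?;
                let (items, idx) = self.stack[tip];
                if idx >= items.len() {
                    self.stack.pop();
                    continue;
                }
                self.stack[tip].1 += 1;
                return Some(&items[idx]);
            }
        }
    }

The real iterator additionally pushes a new frame (and `continue`s)
when it encounters a nested concatenation, which was the second place
the old code recursed.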
--- src/analysis.rs | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/analysis.rs b/src/analysis.rs index 1ad89c654a..25d180074e 100644 --- a/src/analysis.rs +++ b/src/analysis.rs @@ -401,26 +401,29 @@ impl<'a> Iterator for NestedConcat<'a> { type Item = &'a Hir; fn next(&mut self) -> Option<&'a Hir> { - if self.0.len() == 0 { - return None; - } + loop { + if self.0.len() == 0 { + return None; + } - let tip = self.0.len() - 1; - let (es, idx) = self.0[tip]; + let tip = self.0.len() - 1; + let (es, idx) = self.0[tip]; - if idx >= es.len() { - self.0.pop(); - return self.next(); - } + if idx >= es.len() { + self.0.pop(); + continue; + } - self.0[tip].1 += 1; + self.0[tip].1 += 1; - match es[idx].kind() { - &HirKind::Concat(ref es) => { - self.0.push((es, 0)); - self.next() + match es[idx].kind() { + &HirKind::Concat(ref es) => { + self.0.push((es, 0)); + continue; + // self.next() + } + _ => return Some(&es[idx]), } - _ => Some(&es[idx]), } } } From 6355ce519b115f1b4c3bcbaac1b824e1e5151309 Mon Sep 17 00:00:00 2001 From: Ethan Pailes Date: Sat, 22 Sep 2018 07:07:02 -0400 Subject: [PATCH 5/8] refactor is_onepass to be part of Hir construction. --- regex-syntax/Cargo.toml | 2 + regex-syntax/src/hir/mod.rs | 583 +++++++++++++++++++++++++++++++++- regex-syntax/src/lib.rs | 1 + src/analysis.rs | 616 ------------------------------------ src/lib.rs | 1 - src/onepass.rs | 3 +- 6 files changed, 573 insertions(+), 633 deletions(-) delete mode 100644 src/analysis.rs diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index ef562eca0e..2e9724c1aa 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -11,3 +11,5 @@ workspace = ".." [dependencies] ucd-util = "0.1.0" +# For extracting the first set from a unicode class +utf8-ranges = "1.0.1" diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index fb82dd9425..fe3f4254c5 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -20,14 +20,15 @@ use std::u8; use ast::Span; use hir::interval::{Interval, IntervalSet, IntervalSetIter}; use unicode; +use utf8_ranges::Utf8Sequences; pub use hir::visitor::{Visitor, visit}; mod interval; +mod visitor; pub mod literal; pub mod print; pub mod translate; -mod visitor; /// An error that can occur while translating an `Ast` to a `Hir`. #[derive(Clone, Debug, Eq, PartialEq)] @@ -217,7 +218,9 @@ impl Hir { /// /// An empty HIR expression always matches, including the empty string. pub fn empty() -> Hir { - let mut info = HirInfo::new(); + let mut info = HirInfo::new(FirstSet::epsilon()); + info.first_set.accepts_empty = true; + info.set_onepass(true); info.set_always_utf8(true); info.set_all_assertions(true); info.set_anchored_start(false); @@ -240,8 +243,9 @@ impl Hir { if let Literal::Byte(b) = lit { assert!(b > 0x7F); } - - let mut info = HirInfo::new(); + let mut info = HirInfo::new(Self::literal_first_set(&lit)); + info.first_set.accepts_empty = false; + info.set_onepass(true); info.set_always_utf8(lit.is_unicode()); info.set_all_assertions(false); info.set_anchored_start(false); @@ -254,10 +258,24 @@ impl Hir { info: info, } } + fn literal_first_set(lit: &Literal) -> FirstSet { + fn singleton(b: u8) -> FirstSet { + let mut f = FirstSet::empty(); + f.push_bytes(ClassBytesRange::new(b, b)); + f + } + + match lit { + &Literal::Unicode(c) => singleton(first_byte(c)), + &Literal::Byte(b) => singleton(b), + } + } /// Creates a class HIR expression. 
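    ///
    /// (With this change the constructor also records the class's first
    /// set and whether the class is free of byte-level ambiguity; see
    /// `class_first_set` and `class_is_onepass` below.)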
pub fn class(class: Class) -> Hir { - let mut info = HirInfo::new(); + let mut info = HirInfo::new(Self::class_first_set(&class)); + info.first_set.accepts_empty = false; + info.set_onepass(Self::class_is_onepass(&class)); info.set_always_utf8(class.is_always_utf8()); info.set_all_assertions(false); info.set_anchored_start(false); @@ -270,10 +288,67 @@ impl Hir { info: info, } } + fn class_first_set(class: &Class) -> FirstSet { + match class { + &Class::Unicode(ref c) => { + // Get all the bytes which might begin this unicode + // class. + let mut cb = FirstSet::empty(); + for cr in c.iter() { + for br in Utf8Sequences::new(cr.start(), cr.end()) { + let first = br.as_slice()[0]; + cb.push_bytes( + ClassBytesRange::new(first.start, first.end)); + } + } + cb + } + &Class::Bytes(ref b) => + FirstSet::new(b.iter().map(|x| *x), false), + } + } + // Unicode classes are really just big alternatives from the byte + // oriented point of view. + // + // This function translates a unicode class into the + // byte space and checks for intersecting first sets. + // + // Byte classes are always onepass + fn class_is_onepass(cls: &Class) -> bool { + match cls { + &Class::Unicode(ref ucls) => { + let mut seen_char: [bool; 256] = [false; 256]; + + for cr in ucls.iter() { + for br in Utf8Sequences::new(cr.start(), cr.end()) { + let first = br.as_slice()[0]; + for b in first.start..(first.end+1) { + if seen_char[b as usize] { + return false; + } + seen_char[b as usize] = true; + } + } + } + } + _ => {} + } + + true + } + /// Creates an anchor assertion HIR expression. pub fn anchor(anchor: Anchor) -> Hir { - let mut info = HirInfo::new(); + // When an empty look (Anchor or WordBoundary) is at the start of + // a concatenation, we conservatively assume that the assertion + // will pass, so we just scan forward to the next Hir to + // compute the first set. Then this first set won't spread + // to concatenations that an anchor starts. + let mut info = HirInfo::new(FirstSet::anychar()); + + info.first_set.accepts_empty = true; + info.set_onepass(true); info.set_always_utf8(true); info.set_all_assertions(true); info.set_anchored_start(false); @@ -297,7 +372,11 @@ impl Hir { /// Creates a word boundary assertion HIR expression. pub fn word_boundary(word_boundary: WordBoundary) -> Hir { - let mut info = HirInfo::new(); + // We use FirstSet::anychar here for the same reason as + // we do for an anchor. + let mut info = HirInfo::new(FirstSet::anychar()); + info.first_set.accepts_empty = false; + info.set_onepass(true); info.set_always_utf8(true); info.set_all_assertions(true); info.set_anchored_start(false); @@ -319,7 +398,18 @@ impl Hir { /// Creates a repetition HIR expression. pub fn repetition(rep: Repetition) -> Hir { - let mut info = HirInfo::new(); + let mut info = HirInfo::new( + rep.hir.info.first_set.clone()); + + info.first_set.accepts_empty = + Self::repetition_accepts_empty(&rep); + info.set_onepass( + rep.hir.info.is_onepass() && + // If we are repeating an expression with no + // trigger bytes, DFA construction will run into + // trouble when it tries to figure out the forwarding + // cycle. 
+ !rep.hir.info.first_set.is_empty()); info.set_always_utf8(rep.hir.is_always_utf8()); info.set_all_assertions(rep.hir.is_all_assertions()); // If this operator can match the empty string, then it can never @@ -338,10 +428,30 @@ impl Hir { info: info, } } + fn repetition_accepts_empty(rep: &Repetition) -> bool { + match rep.kind { + RepetitionKind::ZeroOrOne => true, + RepetitionKind::ZeroOrMore => true, + RepetitionKind::OneOrMore => rep.hir.info.first_set.accepts_empty, + RepetitionKind::Range(ref range) => { + match range { + &RepetitionRange::Exactly(0) + | &RepetitionRange::AtLeast(0) + | &RepetitionRange::Bounded(0, _) => true, + _ => rep.hir.info.first_set.accepts_empty, + } + } + } + } /// Creates a group HIR expression. pub fn group(group: Group) -> Hir { - let mut info = HirInfo::new(); + let mut info = HirInfo::new( + group.hir.info.first_set.clone()); + + // We already know accepts_empty from our inner Hir. + // No need to compute it here. + info.set_onepass(group.hir.info.is_onepass()); info.set_always_utf8(group.hir.is_always_utf8()); info.set_all_assertions(group.hir.is_all_assertions()); info.set_anchored_start(group.hir.is_anchored_start()); @@ -363,7 +473,9 @@ impl Hir { 0 => Hir::empty(), 1 => exprs.pop().unwrap(), _ => { - let mut info = HirInfo::new(); + let mut info = HirInfo::new( + Self::concat_first_set(&exprs)); + info.set_onepass(Self::concat_is_onepass(&exprs)); info.set_always_utf8(true); info.set_all_assertions(true); info.set_any_anchored_start(false); @@ -425,6 +537,68 @@ impl Hir { } } } + fn concat_first_set(es: &[Hir]) -> FirstSet { + debug_assert!(es.len() >= 2); + + let mut fset = FirstSet::empty(); + for (i, e) in NestedConcat::new(es).enumerate() { + match e.kind() { + &HirKind::Anchor(_) | &HirKind::WordBoundary(_) + if i < es.len() - 1 => { + continue; + } + _ => {} // FALLTHROUGH + } + + fset.union(&e.info.first_set); + + if ! e.info.first_set.accepts_empty { + fset.accepts_empty = false; + // We can stop accumulating after we stop seeing + // first sets which contain epsilon. + break; + } + } + fset + } + fn concat_is_onepass(es: &[Hir]) -> bool { + let mut empty_run = vec![]; + + for (i, e) in NestedConcat::new(es).enumerate() { + match e.kind() { + &HirKind::Anchor(_) | &HirKind::WordBoundary(_) + if i < es.len() - 1 => { + continue; + } + _ => {} // FALLTHROUGH + } + + if !e.info.is_onepass() { + return false; + } + + let is_real_rep = match e.kind() { + &HirKind::Repetition(ref rep) => { + match rep.kind { + RepetitionKind::Range( + RepetitionRange::Exactly(_)) => false, + _ => true, + } + }, + _ => false, + }; + + empty_run.push(e); + if !(e.info.first_set.accepts_empty || is_real_rep) { + if FirstSet::fsets_clash_ref(&empty_run) { + return false; + } + empty_run.clear(); + } + } + + ! FirstSet::fsets_clash_ref(&empty_run) + } /// Returns the alternation of the given expressions. /// @@ -434,7 +608,12 @@ impl Hir { 0 => Hir::empty(), 1 => exprs.pop().unwrap(), _ => { - let mut info = HirInfo::new(); + let mut info = HirInfo::new( + Self::alternation_first_set(&exprs)); + // The union operate should make sure that we have + // the correct value for accepts_empty here. There is + // no special fixup like in concat. 
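                // Illustrative example: `a|b` stays onepass because the
                // branches have disjoint first sets, while `a|ab` does
                // not, since both branches can begin with the byte `a`.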
+ info.set_onepass(!FirstSet::fsets_clash_value(&exprs)); info.set_always_utf8(true); info.set_all_assertions(true); info.set_anchored_start(true); @@ -477,6 +656,13 @@ impl Hir { } } } + fn alternation_first_set(es: &[Hir]) -> FirstSet { + let mut fset = FirstSet::empty(); + for e in es { + fset.union(&e.info.first_set); + } + fset + } /// Build an HIR expression for `.`. /// @@ -520,6 +706,12 @@ impl Hir { } } + /// Return true if and only if this HIR contains no byte-level + /// ambiguities. + pub fn is_onepass(&self) -> bool { + self.info.is_onepass() + } + /// Return true if and only if this HIR will always match valid UTF-8. /// /// When this returns false, then it is possible for this HIR expression @@ -1114,6 +1306,147 @@ impl fmt::Debug for ClassBytesRange { } } +/// A representation of all the possible ways a word in the language +/// of a regex could begin. ClassBytes has no way to express the empty +/// string, so we add an extra flag to indicate if a FirstSet includes +/// epsilon. Put in a more theoretical way all firstsets are subsets of +/// SIGMA `union` { epsilon }. +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct FirstSet { + /// The bytes which might start a word in the language. + bytes: ClassBytes, + /// True iff the language accepts the empty string. + accepts_empty: bool, +} + +/// A macro to define the fsets_clash associated functions, +/// parameterized over the type of the inner slice. This lets +/// us avoid allocating an extra vector when we check +/// alternations for onepassness. +macro_rules! def_fsets_clash { + ($fun_name:ident, $slice_inner:ty) => { + /// Check if a list of first sets is incompatible. + fn $fun_name(es: &[$slice_inner]) -> bool { + let mut seen_so_far = FirstSet::empty(); + + for e in es.iter() { + let mut snapshot = seen_so_far.clone(); + snapshot.intersect(&e.info.first_set); + if ! snapshot.is_empty() { + return true; + } + + seen_so_far.union(&e.info.first_set); + } + + false + } + } +} +impl FirstSet { + /// A convenience method to create a FirstSet which accepts + /// nothing. + fn empty() -> Self { + FirstSet { + bytes: ClassBytes::empty(), + accepts_empty: false, + } + } + + /// A convenience method to create a FirstSet which accepts + /// everything, but can't be empty. + fn anychar() -> FirstSet { + let mut f = FirstSet::empty(); + f.push_bytes(ClassBytesRange::new(b'\0', b'\xFF')); + f + } + + /// A convenience method to create a FirstSet which accepts + /// nothing but the empty string. + fn epsilon() -> FirstSet { + FirstSet { + bytes: ClassBytes::empty(), + accepts_empty: true, + } + } + + /// Direct constructor. + pub fn new(ranges: I, accepts_empty: bool) -> Self + where I: IntoIterator + { + FirstSet { + bytes: ClassBytes::new(ranges), + accepts_empty: accepts_empty, + } + } + + /// Add a byte range to the byte ranges that the FirstSet + /// accepts. + fn push_bytes(&mut self, byte_range: ClassBytesRange) { + self.bytes.push(byte_range); + } + + /// Take the set union of two FirstSets, mutating the lhs + /// to contain the result. + fn union(&mut self, other: &FirstSet) { + self.bytes.union(&other.bytes); + self.accepts_empty = self.accepts_empty || other.accepts_empty; + } + + /// Take the set intersection of two FirstSets, mutating the lhs + /// to contain the result. + fn intersect(&mut self, other: &FirstSet) { + self.bytes.intersect(&other.bytes); + self.accepts_empty = self.accepts_empty && other.accepts_empty; + } + + /// True iff the FirstSet accepts nothing, not even the empty string. 
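    /// (That is, the set is empty as a set: it contains no bytes and
    /// does not contain epsilon either, unlike `FirstSet::epsilon()`,
    /// which accepts only the empty string.)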
+ fn is_empty(&self) -> bool { + self.bytes.is_empty() && !self.accepts_empty + } + + def_fsets_clash!(fsets_clash_ref, &Hir); + def_fsets_clash!(fsets_clash_value, Hir); +} + +/// An iterator over a concatenation of expressions which +/// drills down into other embedded concatenations. +struct NestedConcat<'a>(Vec<(&'a [Hir], usize)>); +impl<'a> NestedConcat<'a> { + fn new(es: &'a [Hir]) -> Self { + NestedConcat(vec![(es, 0)]) + } +} +impl<'a> Iterator for NestedConcat<'a> { + type Item = &'a Hir; + + fn next(&mut self) -> Option<&'a Hir> { + loop { + if self.0.len() == 0 { + return None; + } + + let tip = self.0.len() - 1; + let (es, idx) = self.0[tip]; + + if idx >= es.len() { + self.0.pop(); + continue; + } + + self.0[tip].1 += 1; + + match es[idx].kind() { + &HirKind::Concat(ref es) => { + self.0.push((es, 0)); + continue; + } + _ => return Some(&es[idx]), + } + } + } +} + /// The high-level intermediate representation for an anchor assertion. /// /// A matching anchor assertion is always zero-length. @@ -1320,6 +1653,9 @@ struct HirInfo { /// If more attributes need to be added, it is OK to increase the size of /// this as appropriate. bools: u8, + /// A description of how words in the language of this expression + /// might start. + first_set: FirstSet, } // A simple macro for defining bitfield accessors/mutators. @@ -1340,9 +1676,10 @@ macro_rules! define_bool { } impl HirInfo { - fn new() -> HirInfo { + fn new(fs: FirstSet) -> HirInfo { HirInfo { bools: 0, + first_set: fs, } } @@ -1353,11 +1690,25 @@ impl HirInfo { define_bool!(4, is_any_anchored_start, set_any_anchored_start); define_bool!(5, is_any_anchored_end, set_any_anchored_end); define_bool!(6, is_match_empty, set_match_empty); + define_bool!(7, is_onepass, set_onepass); +} + +/// The first byte of a unicode code point. +/// +/// We only ever care about the first byte of a particular character +/// because the onepass DFA is implemented in the byte space not the +/// character space. This means, for example, that a branch between +/// lowercase delta and uppercase delta is actually non-deterministic. +fn first_byte(c: char) -> u8 { + let mut b: [u8; 4] = [0; 4]; + c.encode_utf8(&mut b); + b[0] } #[cfg(test)] mod tests { use super::*; + use parser::Parser; fn uclass(ranges: &[(char, char)]) -> ClassUnicode { let ranges: Vec = ranges @@ -2053,11 +2404,11 @@ mod tests { expr = Hir { kind: HirKind::Concat(vec![expr]), - info: HirInfo::new(), + info: HirInfo::new(FirstSet::empty()), }; expr = Hir { kind: HirKind::Alternation(vec![expr]), - info: HirInfo::new(), + info: HirInfo::new(FirstSet::empty()), }; } assert!(!expr.kind.is_empty()); @@ -2072,4 +2423,208 @@ mod tests { .join() .unwrap(); } + + // + // First Set intersection smoke tests + // + + fn is_intersecting_fset(e1: &Hir, e2: &Hir) -> bool { + let mut fset = e1.info.first_set.clone(); + fset.intersect(&e2.info.first_set); + ! 
fset.is_empty() + } + + #[test] + fn fset_lit() { + let e1 = Parser::new().parse("a").unwrap(); + let e2 = Parser::new().parse("a").unwrap(); + let e3 = Parser::new().parse("b").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_class() { + let e1 = Parser::new().parse("[a]").unwrap(); + let e2 = Parser::new().parse("[a]").unwrap(); + let e3 = Parser::new().parse("[b]").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_class_n() { + let e1 = Parser::new().parse("[xamn]").unwrap(); + let e2 = Parser::new().parse("[rlwa]").unwrap(); + let e3 = Parser::new().parse("[bcq]").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_alt() { + let e1 = Parser::new().parse("ab|bc|ad").unwrap(); + let e2 = Parser::new().parse("yyyy|am|zz").unwrap(); + let e3 = Parser::new().parse("cc|ww").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_group() { + let e1 = Parser::new().parse("(?:ab)").unwrap(); + let e2 = Parser::new().parse("(?:aq)").unwrap(); + let e3 = Parser::new().parse("(?:m)").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_concat() { + let e1 = Parser::new().parse("aa(?:nb)").unwrap(); + let e2 = Parser::new().parse("aa(?:rq)").unwrap(); + let e3 = Parser::new().parse("bb(?:m)").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_word_boundary_dropped() { + let e1 = Parser::new().parse(r"aa").unwrap(); + let e2 = Parser::new().parse(r"\baa").unwrap(); + let e3 = Parser::new().parse(r"\bbb").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_word_boundary_all() { + let e1 = Parser::new().parse(r"aa").unwrap(); + let e2 = Parser::new().parse(r"\b").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + } + + #[test] + fn fset_not_word_boundary_dropped() { + let e1 = Parser::new().parse(r"aa").unwrap(); + let e2 = Parser::new().parse(r"\Baa").unwrap(); + let e3 = Parser::new().parse(r"\Bbb").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_not_word_boundary_all() { + let e1 = Parser::new().parse(r"aa").unwrap(); + let e2 = Parser::new().parse(r"\B").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + } + + #[test] + fn fset_start_anchor_dropped() { + let e1 = Parser::new().parse(r"aa").unwrap(); + let e2 = Parser::new().parse(r"^aa").unwrap(); + let e3 = Parser::new().parse(r"^bb").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_terminal_emptylook_all() { + let e = Parser::new().parse(r"a*\b").unwrap(); + + assert_eq!(FirstSet::anychar(), e.info.first_set); + } + + #[test] + fn fset_empty_alt() { + let e1 = Parser::new().parse(r"(?:a|())b").unwrap(); + let e2 = Parser::new().parse(r"b").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + } + + // + // Onepass Unit Tests + // + + macro_rules! test_onepass { + ($fun_name:ident, $re_str:expr) => { + #[test] + fn $fun_name() { + let e = Parser::new().parse($re_str).unwrap(); + assert!( + e.info.is_onepass(), + "info={:?}", e.info); + } + } + } + + macro_rules! 
test_not_onepass { + ($fun_name:ident, $re_str:expr) => { + #[test] + fn $fun_name() { + let e = Parser::new().parse($re_str).unwrap(); + assert!( + !e.info.is_onepass(), + "info={:?}", e.info); + } + } + } + + test_onepass!(onepass_smoke_1_, r"[^x]x(.*)"); + test_not_onepass!(onepass_smoke_2_, r"(.*)x(.*)"); + + test_not_onepass!(onepass_alt_1_, r"a|b|c|a|d"); + test_not_onepass!(onepass_alt_2_, r"a|b|c|((m|a|x)|g)|d"); + test_onepass!(onepass_alt_3_, r"a|b|c|x|d"); + test_onepass!(onepass_alt_4_, r"a|b|c|((m|x)|g)|d"); + + test_not_onepass!(onepass_not_in_rust, r"(\d+)-(\d+)"); + + test_onepass!(onepass_empty_alt_1_, r"(a|())b"); + test_not_onepass!(onepass_empty_alt_2_, r"(a|())a"); + + test_not_onepass!(onepass_rep_1_, r"a*a"); + test_not_onepass!(onepass_rep_2_, r"a+a"); + test_not_onepass!(onepass_rep_3_, r"a{4,8}a"); + test_not_onepass!(onepass_rep_4_, r"a{4,}a"); + test_onepass!(onepass_rep_5_, r"a{4}a"); + test_not_onepass!(onepass_rep_6_, r"a?a"); + + test_onepass!(onepass_rep_7_, r"a*b"); + test_onepass!(onepass_rep_8_, r"a+b"); + test_onepass!(onepass_rep_9_, r"a{4,8}b"); + test_onepass!(onepass_rep_10_, r"a{4,}b"); + test_onepass!(onepass_rep_11_, r"a{4}b"); + test_onepass!(onepass_rep_12_, r"a?b"); + + test_not_onepass!(onepass_concat_middle_1_, r"ab?bc"); + test_onepass!(onepass_concat_middle_2_, r"a(?:b|c)dc"); + + test_not_onepass!(onepass_unicode_class_1_, r"\d"); + test_not_onepass!(onepass_unicode_class_2_, r"\s"); + test_not_onepass!(onepass_unicode_class_3_, r"\w"); + test_not_onepass!(onepass_unicode_class_4_, r"inthe\wmiddle"); + + test_not_onepass!(onepass_unicode_clash_1_, r"Δ|δ"); + + test_not_onepass!(onepass_empty_assert_1_, r"a|^a"); + test_onepass!(onepass_empty_assert_2_, r"\ba"); + test_onepass!(onepass_empty_assert_3_, r"^a"); + test_onepass!(onepass_empty_assert_4_, r"a$"); + + test_not_onepass!(onepass_naked_empty_assert_1_, r"\w|a"); + } diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index f47ad9ce43..5e026dc8e1 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -106,6 +106,7 @@ done automatically in the `regex` crate. #![deny(missing_docs)] extern crate ucd_util; +extern crate utf8_ranges; pub use error::{Error, Result}; pub use parser::{Parser, ParserBuilder}; diff --git a/src/analysis.rs b/src/analysis.rs deleted file mode 100644 index 25d180074e..0000000000 --- a/src/analysis.rs +++ /dev/null @@ -1,616 +0,0 @@ -use syntax::hir::{ - Hir, HirKind, Literal, ClassBytes, ClassBytesRange, - Class, Visitor, RepetitionRange, RepetitionKind -}; -use syntax::hir; -use utf8_ranges::Utf8Sequences; - -/// True iff the given expression is onepass -/// -/// The general approach here is to find all the places in -/// the given Hir where any sort of branching occurs, -/// and examine the start of each expression at the branch -/// to see if there is an ambiguity. -/// -/// For example, given the regex `a|b`, we would examine -/// both branches of the alternation `a` and `b` and -/// notice that they don't clash, so the regex is onepass. -/// On the other hand the branches of `a|a` do clash, -/// so that regex is not onepass. -/// -/// Alternations are not the only branch points in a regex. -/// We also have to make sure to consider repetitions like -/// `a*a`, which is not onepass because there is no way -/// to tell whether we have to loop back to the repeated -/// expression or continue on by looking at just one byte. -/// `a*b` is onepass because you can figure out what to do. 
-/// If you see an `a`, go back to the start of the loop, -/// and if you see a `b` continue onward. -/// -/// A third, more subtle case is the case of concatenations -/// of expressions where some of the expressions can -/// accept the empty string. Consider `a(b|)ba`. This -/// regex is not onepass because it is not clear what to -/// do upon seeing the input `ab`. The problem is that `(b|)` -/// and `ba` clash with one other. -/// -/// To get a bit more specific about what it means for two -/// expressions to clash, we introduce the concept of first -/// sets. The first set of an expression is the set of -/// bytes which might begin a word in the language of that -/// expression. If the expression can accept the empty string, -/// the first set takes note of that as well. -/// -/// To handle these three cases, we use a visitor to -/// find the alternations, repetitions, and concatenations. -/// Whenever we find one of the above cases, we compute -/// the first set of the various branches involved, -/// then check to see if the first sets intersect. If -/// we ever find a non-empty intersection, the regex -/// is not onepass. -pub fn is_onepass(expr: &Hir) -> bool { - hir::visit(expr, IsOnePassVisitor::new()).unwrap() -} - -struct IsOnePassVisitor(bool); - -impl Visitor for IsOnePassVisitor { - type Output = bool; - type Err = (); - - fn finish(self) -> Result { - Ok(self.0) - } - - fn visit_pre(&mut self, hir: &Hir) -> Result<(), ()> { - if !self.0 { - return Ok(()) - } - - match hir.kind() { - &HirKind::Concat(ref es) => self.check_concat(&es), - &HirKind::Alternation(ref es) => self.check_alternation(&es), - &HirKind::Repetition(ref rep) => { - if fset_of(&*rep.hir).is_empty() { - self.0 = false; - } - } - &HirKind::Class(ref cls) => self.check_cls(cls), - _ => () - } - - Ok(()) - } -} - -impl IsOnePassVisitor { - fn new() -> Self { - IsOnePassVisitor(true) - } - - fn check_concat(&mut self, es: &[Hir]) { - let mut empty_run = vec![]; - - for e in NestedConcat::new(es) { - let is_rep = match e.kind() { - &HirKind::Repetition(_) => true, - _ => false, - }; - - empty_run.push(e); - if !(accepts_empty(e) || is_rep) { - self.0 = self.0 && !fsets_clash(&empty_run); - empty_run.clear(); - } - } - - if empty_run.len() > 0 { - self.0 = self.0 && !fsets_clash(&empty_run); - } - } - - fn check_alternation(&mut self, es: &[Hir]) { - self.0 = self.0 && !fsets_clash(&es.iter().collect::>()); - } - - // Unicode classes are really just big alternatives from the byte - // oriented point of view. - // - // This function translates a unicode class into the - // byte space and checks for intersecting first sets. - fn check_cls(&mut self, cls: &Class) { - match cls { - &Class::Unicode(ref ucls) => { - let mut seen_char: [bool; 256] = [false; 256]; - - for cr in ucls.iter() { - for br in Utf8Sequences::new(cr.start(), cr.end()) { - let first = br.as_slice()[0]; - for b in first.start..(first.end+1) { - if seen_char[b as usize] { - self.0 = false; - return; - } - seen_char[b as usize] = true; - } - } - } - } - _ => {} - } - } - -} - -/// Check if a list of first sets is incompatible. -/// -/// O(n^2), but n will usually be quite small. -fn fsets_clash(es: &[&Hir]) -> bool { - for (i, e1) in es.iter().enumerate() { - for (j, e2) in es.iter().enumerate() { - if i != j { - let mut fset = fset_of(e1); - let fset2 = fset_of(e2); - - fset.intersect(&fset2); - if ! fset.is_empty() { - return true; - } - } - } - } - false -} - - -/// Compute the first set of a given regular expression. 
-/// -/// The first set of a regular expression is the set of all bytes -/// which might begin it. This is a less general version of the -/// notion of a regular expression preview (the first set can be -/// thought of as the 1-preview of a regular expression). -/// -/// Note that first sets are byte-oriented because the DFA is -/// byte oriented. This means an expression like /Δ|δ/ is actually not -/// onepass, even though there is clearly no non-determinism inherent -/// to the regex at a unicode code point level (big delta and little -/// delta start with the same byte). -fn fset_of(expr: &Hir) -> FirstSet { - fn singleton(b: u8) -> FirstSet { - let mut f = FirstSet::empty(); - f.push_bytes(ClassBytesRange::new(b, b)); - f - } - - // First compute the set of characters that might begin - // the expression (ignoring epsilon for now). - let mut f_char_set = match expr.kind() { - &HirKind::Empty => FirstSet::epsilon(), - &HirKind::Literal(ref lit) => { - match lit { - &Literal::Unicode(c) => singleton(first_byte(c)), - &Literal::Byte(b) => singleton(b), - } - } - &HirKind::Class(ref class) => { - match class { - &Class::Unicode(ref c) => { - // Get all the bytes which might begin this unicode - // class. - let mut cb = FirstSet::empty(); - for cr in c.iter() { - for br in Utf8Sequences::new(cr.start(), cr.end()) { - let first = br.as_slice()[0]; - cb.push_bytes( - ClassBytesRange::new(first.start, first.end)); - } - } - cb - } - &Class::Bytes(ref b) => - FirstSet::new(b.iter().map(|x| *x), false), - } - } - - // When an empty look (Anchor or WordBoundary) is at the start of - // a concatenation, we conservatively assume that the assertion - // will pass, so we just drop it. Then we can only get to this - // point if we are dealing with some sort of naked empty look. - // For now we just do the most conservative thing and say - // that such an emptylook could potentially match on any character. - &HirKind::Anchor(_) | &HirKind::WordBoundary(_) => FirstSet::anychar(), - - &HirKind::Repetition(ref rep) => fset_of(&rep.hir), - &HirKind::Group(ref group) => fset_of(&group.hir), - - // The most involved case. We need to strip leading empty-looks - // as well as take the union of the first sets of the first n+1 - // expressions where n is the number of leading expressions which - // accept the empty string. - &HirKind::Concat(ref es) => { - let mut fset = FirstSet::empty(); - for (i, e) in es.iter().enumerate() { - match e.kind() { - &HirKind::Anchor(_) | &HirKind::WordBoundary(_) => { - // Ignore any leading emptylooks, but any in tail - // position have to be accounted for. - if i == es.len() - 1 { - fset.union(&FirstSet::anychar()); - } - } - _ => { - let inner_fset = fset_of(e); - fset.union(&inner_fset); - - if !accepts_empty(e) { - // We can stop accumulating after we stop seeing - // first sets which contain epsilon. - break; - } - } - } - } - fset - } - &HirKind::Alternation(ref es) => { - let mut fset = FirstSet::empty(); - for e in es { - fset.union(&fset_of(e)); - } - fset - } - }; - - f_char_set.accepts_empty = accepts_empty(expr); - f_char_set -} - -fn accepts_empty(expr: &Hir) -> bool { - match expr.kind() { - &HirKind::Empty => true, - &HirKind::Literal(_) => false, - &HirKind::Class(_) => false, - - // A naked empty look is a pretty weird thing because we - // normally strip them from the beginning of concatinations. 
- // We are just going to treat them like `.` - &HirKind::Anchor(_) | &HirKind::WordBoundary(_) => false, - - &HirKind::Repetition(ref rep) => { - match rep.kind { - RepetitionKind::ZeroOrOne => true, - RepetitionKind::ZeroOrMore => true, - RepetitionKind::OneOrMore => accepts_empty(&rep.hir), - RepetitionKind::Range(ref range) => { - match range { - &RepetitionRange::Exactly(0) - | &RepetitionRange::AtLeast(0) - | &RepetitionRange::Bounded(0, _) => true, - _ => accepts_empty(&rep.hir), - } - } - } - } - - &HirKind::Group(ref group) => accepts_empty(&group.hir), - - &HirKind::Concat(ref es) => { - let mut accepts: bool = true; - for e in es.iter() { - match e.kind() { - &HirKind::Anchor(_) | &HirKind::WordBoundary(_) => { - // Ignore any leading emptylooks. - } - _ => { - accepts = accepts && accepts_empty(&e); - } - } - - if !accepts { - break; - } - } - accepts - } - - &HirKind::Alternation(ref es) => es.iter().any(accepts_empty) - } -} - -/// The first byte of a unicode code point. -/// -/// We only ever care about the first byte of a particular character -/// because the onepass DFA is implemented in the byte space not the -/// character space. This means, for example, that a branch between -/// lowercase delta and uppercase delta is actually non-deterministic. -fn first_byte(c: char) -> u8 { - let mut b: [u8; 4] = [0; 4]; - c.encode_utf8(&mut b); - b[0] -} - -/// A representation of all the possible ways a word in the language -/// of a regex could begin. ClassBytes has no way to express the empty -/// string, so we add an extra flag to indicate if a FirstSet includes -/// epsilon. Put in a more theoretical way all firstsets are subsets of -/// SIGMA `union` { epsilon }. -#[derive(Debug, PartialEq, Eq)] -struct FirstSet { - bytes: ClassBytes, - pub accepts_empty: bool, -} - -impl FirstSet { - fn empty() -> Self { - FirstSet { - bytes: ClassBytes::empty(), - accepts_empty: false, - } - } - - pub fn new(ranges: I, accepts_empty: bool) -> Self - where I: IntoIterator - { - FirstSet { - bytes: ClassBytes::new(ranges), - accepts_empty: accepts_empty, - } - } - - fn anychar() -> FirstSet { - let mut f = FirstSet::empty(); - f.push_bytes(ClassBytesRange::new(b'\0', b'\xFF')); - f - } - - fn epsilon() -> FirstSet { - FirstSet { - bytes: ClassBytes::empty(), - accepts_empty: true, - } - } - - fn push_bytes(&mut self, byte_range: ClassBytesRange) { - self.bytes.push(byte_range); - } - - fn union(&mut self, other: &FirstSet) { - self.bytes.union(&other.bytes); - self.accepts_empty = self.accepts_empty || other.accepts_empty; - } - - fn intersect(&mut self, other: &FirstSet) { - self.bytes.intersect(&other.bytes); - self.accepts_empty = self.accepts_empty && other.accepts_empty; - } - - fn is_empty(&self) -> bool { - self.bytes.is_empty() && !self.accepts_empty - } -} - -/// An iterator over a concatenation of expressions which -/// drills down into other embedded concatenations. 
-struct NestedConcat<'a>(Vec<(&'a [Hir], usize)>); -impl<'a> NestedConcat<'a> { - fn new(es: &'a [Hir]) -> Self { - NestedConcat(vec![(es, 0)]) - } -} -impl<'a> Iterator for NestedConcat<'a> { - type Item = &'a Hir; - - fn next(&mut self) -> Option<&'a Hir> { - loop { - if self.0.len() == 0 { - return None; - } - - let tip = self.0.len() - 1; - let (es, idx) = self.0[tip]; - - if idx >= es.len() { - self.0.pop(); - continue; - } - - self.0[tip].1 += 1; - - match es[idx].kind() { - &HirKind::Concat(ref es) => { - self.0.push((es, 0)); - continue; - // self.next() - } - _ => return Some(&es[idx]), - } - } - } -} - -#[cfg(test)] -mod tests { - use syntax::Parser; - use syntax::hir::Hir; - use super::*; - - fn is_intersecting_fset(e1: &Hir, e2: &Hir) -> bool { - let mut fset = fset_of(e1); - fset.intersect(&fset_of(e2)); - ! fset.is_empty() - } - - // - // First Set intersection smoke tests - // - - #[test] - fn fset_lit() { - let e1 = Parser::new().parse("a").unwrap(); - let e2 = Parser::new().parse("a").unwrap(); - let e3 = Parser::new().parse("b").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - assert!(!is_intersecting_fset(&e1, &e3)); - } - - #[test] - fn fset_class() { - let e1 = Parser::new().parse("[a]").unwrap(); - let e2 = Parser::new().parse("[a]").unwrap(); - let e3 = Parser::new().parse("[b]").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - assert!(!is_intersecting_fset(&e1, &e3)); - } - - #[test] - fn fset_class_n() { - let e1 = Parser::new().parse("[xamn]").unwrap(); - let e2 = Parser::new().parse("[rlwa]").unwrap(); - let e3 = Parser::new().parse("[bcq]").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - assert!(!is_intersecting_fset(&e1, &e3)); - } - - #[test] - fn fset_alt() { - let e1 = Parser::new().parse("ab|bc|ad").unwrap(); - let e2 = Parser::new().parse("yyyy|am|zz").unwrap(); - let e3 = Parser::new().parse("cc|ww").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - assert!(!is_intersecting_fset(&e1, &e3)); - } - - #[test] - fn fset_group() { - let e1 = Parser::new().parse("(?:ab)").unwrap(); - let e2 = Parser::new().parse("(?:aq)").unwrap(); - let e3 = Parser::new().parse("(?:m)").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - assert!(!is_intersecting_fset(&e1, &e3)); - } - - #[test] - fn fset_concat() { - let e1 = Parser::new().parse("aa(?:nb)").unwrap(); - let e2 = Parser::new().parse("aa(?:rq)").unwrap(); - let e3 = Parser::new().parse("bb(?:m)").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - assert!(!is_intersecting_fset(&e1, &e3)); - } - - #[test] - fn fset_word_boundary_dropped() { - let e1 = Parser::new().parse(r"aa").unwrap(); - let e2 = Parser::new().parse(r"\baa").unwrap(); - let e3 = Parser::new().parse(r"\bbb").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - assert!(!is_intersecting_fset(&e1, &e3)); - } - - #[test] - fn fset_word_boundary_all() { - let e1 = Parser::new().parse(r"aa").unwrap(); - let e2 = Parser::new().parse(r"\b").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - } - - #[test] - fn fset_not_word_boundary_dropped() { - let e1 = Parser::new().parse(r"aa").unwrap(); - let e2 = Parser::new().parse(r"\Baa").unwrap(); - let e3 = Parser::new().parse(r"\Bbb").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - assert!(!is_intersecting_fset(&e1, &e3)); - } - - #[test] - fn fset_not_word_boundary_all() { - let e1 = Parser::new().parse(r"aa").unwrap(); - let e2 = Parser::new().parse(r"\B").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - } - - #[test] - fn 
fset_start_anchor_dropped() { - let e1 = Parser::new().parse(r"aa").unwrap(); - let e2 = Parser::new().parse(r"^aa").unwrap(); - let e3 = Parser::new().parse(r"^bb").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - assert!(!is_intersecting_fset(&e1, &e3)); - } - - #[test] - fn fset_terminal_emptylook_all() { - let e = Parser::new().parse(r"a*\b").unwrap(); - - let mut total_accept = FirstSet::anychar(); - total_accept.accepts_empty = true; - - assert_eq!(total_accept, fset_of(&e)); - } - - #[test] - fn fset_empty_alt() { - let e1 = Parser::new().parse(r"(?:a|())b").unwrap(); - let e2 = Parser::new().parse(r"b").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - } - - // - // onepass smoke tests - // - - // This test is pulled right from some of Russ Cox's - // comments on onepass regex. - // - // Note that Russ Cox's other example of a onepass regex - // (r"(\d+)-(\d+)") is actually not onepass for us because - // there is byte-level nondeterminism in the \d character - // class, and we care about things in the byte space rather - // than the character space. If you do a onepass engine at - // the character level, Cox's example is indeed onepass. - #[test] - fn is_onepass_smoke_test1() { - let e1 = Parser::new().parse(r"([^x]*)x(.*)").unwrap(); - let e2 = Parser::new().parse(r"(.*)x(.*)").unwrap(); - - assert!(is_onepass(&e1)); - assert!(!is_onepass(&e2)); - } - - #[test] - fn is_onepass_empty_alt() { - let e1 = Parser::new().parse(r"(a|())b").unwrap(); - let e2 = Parser::new().parse(r"(a|())a").unwrap(); - - assert!(is_onepass(&e1)); - assert!(!is_onepass(&e2)); - } - - #[test] - fn is_onepass_rep() { - let e1 = Parser::new().parse(r"a+a").unwrap(); - let e2 = Parser::new().parse(r"a*a").unwrap(); - - assert!(!is_onepass(&e1)); - assert!(!is_onepass(&e2)); - } - - #[test] - fn is_onepass_clash_in_middle_of_concat() { - let e = Parser::new().parse(r"ab?b").unwrap(); - assert!(!is_onepass(&e)); - } -} diff --git a/src/lib.rs b/src/lib.rs index 4a901f983b..d35042492e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -668,7 +668,6 @@ mod re_unicode; mod sparse; #[cfg(any(regex_runtime_teddy_ssse3, regex_runtime_teddy_avx2))] mod vector; -mod analysis; mod onepass; /// The `internal` module exists to support suspicious activity, such as diff --git a/src/onepass.rs b/src/onepass.rs index f6a7c68a70..442204b6b2 100644 --- a/src/onepass.rs +++ b/src/onepass.rs @@ -27,7 +27,6 @@ use prog::{Program, Inst, EmptyLook}; use literal::LiteralSearcher; use re_trait::Slot; use input::{ByteInput, Input}; -use analysis; use compile::Compiler; use syntax::hir::Hir; use re_builder::RegexOptions; @@ -530,7 +529,7 @@ impl OnePassCompiler { return Err(OnePassError::RegexSetUnsupported); } - if ! analysis::is_onepass(&es[0]) { + if ! es[0].is_onepass() { return Err(OnePassError::HasNondeterminism); } From 6ec93a6f992d7a5cda3afe290c70cd43e3401aff Mon Sep 17 00:00:00 2001 From: Ethan Pailes Date: Sun, 23 Sep 2018 03:59:26 -0400 Subject: [PATCH 6/8] Revert "refactor is_onepass to be part of Hir construction." This reverts commit 6355ce519b115f1b4c3bcbaac1b824e1e5151309. 
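
For readers following the series: the practical effect on the onepass
compiler itself is that the check moves back out of `Hir`, so the call
site in `src/onepass.rs` returns to roughly the shape it had before
PATCH 5 (sketch):

    if !analysis::is_onepass(&es[0]) {
        return Err(OnePassError::HasNondeterminism);
    }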
--- regex-syntax/Cargo.toml | 2 - regex-syntax/src/hir/mod.rs | 583 +--------------------------------- regex-syntax/src/lib.rs | 1 - src/analysis.rs | 616 ++++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + src/onepass.rs | 3 +- 6 files changed, 633 insertions(+), 573 deletions(-) create mode 100644 src/analysis.rs diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index 2e9724c1aa..ef562eca0e 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -11,5 +11,3 @@ workspace = ".." [dependencies] ucd-util = "0.1.0" -# For extracting the first set from a unicode class -utf8-ranges = "1.0.1" diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index fe3f4254c5..fb82dd9425 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -20,15 +20,14 @@ use std::u8; use ast::Span; use hir::interval::{Interval, IntervalSet, IntervalSetIter}; use unicode; -use utf8_ranges::Utf8Sequences; pub use hir::visitor::{Visitor, visit}; mod interval; -mod visitor; pub mod literal; pub mod print; pub mod translate; +mod visitor; /// An error that can occur while translating an `Ast` to a `Hir`. #[derive(Clone, Debug, Eq, PartialEq)] @@ -218,9 +217,7 @@ impl Hir { /// /// An empty HIR expression always matches, including the empty string. pub fn empty() -> Hir { - let mut info = HirInfo::new(FirstSet::epsilon()); - info.first_set.accepts_empty = true; - info.set_onepass(true); + let mut info = HirInfo::new(); info.set_always_utf8(true); info.set_all_assertions(true); info.set_anchored_start(false); @@ -243,9 +240,8 @@ impl Hir { if let Literal::Byte(b) = lit { assert!(b > 0x7F); } - let mut info = HirInfo::new(Self::literal_first_set(&lit)); - info.first_set.accepts_empty = false; - info.set_onepass(true); + + let mut info = HirInfo::new(); info.set_always_utf8(lit.is_unicode()); info.set_all_assertions(false); info.set_anchored_start(false); @@ -258,24 +254,10 @@ impl Hir { info: info, } } - fn literal_first_set(lit: &Literal) -> FirstSet { - fn singleton(b: u8) -> FirstSet { - let mut f = FirstSet::empty(); - f.push_bytes(ClassBytesRange::new(b, b)); - f - } - - match lit { - &Literal::Unicode(c) => singleton(first_byte(c)), - &Literal::Byte(b) => singleton(b), - } - } /// Creates a class HIR expression. pub fn class(class: Class) -> Hir { - let mut info = HirInfo::new(Self::class_first_set(&class)); - info.first_set.accepts_empty = false; - info.set_onepass(Self::class_is_onepass(&class)); + let mut info = HirInfo::new(); info.set_always_utf8(class.is_always_utf8()); info.set_all_assertions(false); info.set_anchored_start(false); @@ -288,67 +270,10 @@ impl Hir { info: info, } } - fn class_first_set(class: &Class) -> FirstSet { - match class { - &Class::Unicode(ref c) => { - // Get all the bytes which might begin this unicode - // class. - let mut cb = FirstSet::empty(); - for cr in c.iter() { - for br in Utf8Sequences::new(cr.start(), cr.end()) { - let first = br.as_slice()[0]; - cb.push_bytes( - ClassBytesRange::new(first.start, first.end)); - } - } - cb - } - &Class::Bytes(ref b) => - FirstSet::new(b.iter().map(|x| *x), false), - } - } - // Unicode classes are really just big alternatives from the byte - // oriented point of view. - // - // This function translates a unicode class into the - // byte space and checks for intersecting first sets. 
- // - // Byte classes are always onepass - fn class_is_onepass(cls: &Class) -> bool { - match cls { - &Class::Unicode(ref ucls) => { - let mut seen_char: [bool; 256] = [false; 256]; - - for cr in ucls.iter() { - for br in Utf8Sequences::new(cr.start(), cr.end()) { - let first = br.as_slice()[0]; - for b in first.start..(first.end+1) { - if seen_char[b as usize] { - return false; - } - seen_char[b as usize] = true; - } - } - } - } - _ => {} - } - - true - } - /// Creates an anchor assertion HIR expression. pub fn anchor(anchor: Anchor) -> Hir { - // When an empty look (Anchor or WordBoundary) is at the start of - // a concatenation, we conservatively assume that the assertion - // will pass, so we just scan forward to the next Hir to - // compute the first set. Then this first set won't spread - // to concatenations that an anchor starts. - let mut info = HirInfo::new(FirstSet::anychar()); - - info.first_set.accepts_empty = true; - info.set_onepass(true); + let mut info = HirInfo::new(); info.set_always_utf8(true); info.set_all_assertions(true); info.set_anchored_start(false); @@ -372,11 +297,7 @@ impl Hir { /// Creates a word boundary assertion HIR expression. pub fn word_boundary(word_boundary: WordBoundary) -> Hir { - // We use FirstSet::anychar here for the same reason as - // we do for an anchor. - let mut info = HirInfo::new(FirstSet::anychar()); - info.first_set.accepts_empty = false; - info.set_onepass(true); + let mut info = HirInfo::new(); info.set_always_utf8(true); info.set_all_assertions(true); info.set_anchored_start(false); @@ -398,18 +319,7 @@ impl Hir { /// Creates a repetition HIR expression. pub fn repetition(rep: Repetition) -> Hir { - let mut info = HirInfo::new( - rep.hir.info.first_set.clone()); - - info.first_set.accepts_empty = - Self::repetition_accepts_empty(&rep); - info.set_onepass( - rep.hir.info.is_onepass() && - // If we are repeating an expression with no - // trigger bytes, DFA construction will run into - // trouble when it tries to figure out the forwarding - // cycle. - !rep.hir.info.first_set.is_empty()); + let mut info = HirInfo::new(); info.set_always_utf8(rep.hir.is_always_utf8()); info.set_all_assertions(rep.hir.is_all_assertions()); // If this operator can match the empty string, then it can never @@ -428,30 +338,10 @@ impl Hir { info: info, } } - fn repetition_accepts_empty(rep: &Repetition) -> bool { - match rep.kind { - RepetitionKind::ZeroOrOne => true, - RepetitionKind::ZeroOrMore => true, - RepetitionKind::OneOrMore => rep.hir.info.first_set.accepts_empty, - RepetitionKind::Range(ref range) => { - match range { - &RepetitionRange::Exactly(0) - | &RepetitionRange::AtLeast(0) - | &RepetitionRange::Bounded(0, _) => true, - _ => rep.hir.info.first_set.accepts_empty, - } - } - } - } /// Creates a group HIR expression. pub fn group(group: Group) -> Hir { - let mut info = HirInfo::new( - group.hir.info.first_set.clone()); - - // We already know accepts_empty from our inner Hir. - // No need to compute it here. 
- info.set_onepass(group.hir.info.is_onepass()); + let mut info = HirInfo::new(); info.set_always_utf8(group.hir.is_always_utf8()); info.set_all_assertions(group.hir.is_all_assertions()); info.set_anchored_start(group.hir.is_anchored_start()); @@ -473,9 +363,7 @@ impl Hir { 0 => Hir::empty(), 1 => exprs.pop().unwrap(), _ => { - let mut info = HirInfo::new( - Self::concat_first_set(&exprs)); - info.set_onepass(Self::concat_is_onepass(&exprs)); + let mut info = HirInfo::new(); info.set_always_utf8(true); info.set_all_assertions(true); info.set_any_anchored_start(false); @@ -537,68 +425,6 @@ impl Hir { } } } - fn concat_first_set(es: &[Hir]) -> FirstSet { - debug_assert!(es.len() >= 2); - - let mut fset = FirstSet::empty(); - for (i, e) in NestedConcat::new(es).enumerate() { - match e.kind() { - &HirKind::Anchor(_) | &HirKind::WordBoundary(_) - if i < es.len() - 1 => { - continue; - } - _ => {} // FALLTHROUGH - } - - fset.union(&e.info.first_set); - - if ! e.info.first_set.accepts_empty { - fset.accepts_empty = false; - // We can stop accumulating after we stop seeing - // first sets which contain epsilon. - break; - } - } - fset - } - fn concat_is_onepass(es: &[Hir]) -> bool { - let mut empty_run = vec![]; - - for (i, e) in NestedConcat::new(es).enumerate() { - match e.kind() { - &HirKind::Anchor(_) | &HirKind::WordBoundary(_) - if i < es.len() - 1 => { - continue; - } - _ => {} // FALLTHROUGH - } - - if !e.info.is_onepass() { - return false; - } - - let is_real_rep = match e.kind() { - &HirKind::Repetition(ref rep) => { - match rep.kind { - RepetitionKind::Range( - RepetitionRange::Exactly(_)) => false, - _ => true, - } - }, - _ => false, - }; - - empty_run.push(e); - if !(e.info.first_set.accepts_empty || is_real_rep) { - if FirstSet::fsets_clash_ref(&empty_run) { - return false; - } - empty_run.clear(); - } - } - - ! FirstSet::fsets_clash_ref(&empty_run) - } /// Returns the alternation of the given expressions. /// @@ -608,12 +434,7 @@ impl Hir { 0 => Hir::empty(), 1 => exprs.pop().unwrap(), _ => { - let mut info = HirInfo::new( - Self::alternation_first_set(&exprs)); - // The union operate should make sure that we have - // the correct value for accepts_empty here. There is - // no special fixup like in concat. - info.set_onepass(!FirstSet::fsets_clash_value(&exprs)); + let mut info = HirInfo::new(); info.set_always_utf8(true); info.set_all_assertions(true); info.set_anchored_start(true); @@ -656,13 +477,6 @@ impl Hir { } } } - fn alternation_first_set(es: &[Hir]) -> FirstSet { - let mut fset = FirstSet::empty(); - for e in es { - fset.union(&e.info.first_set); - } - fset - } /// Build an HIR expression for `.`. /// @@ -706,12 +520,6 @@ impl Hir { } } - /// Return true if and only if this HIR contains no byte-level - /// ambiguities. - pub fn is_onepass(&self) -> bool { - self.info.is_onepass() - } - /// Return true if and only if this HIR will always match valid UTF-8. /// /// When this returns false, then it is possible for this HIR expression @@ -1306,147 +1114,6 @@ impl fmt::Debug for ClassBytesRange { } } -/// A representation of all the possible ways a word in the language -/// of a regex could begin. ClassBytes has no way to express the empty -/// string, so we add an extra flag to indicate if a FirstSet includes -/// epsilon. Put in a more theoretical way all firstsets are subsets of -/// SIGMA `union` { epsilon }. -#[derive(Debug, PartialEq, Eq, Clone)] -pub struct FirstSet { - /// The bytes which might start a word in the language. 
- bytes: ClassBytes, - /// True iff the language accepts the empty string. - accepts_empty: bool, -} - -/// A macro to define the fsets_clash associated functions, -/// parameterized over the type of the inner slice. This lets -/// us avoid allocating an extra vector when we check -/// alternations for onepassness. -macro_rules! def_fsets_clash { - ($fun_name:ident, $slice_inner:ty) => { - /// Check if a list of first sets is incompatible. - fn $fun_name(es: &[$slice_inner]) -> bool { - let mut seen_so_far = FirstSet::empty(); - - for e in es.iter() { - let mut snapshot = seen_so_far.clone(); - snapshot.intersect(&e.info.first_set); - if ! snapshot.is_empty() { - return true; - } - - seen_so_far.union(&e.info.first_set); - } - - false - } - } -} -impl FirstSet { - /// A convenience method to create a FirstSet which accepts - /// nothing. - fn empty() -> Self { - FirstSet { - bytes: ClassBytes::empty(), - accepts_empty: false, - } - } - - /// A convenience method to create a FirstSet which accepts - /// everything, but can't be empty. - fn anychar() -> FirstSet { - let mut f = FirstSet::empty(); - f.push_bytes(ClassBytesRange::new(b'\0', b'\xFF')); - f - } - - /// A convenience method to create a FirstSet which accepts - /// nothing but the empty string. - fn epsilon() -> FirstSet { - FirstSet { - bytes: ClassBytes::empty(), - accepts_empty: true, - } - } - - /// Direct constructor. - pub fn new(ranges: I, accepts_empty: bool) -> Self - where I: IntoIterator - { - FirstSet { - bytes: ClassBytes::new(ranges), - accepts_empty: accepts_empty, - } - } - - /// Add a byte range to the byte ranges that the FirstSet - /// accepts. - fn push_bytes(&mut self, byte_range: ClassBytesRange) { - self.bytes.push(byte_range); - } - - /// Take the set union of two FirstSets, mutating the lhs - /// to contain the result. - fn union(&mut self, other: &FirstSet) { - self.bytes.union(&other.bytes); - self.accepts_empty = self.accepts_empty || other.accepts_empty; - } - - /// Take the set intersection of two FirstSets, mutating the lhs - /// to contain the result. - fn intersect(&mut self, other: &FirstSet) { - self.bytes.intersect(&other.bytes); - self.accepts_empty = self.accepts_empty && other.accepts_empty; - } - - /// True iff the FirstSet accepts nothing, not even the empty string. - fn is_empty(&self) -> bool { - self.bytes.is_empty() && !self.accepts_empty - } - - def_fsets_clash!(fsets_clash_ref, &Hir); - def_fsets_clash!(fsets_clash_value, Hir); -} - -/// An iterator over a concatenation of expressions which -/// drills down into other embedded concatenations. -struct NestedConcat<'a>(Vec<(&'a [Hir], usize)>); -impl<'a> NestedConcat<'a> { - fn new(es: &'a [Hir]) -> Self { - NestedConcat(vec![(es, 0)]) - } -} -impl<'a> Iterator for NestedConcat<'a> { - type Item = &'a Hir; - - fn next(&mut self) -> Option<&'a Hir> { - loop { - if self.0.len() == 0 { - return None; - } - - let tip = self.0.len() - 1; - let (es, idx) = self.0[tip]; - - if idx >= es.len() { - self.0.pop(); - continue; - } - - self.0[tip].1 += 1; - - match es[idx].kind() { - &HirKind::Concat(ref es) => { - self.0.push((es, 0)); - continue; - } - _ => return Some(&es[idx]), - } - } - } -} - /// The high-level intermediate representation for an anchor assertion. /// /// A matching anchor assertion is always zero-length. @@ -1653,9 +1320,6 @@ struct HirInfo { /// If more attributes need to be added, it is OK to increase the size of /// this as appropriate. 
bools: u8, - /// A description of how words in the language of this expression - /// might start. - first_set: FirstSet, } // A simple macro for defining bitfield accessors/mutators. @@ -1676,10 +1340,9 @@ macro_rules! define_bool { } impl HirInfo { - fn new(fs: FirstSet) -> HirInfo { + fn new() -> HirInfo { HirInfo { bools: 0, - first_set: fs, } } @@ -1690,25 +1353,11 @@ impl HirInfo { define_bool!(4, is_any_anchored_start, set_any_anchored_start); define_bool!(5, is_any_anchored_end, set_any_anchored_end); define_bool!(6, is_match_empty, set_match_empty); - define_bool!(7, is_onepass, set_onepass); -} - -/// The first byte of a unicode code point. -/// -/// We only ever care about the first byte of a particular character -/// because the onepass DFA is implemented in the byte space not the -/// character space. This means, for example, that a branch between -/// lowercase delta and uppercase delta is actually non-deterministic. -fn first_byte(c: char) -> u8 { - let mut b: [u8; 4] = [0; 4]; - c.encode_utf8(&mut b); - b[0] } #[cfg(test)] mod tests { use super::*; - use parser::Parser; fn uclass(ranges: &[(char, char)]) -> ClassUnicode { let ranges: Vec = ranges @@ -2404,11 +2053,11 @@ mod tests { expr = Hir { kind: HirKind::Concat(vec![expr]), - info: HirInfo::new(FirstSet::empty()), + info: HirInfo::new(), }; expr = Hir { kind: HirKind::Alternation(vec![expr]), - info: HirInfo::new(FirstSet::empty()), + info: HirInfo::new(), }; } assert!(!expr.kind.is_empty()); @@ -2423,208 +2072,4 @@ mod tests { .join() .unwrap(); } - - // - // First Set intersection smoke tests - // - - fn is_intersecting_fset(e1: &Hir, e2: &Hir) -> bool { - let mut fset = e1.info.first_set.clone(); - fset.intersect(&e2.info.first_set); - ! fset.is_empty() - } - - #[test] - fn fset_lit() { - let e1 = Parser::new().parse("a").unwrap(); - let e2 = Parser::new().parse("a").unwrap(); - let e3 = Parser::new().parse("b").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - assert!(!is_intersecting_fset(&e1, &e3)); - } - - #[test] - fn fset_class() { - let e1 = Parser::new().parse("[a]").unwrap(); - let e2 = Parser::new().parse("[a]").unwrap(); - let e3 = Parser::new().parse("[b]").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - assert!(!is_intersecting_fset(&e1, &e3)); - } - - #[test] - fn fset_class_n() { - let e1 = Parser::new().parse("[xamn]").unwrap(); - let e2 = Parser::new().parse("[rlwa]").unwrap(); - let e3 = Parser::new().parse("[bcq]").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - assert!(!is_intersecting_fset(&e1, &e3)); - } - - #[test] - fn fset_alt() { - let e1 = Parser::new().parse("ab|bc|ad").unwrap(); - let e2 = Parser::new().parse("yyyy|am|zz").unwrap(); - let e3 = Parser::new().parse("cc|ww").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - assert!(!is_intersecting_fset(&e1, &e3)); - } - - #[test] - fn fset_group() { - let e1 = Parser::new().parse("(?:ab)").unwrap(); - let e2 = Parser::new().parse("(?:aq)").unwrap(); - let e3 = Parser::new().parse("(?:m)").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - assert!(!is_intersecting_fset(&e1, &e3)); - } - - #[test] - fn fset_concat() { - let e1 = Parser::new().parse("aa(?:nb)").unwrap(); - let e2 = Parser::new().parse("aa(?:rq)").unwrap(); - let e3 = Parser::new().parse("bb(?:m)").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - assert!(!is_intersecting_fset(&e1, &e3)); - } - - #[test] - fn fset_word_boundary_dropped() { - let e1 = Parser::new().parse(r"aa").unwrap(); - let e2 = 
Parser::new().parse(r"\baa").unwrap(); - let e3 = Parser::new().parse(r"\bbb").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - assert!(!is_intersecting_fset(&e1, &e3)); - } - - #[test] - fn fset_word_boundary_all() { - let e1 = Parser::new().parse(r"aa").unwrap(); - let e2 = Parser::new().parse(r"\b").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - } - - #[test] - fn fset_not_word_boundary_dropped() { - let e1 = Parser::new().parse(r"aa").unwrap(); - let e2 = Parser::new().parse(r"\Baa").unwrap(); - let e3 = Parser::new().parse(r"\Bbb").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - assert!(!is_intersecting_fset(&e1, &e3)); - } - - #[test] - fn fset_not_word_boundary_all() { - let e1 = Parser::new().parse(r"aa").unwrap(); - let e2 = Parser::new().parse(r"\B").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - } - - #[test] - fn fset_start_anchor_dropped() { - let e1 = Parser::new().parse(r"aa").unwrap(); - let e2 = Parser::new().parse(r"^aa").unwrap(); - let e3 = Parser::new().parse(r"^bb").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - assert!(!is_intersecting_fset(&e1, &e3)); - } - - #[test] - fn fset_terminal_emptylook_all() { - let e = Parser::new().parse(r"a*\b").unwrap(); - - assert_eq!(FirstSet::anychar(), e.info.first_set); - } - - #[test] - fn fset_empty_alt() { - let e1 = Parser::new().parse(r"(?:a|())b").unwrap(); - let e2 = Parser::new().parse(r"b").unwrap(); - - assert!(is_intersecting_fset(&e1, &e2)); - } - - // - // Onepass Unit Tests - // - - macro_rules! test_onepass { - ($fun_name:ident, $re_str:expr) => { - #[test] - fn $fun_name() { - let e = Parser::new().parse($re_str).unwrap(); - assert!( - e.info.is_onepass(), - "info={:?}", e.info); - } - } - } - - macro_rules! test_not_onepass { - ($fun_name:ident, $re_str:expr) => { - #[test] - fn $fun_name() { - let e = Parser::new().parse($re_str).unwrap(); - assert!( - !e.info.is_onepass(), - "info={:?}", e.info); - } - } - } - - test_onepass!(onepass_smoke_1_, r"[^x]x(.*)"); - test_not_onepass!(onepass_smoke_2_, r"(.*)x(.*)"); - - test_not_onepass!(onepass_alt_1_, r"a|b|c|a|d"); - test_not_onepass!(onepass_alt_2_, r"a|b|c|((m|a|x)|g)|d"); - test_onepass!(onepass_alt_3_, r"a|b|c|x|d"); - test_onepass!(onepass_alt_4_, r"a|b|c|((m|x)|g)|d"); - - test_not_onepass!(onepass_not_in_rust, r"(\d+)-(\d+)"); - - test_onepass!(onepass_empty_alt_1_, r"(a|())b"); - test_not_onepass!(onepass_empty_alt_2_, r"(a|())a"); - - test_not_onepass!(onepass_rep_1_, r"a*a"); - test_not_onepass!(onepass_rep_2_, r"a+a"); - test_not_onepass!(onepass_rep_3_, r"a{4,8}a"); - test_not_onepass!(onepass_rep_4_, r"a{4,}a"); - test_onepass!(onepass_rep_5_, r"a{4}a"); - test_not_onepass!(onepass_rep_6_, r"a?a"); - - test_onepass!(onepass_rep_7_, r"a*b"); - test_onepass!(onepass_rep_8_, r"a+b"); - test_onepass!(onepass_rep_9_, r"a{4,8}b"); - test_onepass!(onepass_rep_10_, r"a{4,}b"); - test_onepass!(onepass_rep_11_, r"a{4}b"); - test_onepass!(onepass_rep_12_, r"a?b"); - - test_not_onepass!(onepass_concat_middle_1_, r"ab?bc"); - test_onepass!(onepass_concat_middle_2_, r"a(?:b|c)dc"); - - test_not_onepass!(onepass_unicode_class_1_, r"\d"); - test_not_onepass!(onepass_unicode_class_2_, r"\s"); - test_not_onepass!(onepass_unicode_class_3_, r"\w"); - test_not_onepass!(onepass_unicode_class_4_, r"inthe\wmiddle"); - - test_not_onepass!(onepass_unicode_clash_1_, r"Δ|δ"); - - test_not_onepass!(onepass_empty_assert_1_, r"a|^a"); - test_onepass!(onepass_empty_assert_2_, r"\ba"); - test_onepass!(onepass_empty_assert_3_, r"^a"); - 
test_onepass!(onepass_empty_assert_4_, r"a$"); - - test_not_onepass!(onepass_naked_empty_assert_1_, r"\w|a"); - } diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index 5e026dc8e1..f47ad9ce43 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -106,7 +106,6 @@ done automatically in the `regex` crate. #![deny(missing_docs)] extern crate ucd_util; -extern crate utf8_ranges; pub use error::{Error, Result}; pub use parser::{Parser, ParserBuilder}; diff --git a/src/analysis.rs b/src/analysis.rs new file mode 100644 index 0000000000..25d180074e --- /dev/null +++ b/src/analysis.rs @@ -0,0 +1,616 @@ +use syntax::hir::{ + Hir, HirKind, Literal, ClassBytes, ClassBytesRange, + Class, Visitor, RepetitionRange, RepetitionKind +}; +use syntax::hir; +use utf8_ranges::Utf8Sequences; + +/// True iff the given expression is onepass +/// +/// The general approach here is to find all the places in +/// the given Hir where any sort of branching occurs, +/// and examine the start of each expression at the branch +/// to see if there is an ambiguity. +/// +/// For example, given the regex `a|b`, we would examine +/// both branches of the alternation `a` and `b` and +/// notice that they don't clash, so the regex is onepass. +/// On the other hand the branches of `a|a` do clash, +/// so that regex is not onepass. +/// +/// Alternations are not the only branch points in a regex. +/// We also have to make sure to consider repetitions like +/// `a*a`, which is not onepass because there is no way +/// to tell whether we have to loop back to the repeated +/// expression or continue on by looking at just one byte. +/// `a*b` is onepass because you can figure out what to do. +/// If you see an `a`, go back to the start of the loop, +/// and if you see a `b` continue onward. +/// +/// A third, more subtle case is the case of concatenations +/// of expressions where some of the expressions can +/// accept the empty string. Consider `a(b|)ba`. This +/// regex is not onepass because it is not clear what to +/// do upon seeing the input `ab`. The problem is that `(b|)` +/// and `ba` clash with one other. +/// +/// To get a bit more specific about what it means for two +/// expressions to clash, we introduce the concept of first +/// sets. The first set of an expression is the set of +/// bytes which might begin a word in the language of that +/// expression. If the expression can accept the empty string, +/// the first set takes note of that as well. +/// +/// To handle these three cases, we use a visitor to +/// find the alternations, repetitions, and concatenations. +/// Whenever we find one of the above cases, we compute +/// the first set of the various branches involved, +/// then check to see if the first sets intersect. If +/// we ever find a non-empty intersection, the regex +/// is not onepass. 
+pub fn is_onepass(expr: &Hir) -> bool { + hir::visit(expr, IsOnePassVisitor::new()).unwrap() +} + +struct IsOnePassVisitor(bool); + +impl Visitor for IsOnePassVisitor { + type Output = bool; + type Err = (); + + fn finish(self) -> Result { + Ok(self.0) + } + + fn visit_pre(&mut self, hir: &Hir) -> Result<(), ()> { + if !self.0 { + return Ok(()) + } + + match hir.kind() { + &HirKind::Concat(ref es) => self.check_concat(&es), + &HirKind::Alternation(ref es) => self.check_alternation(&es), + &HirKind::Repetition(ref rep) => { + if fset_of(&*rep.hir).is_empty() { + self.0 = false; + } + } + &HirKind::Class(ref cls) => self.check_cls(cls), + _ => () + } + + Ok(()) + } +} + +impl IsOnePassVisitor { + fn new() -> Self { + IsOnePassVisitor(true) + } + + fn check_concat(&mut self, es: &[Hir]) { + let mut empty_run = vec![]; + + for e in NestedConcat::new(es) { + let is_rep = match e.kind() { + &HirKind::Repetition(_) => true, + _ => false, + }; + + empty_run.push(e); + if !(accepts_empty(e) || is_rep) { + self.0 = self.0 && !fsets_clash(&empty_run); + empty_run.clear(); + } + } + + if empty_run.len() > 0 { + self.0 = self.0 && !fsets_clash(&empty_run); + } + } + + fn check_alternation(&mut self, es: &[Hir]) { + self.0 = self.0 && !fsets_clash(&es.iter().collect::>()); + } + + // Unicode classes are really just big alternatives from the byte + // oriented point of view. + // + // This function translates a unicode class into the + // byte space and checks for intersecting first sets. + fn check_cls(&mut self, cls: &Class) { + match cls { + &Class::Unicode(ref ucls) => { + let mut seen_char: [bool; 256] = [false; 256]; + + for cr in ucls.iter() { + for br in Utf8Sequences::new(cr.start(), cr.end()) { + let first = br.as_slice()[0]; + for b in first.start..(first.end+1) { + if seen_char[b as usize] { + self.0 = false; + return; + } + seen_char[b as usize] = true; + } + } + } + } + _ => {} + } + } + +} + +/// Check if a list of first sets is incompatible. +/// +/// O(n^2), but n will usually be quite small. +fn fsets_clash(es: &[&Hir]) -> bool { + for (i, e1) in es.iter().enumerate() { + for (j, e2) in es.iter().enumerate() { + if i != j { + let mut fset = fset_of(e1); + let fset2 = fset_of(e2); + + fset.intersect(&fset2); + if ! fset.is_empty() { + return true; + } + } + } + } + false +} + + +/// Compute the first set of a given regular expression. +/// +/// The first set of a regular expression is the set of all bytes +/// which might begin it. This is a less general version of the +/// notion of a regular expression preview (the first set can be +/// thought of as the 1-preview of a regular expression). +/// +/// Note that first sets are byte-oriented because the DFA is +/// byte oriented. This means an expression like /Δ|δ/ is actually not +/// onepass, even though there is clearly no non-determinism inherent +/// to the regex at a unicode code point level (big delta and little +/// delta start with the same byte). +fn fset_of(expr: &Hir) -> FirstSet { + fn singleton(b: u8) -> FirstSet { + let mut f = FirstSet::empty(); + f.push_bytes(ClassBytesRange::new(b, b)); + f + } + + // First compute the set of characters that might begin + // the expression (ignoring epsilon for now). 
+ let mut f_char_set = match expr.kind() { + &HirKind::Empty => FirstSet::epsilon(), + &HirKind::Literal(ref lit) => { + match lit { + &Literal::Unicode(c) => singleton(first_byte(c)), + &Literal::Byte(b) => singleton(b), + } + } + &HirKind::Class(ref class) => { + match class { + &Class::Unicode(ref c) => { + // Get all the bytes which might begin this unicode + // class. + let mut cb = FirstSet::empty(); + for cr in c.iter() { + for br in Utf8Sequences::new(cr.start(), cr.end()) { + let first = br.as_slice()[0]; + cb.push_bytes( + ClassBytesRange::new(first.start, first.end)); + } + } + cb + } + &Class::Bytes(ref b) => + FirstSet::new(b.iter().map(|x| *x), false), + } + } + + // When an empty look (Anchor or WordBoundary) is at the start of + // a concatenation, we conservatively assume that the assertion + // will pass, so we just drop it. Then we can only get to this + // point if we are dealing with some sort of naked empty look. + // For now we just do the most conservative thing and say + // that such an emptylook could potentially match on any character. + &HirKind::Anchor(_) | &HirKind::WordBoundary(_) => FirstSet::anychar(), + + &HirKind::Repetition(ref rep) => fset_of(&rep.hir), + &HirKind::Group(ref group) => fset_of(&group.hir), + + // The most involved case. We need to strip leading empty-looks + // as well as take the union of the first sets of the first n+1 + // expressions where n is the number of leading expressions which + // accept the empty string. + &HirKind::Concat(ref es) => { + let mut fset = FirstSet::empty(); + for (i, e) in es.iter().enumerate() { + match e.kind() { + &HirKind::Anchor(_) | &HirKind::WordBoundary(_) => { + // Ignore any leading emptylooks, but any in tail + // position have to be accounted for. + if i == es.len() - 1 { + fset.union(&FirstSet::anychar()); + } + } + _ => { + let inner_fset = fset_of(e); + fset.union(&inner_fset); + + if !accepts_empty(e) { + // We can stop accumulating after we stop seeing + // first sets which contain epsilon. + break; + } + } + } + } + fset + } + &HirKind::Alternation(ref es) => { + let mut fset = FirstSet::empty(); + for e in es { + fset.union(&fset_of(e)); + } + fset + } + }; + + f_char_set.accepts_empty = accepts_empty(expr); + f_char_set +} + +fn accepts_empty(expr: &Hir) -> bool { + match expr.kind() { + &HirKind::Empty => true, + &HirKind::Literal(_) => false, + &HirKind::Class(_) => false, + + // A naked empty look is a pretty weird thing because we + // normally strip them from the beginning of concatinations. + // We are just going to treat them like `.` + &HirKind::Anchor(_) | &HirKind::WordBoundary(_) => false, + + &HirKind::Repetition(ref rep) => { + match rep.kind { + RepetitionKind::ZeroOrOne => true, + RepetitionKind::ZeroOrMore => true, + RepetitionKind::OneOrMore => accepts_empty(&rep.hir), + RepetitionKind::Range(ref range) => { + match range { + &RepetitionRange::Exactly(0) + | &RepetitionRange::AtLeast(0) + | &RepetitionRange::Bounded(0, _) => true, + _ => accepts_empty(&rep.hir), + } + } + } + } + + &HirKind::Group(ref group) => accepts_empty(&group.hir), + + &HirKind::Concat(ref es) => { + let mut accepts: bool = true; + for e in es.iter() { + match e.kind() { + &HirKind::Anchor(_) | &HirKind::WordBoundary(_) => { + // Ignore any leading emptylooks. + } + _ => { + accepts = accepts && accepts_empty(&e); + } + } + + if !accepts { + break; + } + } + accepts + } + + &HirKind::Alternation(ref es) => es.iter().any(accepts_empty) + } +} + +/// The first byte of a unicode code point. 
+/// +/// We only ever care about the first byte of a particular character +/// because the onepass DFA is implemented in the byte space not the +/// character space. This means, for example, that a branch between +/// lowercase delta and uppercase delta is actually non-deterministic. +fn first_byte(c: char) -> u8 { + let mut b: [u8; 4] = [0; 4]; + c.encode_utf8(&mut b); + b[0] +} + +/// A representation of all the possible ways a word in the language +/// of a regex could begin. ClassBytes has no way to express the empty +/// string, so we add an extra flag to indicate if a FirstSet includes +/// epsilon. Put in a more theoretical way all firstsets are subsets of +/// SIGMA `union` { epsilon }. +#[derive(Debug, PartialEq, Eq)] +struct FirstSet { + bytes: ClassBytes, + pub accepts_empty: bool, +} + +impl FirstSet { + fn empty() -> Self { + FirstSet { + bytes: ClassBytes::empty(), + accepts_empty: false, + } + } + + pub fn new(ranges: I, accepts_empty: bool) -> Self + where I: IntoIterator + { + FirstSet { + bytes: ClassBytes::new(ranges), + accepts_empty: accepts_empty, + } + } + + fn anychar() -> FirstSet { + let mut f = FirstSet::empty(); + f.push_bytes(ClassBytesRange::new(b'\0', b'\xFF')); + f + } + + fn epsilon() -> FirstSet { + FirstSet { + bytes: ClassBytes::empty(), + accepts_empty: true, + } + } + + fn push_bytes(&mut self, byte_range: ClassBytesRange) { + self.bytes.push(byte_range); + } + + fn union(&mut self, other: &FirstSet) { + self.bytes.union(&other.bytes); + self.accepts_empty = self.accepts_empty || other.accepts_empty; + } + + fn intersect(&mut self, other: &FirstSet) { + self.bytes.intersect(&other.bytes); + self.accepts_empty = self.accepts_empty && other.accepts_empty; + } + + fn is_empty(&self) -> bool { + self.bytes.is_empty() && !self.accepts_empty + } +} + +/// An iterator over a concatenation of expressions which +/// drills down into other embedded concatenations. +struct NestedConcat<'a>(Vec<(&'a [Hir], usize)>); +impl<'a> NestedConcat<'a> { + fn new(es: &'a [Hir]) -> Self { + NestedConcat(vec![(es, 0)]) + } +} +impl<'a> Iterator for NestedConcat<'a> { + type Item = &'a Hir; + + fn next(&mut self) -> Option<&'a Hir> { + loop { + if self.0.len() == 0 { + return None; + } + + let tip = self.0.len() - 1; + let (es, idx) = self.0[tip]; + + if idx >= es.len() { + self.0.pop(); + continue; + } + + self.0[tip].1 += 1; + + match es[idx].kind() { + &HirKind::Concat(ref es) => { + self.0.push((es, 0)); + continue; + // self.next() + } + _ => return Some(&es[idx]), + } + } + } +} + +#[cfg(test)] +mod tests { + use syntax::Parser; + use syntax::hir::Hir; + use super::*; + + fn is_intersecting_fset(e1: &Hir, e2: &Hir) -> bool { + let mut fset = fset_of(e1); + fset.intersect(&fset_of(e2)); + ! 
fset.is_empty() + } + + // + // First Set intersection smoke tests + // + + #[test] + fn fset_lit() { + let e1 = Parser::new().parse("a").unwrap(); + let e2 = Parser::new().parse("a").unwrap(); + let e3 = Parser::new().parse("b").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_class() { + let e1 = Parser::new().parse("[a]").unwrap(); + let e2 = Parser::new().parse("[a]").unwrap(); + let e3 = Parser::new().parse("[b]").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_class_n() { + let e1 = Parser::new().parse("[xamn]").unwrap(); + let e2 = Parser::new().parse("[rlwa]").unwrap(); + let e3 = Parser::new().parse("[bcq]").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_alt() { + let e1 = Parser::new().parse("ab|bc|ad").unwrap(); + let e2 = Parser::new().parse("yyyy|am|zz").unwrap(); + let e3 = Parser::new().parse("cc|ww").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_group() { + let e1 = Parser::new().parse("(?:ab)").unwrap(); + let e2 = Parser::new().parse("(?:aq)").unwrap(); + let e3 = Parser::new().parse("(?:m)").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_concat() { + let e1 = Parser::new().parse("aa(?:nb)").unwrap(); + let e2 = Parser::new().parse("aa(?:rq)").unwrap(); + let e3 = Parser::new().parse("bb(?:m)").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_word_boundary_dropped() { + let e1 = Parser::new().parse(r"aa").unwrap(); + let e2 = Parser::new().parse(r"\baa").unwrap(); + let e3 = Parser::new().parse(r"\bbb").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_word_boundary_all() { + let e1 = Parser::new().parse(r"aa").unwrap(); + let e2 = Parser::new().parse(r"\b").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + } + + #[test] + fn fset_not_word_boundary_dropped() { + let e1 = Parser::new().parse(r"aa").unwrap(); + let e2 = Parser::new().parse(r"\Baa").unwrap(); + let e3 = Parser::new().parse(r"\Bbb").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_not_word_boundary_all() { + let e1 = Parser::new().parse(r"aa").unwrap(); + let e2 = Parser::new().parse(r"\B").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + } + + #[test] + fn fset_start_anchor_dropped() { + let e1 = Parser::new().parse(r"aa").unwrap(); + let e2 = Parser::new().parse(r"^aa").unwrap(); + let e3 = Parser::new().parse(r"^bb").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_terminal_emptylook_all() { + let e = Parser::new().parse(r"a*\b").unwrap(); + + let mut total_accept = FirstSet::anychar(); + total_accept.accepts_empty = true; + + assert_eq!(total_accept, fset_of(&e)); + } + + #[test] + fn fset_empty_alt() { + let e1 = Parser::new().parse(r"(?:a|())b").unwrap(); + let e2 = Parser::new().parse(r"b").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + } + + // + // onepass smoke tests + // + + // This test is pulled right from some of Russ Cox's + // comments on onepass regex. 
+ // + // Note that Russ Cox's other example of a onepass regex + // (r"(\d+)-(\d+)") is actually not onepass for us because + // there is byte-level nondeterminism in the \d character + // class, and we care about things in the byte space rather + // than the character space. If you do a onepass engine at + // the character level, Cox's example is indeed onepass. + #[test] + fn is_onepass_smoke_test1() { + let e1 = Parser::new().parse(r"([^x]*)x(.*)").unwrap(); + let e2 = Parser::new().parse(r"(.*)x(.*)").unwrap(); + + assert!(is_onepass(&e1)); + assert!(!is_onepass(&e2)); + } + + #[test] + fn is_onepass_empty_alt() { + let e1 = Parser::new().parse(r"(a|())b").unwrap(); + let e2 = Parser::new().parse(r"(a|())a").unwrap(); + + assert!(is_onepass(&e1)); + assert!(!is_onepass(&e2)); + } + + #[test] + fn is_onepass_rep() { + let e1 = Parser::new().parse(r"a+a").unwrap(); + let e2 = Parser::new().parse(r"a*a").unwrap(); + + assert!(!is_onepass(&e1)); + assert!(!is_onepass(&e2)); + } + + #[test] + fn is_onepass_clash_in_middle_of_concat() { + let e = Parser::new().parse(r"ab?b").unwrap(); + assert!(!is_onepass(&e)); + } +} diff --git a/src/lib.rs b/src/lib.rs index d35042492e..4a901f983b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -668,6 +668,7 @@ mod re_unicode; mod sparse; #[cfg(any(regex_runtime_teddy_ssse3, regex_runtime_teddy_avx2))] mod vector; +mod analysis; mod onepass; /// The `internal` module exists to support suspicious activity, such as diff --git a/src/onepass.rs b/src/onepass.rs index 442204b6b2..f6a7c68a70 100644 --- a/src/onepass.rs +++ b/src/onepass.rs @@ -27,6 +27,7 @@ use prog::{Program, Inst, EmptyLook}; use literal::LiteralSearcher; use re_trait::Slot; use input::{ByteInput, Input}; +use analysis; use compile::Compiler; use syntax::hir::Hir; use re_builder::RegexOptions; @@ -529,7 +530,7 @@ impl OnePassCompiler { return Err(OnePassError::RegexSetUnsupported); } - if ! es[0].is_onepass() { + if ! analysis::is_onepass(&es[0]) { return Err(OnePassError::HasNondeterminism); } From b5b8f82fbde01d5a5b4689fdd44d246d785c93b3 Mon Sep 17 00:00:00 2001 From: Ethan Pailes Date: Sun, 23 Sep 2018 06:25:28 -0400 Subject: [PATCH 7/8] switch is_onepass to direct recursion instead of visitor. --- src/analysis.rs | 416 ++++++++++++++++++++++++------------------------ 1 file changed, 207 insertions(+), 209 deletions(-) diff --git a/src/analysis.rs b/src/analysis.rs index 25d180074e..8f4f6ceb34 100644 --- a/src/analysis.rs +++ b/src/analysis.rs @@ -1,8 +1,7 @@ use syntax::hir::{ Hir, HirKind, Literal, ClassBytes, ClassBytesRange, - Class, Visitor, RepetitionRange, RepetitionKind + Class, RepetitionRange, RepetitionKind }; -use syntax::hir; use utf8_ranges::Utf8Sequences; /// True iff the given expression is onepass @@ -49,120 +48,9 @@ use utf8_ranges::Utf8Sequences; /// we ever find a non-empty intersection, the regex /// is not onepass. 
pub fn is_onepass(expr: &Hir) -> bool { - hir::visit(expr, IsOnePassVisitor::new()).unwrap() + fset_of(expr).is_onepass } -struct IsOnePassVisitor(bool); - -impl Visitor for IsOnePassVisitor { - type Output = bool; - type Err = (); - - fn finish(self) -> Result { - Ok(self.0) - } - - fn visit_pre(&mut self, hir: &Hir) -> Result<(), ()> { - if !self.0 { - return Ok(()) - } - - match hir.kind() { - &HirKind::Concat(ref es) => self.check_concat(&es), - &HirKind::Alternation(ref es) => self.check_alternation(&es), - &HirKind::Repetition(ref rep) => { - if fset_of(&*rep.hir).is_empty() { - self.0 = false; - } - } - &HirKind::Class(ref cls) => self.check_cls(cls), - _ => () - } - - Ok(()) - } -} - -impl IsOnePassVisitor { - fn new() -> Self { - IsOnePassVisitor(true) - } - - fn check_concat(&mut self, es: &[Hir]) { - let mut empty_run = vec![]; - - for e in NestedConcat::new(es) { - let is_rep = match e.kind() { - &HirKind::Repetition(_) => true, - _ => false, - }; - - empty_run.push(e); - if !(accepts_empty(e) || is_rep) { - self.0 = self.0 && !fsets_clash(&empty_run); - empty_run.clear(); - } - } - - if empty_run.len() > 0 { - self.0 = self.0 && !fsets_clash(&empty_run); - } - } - - fn check_alternation(&mut self, es: &[Hir]) { - self.0 = self.0 && !fsets_clash(&es.iter().collect::>()); - } - - // Unicode classes are really just big alternatives from the byte - // oriented point of view. - // - // This function translates a unicode class into the - // byte space and checks for intersecting first sets. - fn check_cls(&mut self, cls: &Class) { - match cls { - &Class::Unicode(ref ucls) => { - let mut seen_char: [bool; 256] = [false; 256]; - - for cr in ucls.iter() { - for br in Utf8Sequences::new(cr.start(), cr.end()) { - let first = br.as_slice()[0]; - for b in first.start..(first.end+1) { - if seen_char[b as usize] { - self.0 = false; - return; - } - seen_char[b as usize] = true; - } - } - } - } - _ => {} - } - } - -} - -/// Check if a list of first sets is incompatible. -/// -/// O(n^2), but n will usually be quite small. -fn fsets_clash(es: &[&Hir]) -> bool { - for (i, e1) in es.iter().enumerate() { - for (j, e2) in es.iter().enumerate() { - if i != j { - let mut fset = fset_of(e1); - let fset2 = fset_of(e2); - - fset.intersect(&fset2); - if ! fset.is_empty() { - return true; - } - } - } - } - false -} - - /// Compute the first set of a given regular expression. /// /// The first set of a regular expression is the set of all bytes @@ -182,9 +70,7 @@ fn fset_of(expr: &Hir) -> FirstSet { f } - // First compute the set of characters that might begin - // the expression (ignoring epsilon for now). - let mut f_char_set = match expr.kind() { + match expr.kind() { &HirKind::Empty => FirstSet::epsilon(), &HirKind::Literal(ref lit) => { match lit { @@ -193,7 +79,7 @@ fn fset_of(expr: &Hir) -> FirstSet { } } &HirKind::Class(ref class) => { - match class { + let mut fset = match class { &Class::Unicode(ref c) => { // Get all the bytes which might begin this unicode // class. @@ -209,7 +95,10 @@ fn fset_of(expr: &Hir) -> FirstSet { } &Class::Bytes(ref b) => FirstSet::new(b.iter().map(|x| *x), false), - } + }; + + fset.is_onepass = class_is_onepass(class); + fset } // When an empty look (Anchor or WordBoundary) is at the start of @@ -220,7 +109,25 @@ fn fset_of(expr: &Hir) -> FirstSet { // that such an emptylook could potentially match on any character. 
&HirKind::Anchor(_) | &HirKind::WordBoundary(_) => FirstSet::anychar(), - &HirKind::Repetition(ref rep) => fset_of(&rep.hir), + &HirKind::Repetition(ref rep) => { + let mut fset = fset_of(&rep.hir); + + fset.accepts_empty = match rep.kind { + RepetitionKind::ZeroOrOne => true, + RepetitionKind::ZeroOrMore => true, + RepetitionKind::OneOrMore => fset.accepts_empty, + RepetitionKind::Range(ref range) => { + match range { + &RepetitionRange::Exactly(0) + | &RepetitionRange::AtLeast(0) + | &RepetitionRange::Bounded(0, _) => true, + _ => fset.accepts_empty, + } + } + }; + + fset + }, &HirKind::Group(ref group) => fset_of(&group.hir), // The most involved case. We need to strip leading empty-looks @@ -229,6 +136,10 @@ fn fset_of(expr: &Hir) -> FirstSet { // accept the empty string. &HirKind::Concat(ref es) => { let mut fset = FirstSet::empty(); + let mut inner_fsets = Vec::with_capacity(es.len()); + for e in es.iter() { + inner_fsets.push(fset_of(e)); + } for (i, e) in es.iter().enumerate() { match e.kind() { &HirKind::Anchor(_) | &HirKind::WordBoundary(_) => { @@ -236,13 +147,14 @@ fn fset_of(expr: &Hir) -> FirstSet { // position have to be accounted for. if i == es.len() - 1 { fset.union(&FirstSet::anychar()); + fset.accepts_empty = false; } } _ => { - let inner_fset = fset_of(e); - fset.union(&inner_fset); + fset.union(&inner_fsets[i]); - if !accepts_empty(e) { + if !inner_fsets[i].accepts_empty { + fset.accepts_empty = false; // We can stop accumulating after we stop seeing // first sets which contain epsilon. break; @@ -250,71 +162,94 @@ fn fset_of(expr: &Hir) -> FirstSet { } } } + + fset.is_onepass = concat_is_onepass(es, &inner_fsets); + fset } &HirKind::Alternation(ref es) => { let mut fset = FirstSet::empty(); - for e in es { - fset.union(&fset_of(e)); + let mut inner_fsets = Vec::with_capacity(es.len()); + for (i, e) in es.iter().enumerate() { + inner_fsets.push(fset_of(e)); + fset.union(&inner_fsets[i]); } + + fset.is_onepass = !FirstSet::fsets_clash_value(&inner_fsets); + fset } - }; - - f_char_set.accepts_empty = accepts_empty(expr); - f_char_set + } } -fn accepts_empty(expr: &Hir) -> bool { - match expr.kind() { - &HirKind::Empty => true, - &HirKind::Literal(_) => false, - &HirKind::Class(_) => false, - - // A naked empty look is a pretty weird thing because we - // normally strip them from the beginning of concatinations. - // We are just going to treat them like `.` - &HirKind::Anchor(_) | &HirKind::WordBoundary(_) => false, - - &HirKind::Repetition(ref rep) => { - match rep.kind { - RepetitionKind::ZeroOrOne => true, - RepetitionKind::ZeroOrMore => true, - RepetitionKind::OneOrMore => accepts_empty(&rep.hir), - RepetitionKind::Range(ref range) => { - match range { - &RepetitionRange::Exactly(0) - | &RepetitionRange::AtLeast(0) - | &RepetitionRange::Bounded(0, _) => true, - _ => accepts_empty(&rep.hir), +// Unicode classes are really just big alternatives from the byte +// oriented point of view. +// +// This function translates a unicode class into the +// byte space and checks for intersecting first sets. 
+// +// Byte classes are always onepass +fn class_is_onepass(cls: &Class) -> bool { + match cls { + &Class::Unicode(ref ucls) => { + let mut seen_char: [bool; 256] = [false; 256]; + + for cr in ucls.iter() { + for br in Utf8Sequences::new(cr.start(), cr.end()) { + let first = br.as_slice()[0]; + for b in first.start..(first.end+1) { + if seen_char[b as usize] { + return false; + } + seen_char[b as usize] = true; } } } } + _ => {} + } - &HirKind::Group(ref group) => accepts_empty(&group.hir), + true +} - &HirKind::Concat(ref es) => { - let mut accepts: bool = true; - for e in es.iter() { - match e.kind() { - &HirKind::Anchor(_) | &HirKind::WordBoundary(_) => { - // Ignore any leading emptylooks. - } - _ => { - accepts = accepts && accepts_empty(&e); - } - } +fn concat_is_onepass(es: &[Hir], inner_fsets: &[FirstSet]) -> bool { + let mut empty_run = vec![]; - if !accepts { - break; + for (i, e) in NestedConcat::new(es).enumerate() { + match e.kind() { + &HirKind::Anchor(_) | &HirKind::WordBoundary(_) => { + if i < es.len() - 1 { + continue; } } - accepts + _ => {} // FALLTHROUGH + } + + if !inner_fsets[i].is_onepass { + return false; } - &HirKind::Alternation(ref es) => es.iter().any(accepts_empty) + let is_real_rep = match e.kind() { + &HirKind::Repetition(ref rep) => { + match rep.kind { + RepetitionKind::Range( + RepetitionRange::Exactly(_)) => false, + _ => true, + } + }, + _ => false, + }; + + empty_run.push(&inner_fsets[i]); + if !(inner_fsets[i].accepts_empty || is_real_rep) { + if FirstSet::fsets_clash_ref(&empty_run) { + return false; + } + empty_run.clear(); + } } + + ! FirstSet::fsets_clash_ref(&empty_run) } /// The first byte of a unicode code point. @@ -334,17 +269,43 @@ fn first_byte(c: char) -> u8 { /// string, so we add an extra flag to indicate if a FirstSet includes /// epsilon. Put in a more theoretical way all firstsets are subsets of /// SIGMA `union` { epsilon }. -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone)] struct FirstSet { bytes: ClassBytes, - pub accepts_empty: bool, + accepts_empty: bool, + is_onepass: bool, } +/// A macro to define the fsets_clash associated functions, +/// parameterized over the type of the inner slice. This lets +/// us avoid allocating an extra vector when we check +/// alternations for onepassness. +macro_rules! def_fsets_clash { + ($fun_name:ident, $slice_inner:ty) => { + /// Check if a list of first sets is incompatible. + fn $fun_name(fsets: &[$slice_inner]) -> bool { + let mut seen_so_far = FirstSet::empty(); + + for fset in fsets.iter() { + let mut snapshot = seen_so_far.clone(); + snapshot.intersect(&fset); + if ! 
snapshot.is_empty() { + return true; + } + + seen_so_far.union(&fset); + } + + false + } + } +} impl FirstSet { fn empty() -> Self { FirstSet { bytes: ClassBytes::empty(), accepts_empty: false, + is_onepass: true, } } @@ -354,6 +315,7 @@ impl FirstSet { FirstSet { bytes: ClassBytes::new(ranges), accepts_empty: accepts_empty, + is_onepass: true, } } @@ -367,6 +329,7 @@ impl FirstSet { FirstSet { bytes: ClassBytes::empty(), accepts_empty: true, + is_onepass: true, } } @@ -387,6 +350,9 @@ impl FirstSet { fn is_empty(&self) -> bool { self.bytes.is_empty() && !self.accepts_empty } + + def_fsets_clash!(fsets_clash_ref, &FirstSet); + def_fsets_clash!(fsets_clash_value, FirstSet); } /// An iterator over a concatenation of expressions which @@ -420,7 +386,6 @@ impl<'a> Iterator for NestedConcat<'a> { &HirKind::Concat(ref es) => { self.0.push((es, 0)); continue; - // self.next() } _ => return Some(&es[idx]), } @@ -551,15 +516,24 @@ mod tests { } #[test] - fn fset_terminal_emptylook_all() { + fn fset_terminal_emptylook_all_1_() { let e = Parser::new().parse(r"a*\b").unwrap(); + let mut fset = FirstSet::anychar(); + fset.is_onepass = false; + + assert_eq!(fset, fset_of(&e), "\n\n{:?}\n\n", e); + } - let mut total_accept = FirstSet::anychar(); - total_accept.accepts_empty = true; + #[test] + fn fset_terminal_emptylook_all_2_() { + let e = Parser::new().parse(r"(a*)\b").unwrap(); + let mut fset = FirstSet::anychar(); + fset.is_onepass = false; - assert_eq!(total_accept, fset_of(&e)); + assert_eq!(fset, fset_of(&e), "\n\n{:?}\n\n", e); } + #[test] fn fset_empty_alt() { let e1 = Parser::new().parse(r"(?:a|())b").unwrap(); @@ -572,45 +546,69 @@ mod tests { // onepass smoke tests // - // This test is pulled right from some of Russ Cox's - // comments on onepass regex. - // - // Note that Russ Cox's other example of a onepass regex - // (r"(\d+)-(\d+)") is actually not onepass for us because - // there is byte-level nondeterminism in the \d character - // class, and we care about things in the byte space rather - // than the character space. If you do a onepass engine at - // the character level, Cox's example is indeed onepass. - #[test] - fn is_onepass_smoke_test1() { - let e1 = Parser::new().parse(r"([^x]*)x(.*)").unwrap(); - let e2 = Parser::new().parse(r"(.*)x(.*)").unwrap(); + macro_rules! test_onepass { + ($fun_name:ident, $re_str:expr) => { + #[test] + fn $fun_name() { + let e = Parser::new().parse($re_str).unwrap(); + let fset = fset_of(&e); + assert!(fset.is_onepass, "fset={:?}", fset); + } + } + } - assert!(is_onepass(&e1)); - assert!(!is_onepass(&e2)); + macro_rules! 
test_not_onepass { + ($fun_name:ident, $re_str:expr) => { + #[test] + fn $fun_name() { + let e = Parser::new().parse($re_str).unwrap(); + let fset = fset_of(&e); + assert!(!fset.is_onepass, "fset={:?}", fset); + } + } } - #[test] - fn is_onepass_empty_alt() { - let e1 = Parser::new().parse(r"(a|())b").unwrap(); - let e2 = Parser::new().parse(r"(a|())a").unwrap(); + test_onepass!(onepass_smoke_1_, r"[^x]x(.*)"); + test_not_onepass!(onepass_smoke_2_, r"(.*)x(.*)"); - assert!(is_onepass(&e1)); - assert!(!is_onepass(&e2)); - } - - #[test] - fn is_onepass_rep() { - let e1 = Parser::new().parse(r"a+a").unwrap(); - let e2 = Parser::new().parse(r"a*a").unwrap(); + test_not_onepass!(onepass_alt_1_, r"a|b|c|a|d"); + test_not_onepass!(onepass_alt_2_, r"a|b|c|((m|a|x)|g)|d"); + test_onepass!(onepass_alt_3_, r"a|b|c|x|d"); + test_onepass!(onepass_alt_4_, r"a|b|c|((m|x)|g)|d"); - assert!(!is_onepass(&e1)); - assert!(!is_onepass(&e2)); - } + test_not_onepass!(onepass_not_in_rust, r"(\d+)-(\d+)"); - #[test] - fn is_onepass_clash_in_middle_of_concat() { - let e = Parser::new().parse(r"ab?b").unwrap(); - assert!(!is_onepass(&e)); - } + test_onepass!(onepass_empty_alt_1_, r"(a|())b"); + test_not_onepass!(onepass_empty_alt_2_, r"(a|())a"); + + test_not_onepass!(onepass_rep_1_, r"a*a"); + test_not_onepass!(onepass_rep_2_, r"a+a"); + test_not_onepass!(onepass_rep_3_, r"a{4,8}a"); + test_not_onepass!(onepass_rep_4_, r"a{4,}a"); + test_onepass!(onepass_rep_5_, r"a{4}a"); + test_not_onepass!(onepass_rep_6_, r"a?a"); + + test_onepass!(onepass_rep_7_, r"a*b"); + test_onepass!(onepass_rep_8_, r"a+b"); + test_onepass!(onepass_rep_9_, r"a{4,8}b"); + test_onepass!(onepass_rep_10_, r"a{4,}b"); + test_onepass!(onepass_rep_11_, r"a{4}b"); + test_onepass!(onepass_rep_12_, r"a?b"); + + test_not_onepass!(onepass_concat_middle_1_, r"ab?bc"); + test_onepass!(onepass_concat_middle_2_, r"a(?:b|c)dc"); + + test_not_onepass!(onepass_unicode_class_1_, r"\d"); + test_not_onepass!(onepass_unicode_class_2_, r"\s"); + test_not_onepass!(onepass_unicode_class_3_, r"\w"); + test_not_onepass!(onepass_unicode_class_4_, r"inthe\wmiddle"); + + test_not_onepass!(onepass_unicode_clash_1_, r"Δ|δ"); + + test_not_onepass!(onepass_empty_assert_1_, r"a|^a"); + test_onepass!(onepass_empty_assert_2_, r"\ba"); + test_onepass!(onepass_empty_assert_3_, r"^a"); + test_onepass!(onepass_empty_assert_4_, r"a$"); + + test_not_onepass!(onepass_naked_empty_assert_1_, r"\w|a"); } From b31e36fd7da2d187b2545919f8dadc042ff9f970 Mon Sep 17 00:00:00 2001 From: Ethan Pailes Date: Sun, 23 Sep 2018 06:34:50 -0400 Subject: [PATCH 8/8] remove one allocation per instruction by directly mutating state_edge --- src/onepass.rs | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/onepass.rs b/src/onepass.rs index f6a7c68a70..c8bbfcae07 100644 --- a/src/onepass.rs +++ b/src/onepass.rs @@ -587,7 +587,7 @@ impl OnePassCompiler { // instructions which get states. 
let mut state_edge = vec![0]; while let Some(i) = state_edge.pop() { - state_edge.extend(self.inst_trans(i, &mut forwards)?); + self.inst_trans(i, &mut forwards, &mut state_edge)?; } // Solve the dependency relationships between all the @@ -623,11 +623,13 @@ impl OnePassCompiler { &mut self, inst_idx: usize, forwards: &mut Forwards, - ) -> Result, OnePassError> { + state_edge: &mut Vec, + ) -> Result<(), OnePassError> { trace!("::inst_trans inst_idx={}", inst_idx); if self.transitions[inst_idx].is_some() { - return Ok(vec![]); + // we've already computed the transition table for this state. + return Ok(()); } // Iterate over the children, visiting lower priority @@ -637,7 +639,7 @@ impl OnePassCompiler { &Inst::EmptyLook(ref inst) => vec![inst.goto], &Inst::Bytes(ref inst) => vec![inst.goto], &Inst::Split(ref inst) => vec![inst.goto1, inst.goto2], - &Inst::Match(_) => return Ok(vec![]), // no kids + &Inst::Match(_) => return Ok(()), // no kids &Inst::Ranges(_) | &Inst::Char(_) => unreachable!(), }; @@ -649,12 +651,11 @@ impl OnePassCompiler { // the initial list of `TransitionTarget::Die` pointers. let mut priority = 1; - let mut children = vec![]; while let Some(child_idx) = resume.pop() { match &self.prog[child_idx] { &Inst::EmptyLook(_) | &Inst::Save(_) => { forwards.forward(inst_idx, child_idx, priority); - children.push(child_idx); + state_edge.push(child_idx); } &Inst::Bytes(ref inst) => { // Weird usize casting shenanigans because a Bytes @@ -668,7 +669,7 @@ impl OnePassCompiler { priority: priority }; } - children.push(child_idx); + state_edge.push(child_idx); } &Inst::Split(ref inst) => { resume.push(inst.goto1); @@ -693,7 +694,7 @@ impl OnePassCompiler { self.transitions[inst_idx] = Some(trans); - Ok(children) + Ok(()) } /// Execute a forwarding job.