From 0b5a4bf256454bafaf7cb2177cf15acaff5b3502 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 3 Aug 2023 11:11:15 -0400 Subject: [PATCH 1/8] automata: add new 'WhichCaptures' config This is the first step in fixing a regression in memory usage. The underlying problem is that regex-automata now natively supports multi-pattern regexes *with* capturing support. Unfortunately though, this overall doesn't work too well with the current design of the PikeVM, because the amount of memory used is `len(captures) * len(states)`. So basically, as the regex and number of captures increase, the amount of memory used gets quite high. This is new functionality that we hope to improve upon over time, so it's not too big of a deal on its own. But it turns out this impacts previous uses of RegexSet that have capture groups. The old implementation just ignored these capture groups because they weren't supported in a RegexSet, and thus there were no memory problems. But in the new implementation, nothing tells it that it's okay to ignore the capture groups. So it winds up allocating space for them even though the RegexSet APIs don't provide any of that functionality. So my plan to fix this is to introduce a new configuration knob for controlling more granularly which capture states are compiled into the NFA. Previously we only supported "all of them" or "none of them." This commit adds a new (backwards compatible) knob that also permits "just implicit groups." That is, one capture group per pattern. This hopefully leads to less memory usage overall. (Well, it will certainly be less, but hopefully it's a big reduction.) We don't actually change anything here. We just add a new `Config::which_captures` knob, implement the existing `Config::captures` in terms of `Config::which_captures` and deprecate `Config::captures`. 
If this winds up not being sufficient, then we may need to adapt the PikeVM to work without any capture groups at all and instead just report which patterns match. Which is... probably fine? --- regex-automata/src/dfa/dense.rs | 5 +- regex-automata/src/hybrid/dfa.rs | 5 +- regex-automata/src/meta/strategy.rs | 11 +- regex-automata/src/nfa/thompson/compiler.rs | 174 +++++++++++++++++--- regex-automata/src/nfa/thompson/mod.rs | 2 +- regex-automata/src/nfa/thompson/nfa.rs | 8 +- regex-automata/src/util/captures.rs | 16 +- 7 files changed, 182 insertions(+), 39 deletions(-) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 75ca85e6e..6da865f97 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -1170,7 +1170,10 @@ impl Builder { .clone() // We can always forcefully disable captures because DFAs do not // support them. - .configure(thompson::Config::new().captures(false)) + .configure( + thompson::Config::new() + .which_captures(thompson::WhichCaptures::None), + ) .build_many(patterns) .map_err(BuildError::nfa)?; self.build_from_nfa(&nfa) diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs index 86963248f..67261c1a3 100644 --- a/regex-automata/src/hybrid/dfa.rs +++ b/regex-automata/src/hybrid/dfa.rs @@ -3973,7 +3973,10 @@ impl Builder { .clone() // We can always forcefully disable captures because DFAs do not // support them. 
- .configure(thompson::Config::new().captures(false)) + .configure( + thompson::Config::new() + .which_captures(thompson::WhichCaptures::None), + ) .build_many(patterns) .map_err(BuildError::nfa)?; self.build_from_nfa(nfa) diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index aa1d61ef3..52a501bf6 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -13,7 +13,7 @@ use crate::{ regex::{Cache, RegexInfo}, reverse_inner, wrappers, }, - nfa::thompson::{self, NFA}, + nfa::thompson::{self, WhichCaptures, NFA}, util::{ captures::{Captures, GroupInfo}, look::LookMatcher, @@ -452,7 +452,7 @@ impl Core { .utf8(info.config().get_utf8_empty()) .nfa_size_limit(info.config().get_nfa_size_limit()) .shrink(false) - .captures(true) + .which_captures(WhichCaptures::All) .look_matcher(lookm); let nfa = thompson::Compiler::new() .configure(thompson_config.clone()) @@ -499,7 +499,10 @@ impl Core { // useful with capturing groups in reverse. And of course, // the lazy DFA ignores capturing groups in all cases. 
.configure( - thompson_config.clone().captures(false).reverse(true), + thompson_config + .clone() + .which_captures(WhichCaptures::None) + .reverse(true), ) .build_many_from_hir(hirs) .map_err(BuildError::nfa)?; @@ -1480,7 +1483,7 @@ impl ReverseInner { .utf8(core.info.config().get_utf8_empty()) .nfa_size_limit(core.info.config().get_nfa_size_limit()) .shrink(false) - .captures(false) + .which_captures(WhichCaptures::None) .look_matcher(lookm); let result = thompson::Compiler::new() .configure(thompson_config) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 2021d93ea..6cc79822a 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -30,7 +30,7 @@ pub struct Config { reverse: Option, nfa_size_limit: Option>, shrink: Option, - captures: Option, + which_captures: Option, look_matcher: Option, #[cfg(test)] unanchored_prefix: Option, @@ -178,12 +178,15 @@ impl Config { /// ``` /// use regex_automata::{ /// dfa::{self, Automaton}, - /// nfa::thompson::NFA, + /// nfa::thompson::{NFA, WhichCaptures}, /// HalfMatch, Input, /// }; /// /// let dfa = dfa::dense::Builder::new() - /// .thompson(NFA::config().captures(false).reverse(true)) + /// .thompson(NFA::config() + /// .which_captures(WhichCaptures::None) + /// .reverse(true) + /// ) /// .build("baz[0-9]+")?; /// let expected = Some(HalfMatch::must(0, 3)); /// assert_eq!( @@ -277,10 +280,12 @@ impl Config { /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long - /// use regex_automata::nfa::thompson::NFA; + /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; /// /// // Currently we have to disable captures when enabling reverse NFA. 
- /// let config = NFA::config().captures(false).reverse(true); + /// let config = NFA::config() + /// .which_captures(WhichCaptures::None) + /// .reverse(true); /// let not_shrunk = NFA::compiler() /// .configure(config.clone().shrink(false)) /// .build(r"\w")?; @@ -314,18 +319,70 @@ impl Config { /// require capturing groups to be present in the NFA. Building a Pike VM /// with an NFA without capturing groups will result in an error. /// + /// (Note that since this method is deprecated, the example below uses + /// [`Config::which_captures`] to disable capture states.) + /// /// ``` - /// use regex_automata::nfa::thompson::{pikevm::PikeVM, NFA}; + /// use regex_automata::nfa::thompson::{ + /// pikevm::PikeVM, + /// NFA, + /// WhichCaptures, + /// }; /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"[a-z]+")?; /// assert!(PikeVM::new_from_nfa(nfa).is_err()); /// /// # Ok::<(), Box>(()) /// ``` - pub fn captures(mut self, yes: bool) -> Config { - self.captures = Some(yes); + #[deprecated(since = "0.3.5", note = "use which_captures instead")] + pub fn captures(self, yes: bool) -> Config { + self.which_captures(if yes { + WhichCaptures::All + } else { + WhichCaptures::None + }) + } + + /// Configures what kinds of capture groups are compiled into + /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a + /// Thompson NFA. + /// + /// Currently, using any option except for [`WhichCaptures::None`] requires + /// disabling the [`reverse`](Config::reverse) setting. If both are + /// enabled, then the compiler will return an error. It is expected that + /// this limitation will be lifted in the future. + /// + /// This is set to [`WhichCaptures::All`] by default. Callers may wish to + /// use [`WhichCaptures::Implicit`] in cases where one wants avoid the + /// overhead of capture states for explicit groups. 
Usually this occurs + /// when one wants to use the `PikeVM` only for determining the overall + /// match. Otherwise, the `PikeVM` could use much more memory than is + /// necessary. + /// + /// # Example + /// + /// This example demonstrates that some regex engines, like the Pike VM, + /// require capturing groups to be present in the NFA. Building a Pike VM + /// with an NFA without capturing groups will result in an error. + /// + /// ``` + /// use regex_automata::nfa::thompson::{ + /// pikevm::PikeVM, + /// NFA, + /// WhichCaptures, + /// }; + /// + /// let nfa = NFA::compiler() + /// .configure(NFA::config().which_captures(WhichCaptures::None)) + /// .build(r"[a-z]+")?; + /// assert!(PikeVM::new_from_nfa(nfa).is_err()); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config { + self.which_captures = Some(which_captures); self } @@ -405,8 +462,14 @@ impl Config { } /// Return whether NFA compilation is configured to produce capture states. + #[deprecated(since = "0.3.5", note = "use get_which_captures instead")] pub fn get_captures(&self) -> bool { - self.captures.unwrap_or(true) + self.get_which_captures().is_any() + } + + /// Return what kinds of capture states will be compiled into an NFA. + pub fn get_which_captures(&self) -> WhichCaptures { + self.which_captures.unwrap_or(WhichCaptures::All) } /// Return the look-around matcher for this NFA. 
@@ -439,7 +502,7 @@ impl Config { reverse: o.reverse.or(self.reverse), nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit), shrink: o.shrink.or(self.shrink), - captures: o.captures.or(self.captures), + which_captures: o.which_captures.or(self.which_captures), look_matcher: o.look_matcher.or_else(|| self.look_matcher.clone()), #[cfg(test)] unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix), @@ -447,6 +510,57 @@ impl Config { } } +/// A configuration indicating which kinds of +/// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include. +/// +/// This configuration can be used with [`Config::which_captures`] to control +/// which capture states are compiled into a Thompson NFA. +/// +/// The default configuration is [`WhichCaptures::All`]. +#[derive(Clone, Copy, Debug)] +pub enum WhichCaptures { + /// All capture states, including those corresponding to both implicit and + /// explicit capture groups, are included in the Thompson NFA. + All, + /// Only capture states corresponding to implicit capture groups are + /// included. Implicit capture groups appear in every pattern implicitly + /// and correspond to the overall match of a pattern. + /// + /// This is useful when one only cares about the overall match of a + /// pattern. By excluding capture states from explicit capture groups, + /// one might be able to reduce the memory usage of a multi-pattern regex + /// substantially if it was otherwise written to have many explicit capture + /// groups. + Implicit, + /// No capture states are compiled into the Thompson NFA. + /// + /// This is useful when capture states are either not needed (for example, + /// if one is only trying to build a DFA) or if they aren't supported (for + /// example, a reverse NFA). 
+ None, +} + +impl Default for WhichCaptures { + fn default() -> WhichCaptures { + WhichCaptures::All + } +} + +impl WhichCaptures { + /// Returns true if this configuration indicates that no capture states + /// should be produced in an NFA. + pub fn is_none(&self) -> bool { + matches!(*self, WhichCaptures::None) + } + + /// Returns true if this configuration indicates that some capture states + /// should be added to an NFA. Note that this might only include capture + /// states for implicit capture groups. + pub fn is_any(&self) -> bool { + !self.is_none() + } +} + /* This compiler below uses Thompson's construction algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph @@ -800,7 +914,9 @@ impl Compiler { if exprs.len() > PatternID::LIMIT { return Err(BuildError::too_many_patterns(exprs.len())); } - if self.config.get_reverse() && self.config.get_captures() { + if self.config.get_reverse() + && self.config.get_which_captures().is_any() + { return Err(BuildError::unsupported_captures()); } @@ -978,7 +1094,7 @@ impl Compiler { name: Option<&str>, expr: &Hir, ) -> Result { - if !self.config.get_captures() { + if self.config.get_which_captures().is_none() { return self.c(expr); } @@ -1728,9 +1844,15 @@ mod tests { util::primitives::{PatternID, StateID}, }; + use super::*; + fn build(pattern: &str) -> NFA { NFA::compiler() - .configure(NFA::config().captures(false).unanchored_prefix(false)) + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false), + ) .build(pattern) .unwrap() } @@ -1794,7 +1916,7 @@ mod tests { #[test] fn compile_unanchored_prefix() { let nfa = NFA::compiler() - .configure(NFA::config().captures(false)) + .configure(NFA::config().which_captures(WhichCaptures::None)) .build(r"a") .unwrap(); assert_eq!( @@ -1827,7 +1949,11 @@ mod tests { // Check that non-UTF-8 literals work. 
let nfa = NFA::compiler() - .configure(NFA::config().captures(false).unanchored_prefix(false)) + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false), + ) .syntax(crate::util::syntax::Config::new().utf8(false)) .build(r"(?-u)\xFF") .unwrap(); @@ -1937,7 +2063,7 @@ mod tests { let nfa = NFA::compiler() .configure( NFA::config() - .captures(false) + .which_captures(WhichCaptures::None) .reverse(true) .shrink(false) .unanchored_prefix(false), @@ -1965,7 +2091,11 @@ mod tests { #[test] fn compile_many_start_pattern() { let nfa = NFA::compiler() - .configure(NFA::config().captures(false).unanchored_prefix(false)) + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false), + ) .build_many(&["a", "b"]) .unwrap(); assert_eq!( @@ -1993,7 +2123,9 @@ mod tests { use regex_syntax::hir::{Class, ClassBytes, Hir}; let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![]))); - let config = NFA::config().captures(false).unanchored_prefix(false); + let config = NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false); let nfa = NFA::compiler().configure(config).build_from_hir(&hir).unwrap(); assert_eq!(nfa.states(), &[s_fail(), s_match(0)]); @@ -2005,7 +2137,9 @@ mod tests { use regex_syntax::hir::{Class, ClassUnicode, Hir}; let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![]))); - let config = NFA::config().captures(false).unanchored_prefix(false); + let config = NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false); let nfa = NFA::compiler().configure(config).build_from_hir(&hir).unwrap(); assert_eq!(nfa.states(), &[s_fail(), s_match(0)]); diff --git a/regex-automata/src/nfa/thompson/mod.rs b/regex-automata/src/nfa/thompson/mod.rs index 3581d738c..cf426736d 100644 --- a/regex-automata/src/nfa/thompson/mod.rs +++ b/regex-automata/src/nfa/thompson/mod.rs @@ -78,4 +78,4 @@ pub use self::{ }, }; #[cfg(feature = "syntax")] -pub use 
compiler::{Compiler, Config}; +pub use compiler::{Compiler, Config, WhichCaptures}; diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 86131406c..2108fa338 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -453,10 +453,10 @@ impl NFA { /// predict the anchored starting state. /// /// ``` - /// use regex_automata::nfa::thompson::{NFA, State}; + /// use regex_automata::nfa::thompson::{NFA, State, WhichCaptures}; /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build("a")?; /// let state = nfa.state(nfa.start_anchored()); /// match *state { @@ -711,7 +711,7 @@ impl NFA { /// or not. /// /// ``` - /// use regex_automata::nfa::thompson::NFA; + /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; /// /// // Obviously has capture states. /// let nfa = NFA::new("(a)")?; @@ -733,7 +733,7 @@ impl NFA { /// // Notice that 'has_capture' is false here even when we have an /// // explicit capture group in the pattern. 
/// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build("(a)")?; /// assert!(!nfa.has_capture()); /// diff --git a/regex-automata/src/util/captures.rs b/regex-automata/src/util/captures.rs index c6517348d..cd3a5f8f7 100644 --- a/regex-automata/src/util/captures.rs +++ b/regex-automata/src/util/captures.rs @@ -1810,10 +1810,10 @@ impl GroupInfo { /// panic even if captures aren't enabled on this NFA: /// /// ``` - /// use regex_automata::nfa::thompson::NFA; + /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build_many(&[ /// r"(?Pa)", /// r"a", @@ -1958,7 +1958,7 @@ impl GroupInfo { /// for different patterns and NFA configurations. /// /// ``` - /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// use regex_automata::{nfa::thompson::{NFA, WhichCaptures}, PatternID}; /// /// let nfa = NFA::new(r"(a)(b)(c)")?; /// // There are 3 explicit groups in the pattern's concrete syntax and @@ -1970,13 +1970,13 @@ impl GroupInfo { /// assert_eq!(1, nfa.group_info().group_len(PatternID::ZERO)); /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"abc")?; /// // We disabled capturing groups, so there are none. /// assert_eq!(0, nfa.group_info().group_len(PatternID::ZERO)); /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"(a)(b)(c)")?; /// // We disabled capturing groups, so there are none, even if there are /// // explicit groups in the concrete syntax. @@ -2000,7 +2000,7 @@ impl GroupInfo { /// for different patterns and NFA configurations. 
/// /// ``` - /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// use regex_automata::{nfa::thompson::{NFA, WhichCaptures}, PatternID}; /// /// let nfa = NFA::new(r"(a)(b)(c)")?; /// // There are 3 explicit groups in the pattern's concrete syntax and @@ -2017,13 +2017,13 @@ impl GroupInfo { /// assert_eq!(5, nfa.group_info().all_group_len()); /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"abc")?; /// // We disabled capturing groups, so there are none. /// assert_eq!(0, nfa.group_info().all_group_len()); /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"(a)(b)(c)")?; /// // We disabled capturing groups, so there are none, even if there are /// // explicit groups in the concrete syntax. From 8ef2d69fc19a70afff86f10cdbdc90c45b91b248 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 4 Aug 2023 10:12:36 -0400 Subject: [PATCH 2/8] automata: respect new 'which_captures' option The NFA compiler now implements the 'All', 'Implicit' and 'None' options. We also add some targeted unit tests to confirm basic behavior. --- regex-automata/src/nfa/thompson/compiler.rs | 90 ++++++++++++++++++++- 1 file changed, 87 insertions(+), 3 deletions(-) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 6cc79822a..fc3e57710 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1094,8 +1094,13 @@ impl Compiler { name: Option<&str>, expr: &Hir, ) -> Result { - if self.config.get_which_captures().is_none() { - return self.c(expr); + match self.config.get_which_captures() { + // No capture states means we always skip them. 
+ WhichCaptures::None => return self.c(expr), + // Implicit captures states means we only add when index==0 since + // index==0 implies the group is implicit. + WhichCaptures::Implicit if index > 0 => return self.c(expr), + _ => {} } let start = self.add_capture_start(index, name)?; @@ -1841,7 +1846,7 @@ mod tests { use crate::{ nfa::thompson::{SparseTransitions, State, Transition, NFA}, - util::primitives::{PatternID, StateID}, + util::primitives::{PatternID, SmallIndex, StateID}, }; use super::*; @@ -1903,6 +1908,15 @@ mod tests { } } + fn s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State { + State::Capture { + next: sid(next), + pattern_id: pid(pattern), + group_index: SmallIndex::new(index).unwrap(), + slot: SmallIndex::new(slot).unwrap(), + } + } + fn s_fail() -> State { State::Fail } @@ -2144,4 +2158,74 @@ mod tests { NFA::compiler().configure(config).build_from_hir(&hir).unwrap(); assert_eq!(nfa.states(), &[s_fail(), s_match(0)]); } + + #[test] + fn compile_captures_all() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .unanchored_prefix(false) + .which_captures(WhichCaptures::All), + ) + .build("a(b)c") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_cap(1, 0, 0, 0), + s_byte(b'a', 2), + s_cap(3, 0, 1, 2), + s_byte(b'b', 4), + s_cap(5, 0, 1, 3), + s_byte(b'c', 6), + s_cap(7, 0, 0, 1), + s_match(0) + ] + ); + let ginfo = nfa.group_info(); + assert_eq!(2, ginfo.all_group_len()); + } + + #[test] + fn compile_captures_implicit() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .unanchored_prefix(false) + .which_captures(WhichCaptures::Implicit), + ) + .build("a(b)c") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_cap(1, 0, 0, 0), + s_byte(b'a', 2), + s_byte(b'b', 3), + s_byte(b'c', 4), + s_cap(5, 0, 0, 1), + s_match(0) + ] + ); + let ginfo = nfa.group_info(); + assert_eq!(1, ginfo.all_group_len()); + } + + #[test] + fn compile_captures_none() { + let nfa = NFA::compiler() + .configure( + NFA::config() + 
.unanchored_prefix(false) + .which_captures(WhichCaptures::None), + ) + .build("a(b)c") + .unwrap(); + assert_eq!( + nfa.states(), + &[s_byte(b'a', 1), s_byte(b'b', 2), s_byte(b'c', 3), s_match(0)] + ); + let ginfo = nfa.group_info(); + assert_eq!(0, ginfo.all_group_len()); + } } From 87d6deb1a31b754c0dd65651b026a88e6ee37b82 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 4 Aug 2023 10:50:23 -0400 Subject: [PATCH 3/8] automata: add 'which_captures' knob to meta::Regex This propagates the new Thompson NFA compiler option to the meta regex config API. --- regex-automata/src/meta/regex.rs | 80 +++++++++++++++++++++++++++++ regex-automata/src/meta/strategy.rs | 2 +- 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index 6e16ceedb..bc043793d 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -16,6 +16,7 @@ use crate::{ strategy::{self, Strategy}, wrappers, }, + nfa::thompson::WhichCaptures, util::{ captures::{Captures, GroupInfo}, iter, @@ -2429,6 +2430,7 @@ pub struct Config { utf8_empty: Option, autopre: Option, pre: Option>, + which_captures: Option, nfa_size_limit: Option>, onepass_size_limit: Option>, hybrid_cache_capacity: Option, @@ -2619,6 +2621,75 @@ impl Config { Config { pre: Some(pre), ..self } } + /// Configures what kinds of groups are compiled as "capturing" in the + /// underlying regex engine. + /// + /// This is set to [`WhichCaptures::All`] by default. Callers may wish to + /// use [`WhichCaptures::Implicit`] in cases where one wants avoid the + /// overhead of capture states for explicit groups. + /// + /// Note that another approach to avoiding the overhead of capture groups + /// is by using non-capturing groups in the regex pattern. That is, + /// `(?:a)` instead of `(a)`. This option is useful when you can't control + /// the concrete syntax but know that you don't need the underlying capture + /// states. 
For example, using `WhichCaptures::Implicit` will behave as if + /// all explicit capturing groups in the pattern were non-capturing. + /// + /// Setting this to `WhichCaptures::None` may result in an error when + /// building a meta regex. + /// + /// # Example + /// + /// This example demonstrates how the results of capture groups can change + /// based on this option. First we show the default (all capture groups in + /// the pattern are capturing): + /// + /// ``` + /// use regex_automata::{meta::Regex, Match, Span}; + /// + /// let re = Regex::new(r"foo([0-9]+)bar")?; + /// let hay = "foo123bar"; + /// + /// let mut caps = re.create_captures(); + /// re.captures(hay, &mut caps); + /// assert_eq!(Some(Span::from(0..9)), caps.get_group(0)); + /// assert_eq!(Some(Span::from(3..6)), caps.get_group(1)); + /// + /// Ok::<(), Box>(()) + /// ``` + /// + /// And now we show the behavior when we only include implicit capture + /// groups. In this case, we can only find the overall match span, but the + /// spans of any other explicit group don't exist because they are treated + /// as non-capturing. (In effect, when `WhichCaptures::Implicit` is used, + /// there is no real point in using [`Regex::captures`] since it will never + /// be able to report more information than [`Regex::find`].) 
+ /// + /// ``` + /// use regex_automata::{ + /// meta::Regex, + /// nfa::thompson::WhichCaptures, + /// Match, + /// Span, + /// }; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().which_captures(WhichCaptures::Implicit)) + /// .build(r"foo([0-9]+)bar")?; + /// let hay = "foo123bar"; + /// + /// let mut caps = re.create_captures(); + /// re.captures(hay, &mut caps); + /// assert_eq!(Some(Span::from(0..9)), caps.get_group(0)); + /// assert_eq!(None, caps.get_group(1)); + /// + /// Ok::<(), Box>(()) + /// ``` + pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config { + self.which_captures = Some(which_captures); + self + } + /// Sets the size limit, in bytes, to enforce on the construction of every /// NFA build by the meta regex engine. /// @@ -2983,6 +3054,14 @@ impl Config { self.pre.as_ref().unwrap_or(&None).as_ref() } + /// Returns the capture configuration, as set by + /// [`Config::which_captures`]. + /// + /// If it was not explicitly set, then a default value is returned. + pub fn get_which_captures(&self) -> WhichCaptures { + self.which_captures.unwrap_or(WhichCaptures::All) + } + /// Returns NFA size limit, as set by [`Config::nfa_size_limit`]. /// /// If it was not explicitly set, then a default value is returned. 
@@ -3126,6 +3205,7 @@ impl Config { utf8_empty: o.utf8_empty.or(self.utf8_empty), autopre: o.autopre.or(self.autopre), pre: o.pre.or_else(|| self.pre.clone()), + which_captures: o.which_captures.or(self.which_captures), nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit), onepass_size_limit: o .onepass_size_limit diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 52a501bf6..86610fbea 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -452,7 +452,7 @@ impl Core { .utf8(info.config().get_utf8_empty()) .nfa_size_limit(info.config().get_nfa_size_limit()) .shrink(false) - .which_captures(WhichCaptures::All) + .which_captures(info.config().get_which_captures()) .look_matcher(lookm); let nfa = thompson::Compiler::new() .configure(thompson_config.clone()) From 315207f099e835713bc0dc6d3ae1a3bd0849350a Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 4 Aug 2023 10:51:36 -0400 Subject: [PATCH 4/8] regex: use new 'which_captures' knob for RegexSet While this reduces memory usage by half, unfortunately, it's still quite a bit more than memory usage prior to regex 1.9. This is because we are still allocating room to store two offsets per regex for a rather large regex. 
--- src/builders.rs | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/builders.rs b/src/builders.rs index d19a0ffe2..a0f9b28b5 100644 --- a/src/builders.rs +++ b/src/builders.rs @@ -28,7 +28,9 @@ use alloc::{ vec::Vec, }; -use regex_automata::{meta, util::syntax, MatchKind}; +use regex_automata::{ + meta, nfa::thompson::WhichCaptures, util::syntax, MatchKind, +}; use crate::error::Error; @@ -100,8 +102,12 @@ impl Builder { } fn build_many_string(&self) -> Result { - let metac = - self.metac.clone().match_kind(MatchKind::All).utf8_empty(true); + let metac = self + .metac + .clone() + .match_kind(MatchKind::All) + .utf8_empty(true) + .which_captures(WhichCaptures::Implicit); let syntaxc = self.syntaxc.clone().utf8(true); let patterns = Arc::from(self.pats.as_slice()); meta::Builder::new() @@ -113,8 +119,12 @@ impl Builder { } fn build_many_bytes(&self) -> Result { - let metac = - self.metac.clone().match_kind(MatchKind::All).utf8_empty(false); + let metac = self + .metac + .clone() + .match_kind(MatchKind::All) + .utf8_empty(false) + .which_captures(WhichCaptures::Implicit); let syntaxc = self.syntaxc.clone().utf8(false); let patterns = Arc::from(self.pats.as_slice()); meta::Builder::new() From b2339ef50f2e0e7d6a58db8206d3ba6a8e36dc28 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 4 Aug 2023 13:54:56 -0400 Subject: [PATCH 5/8] automata: make PikeVM and backtracker work without capture states Previously, construction of these engines checked to make sure the NFA given had some capture states in it. If the NFA didn't, construction failed with an error. To support the case where the NFA has no capture states at all (to avoid gratuitous memory allocation), we remove this restriction and tweak the engine implementations to stop assuming that the NFA has capture states. This turned out to not be too hard, as we only assumed as much in a few places. 
The main reason why this restriction existed in the first place was semantics. Namely, it's important that the PikeVM remain infallible. But what happens when you ask for match offsets in a search with an NFA that has no capture states? The PikeVM just doesn't support that. Previously it would panic (and thus the reason construction would fail). But now instead it will just report "no match." It's a little hokey, but we justify it to ourselves because "simplicity" and "avoids footguns" are non-goals of this crate. --- regex-automata/src/meta/regex.rs | 6 ++- regex-automata/src/nfa/thompson/backtrack.rs | 29 ++++++------ regex-automata/src/nfa/thompson/compiler.rs | 46 +++++++++++++++----- regex-automata/src/nfa/thompson/error.rs | 12 ----- regex-automata/src/nfa/thompson/pikevm.rs | 40 +++++++---------- 5 files changed, 72 insertions(+), 61 deletions(-) diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index bc043793d..0d40eaa40 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -2635,8 +2635,10 @@ impl Config { /// states. For example, using `WhichCaptures::Implicit` will behave as if /// all explicit capturing groups in the pattern were non-capturing. /// - /// Setting this to `WhichCaptures::None` may result in an error when - /// building a meta regex. + /// Setting this to `WhichCaptures::None` is usually not the right thing to + /// do. When no capture states are compiled, some regex engines (such as + /// the `PikeVM`) won't be able to report match offsets. This will manifest + /// as no match being found. 
/// /// # Example /// /// diff --git a/regex-automata/src/nfa/thompson/backtrack.rs b/regex-automata/src/nfa/thompson/backtrack.rs index 75b6c096b..c68f9fa42 100644 --- a/regex-automata/src/nfa/thompson/backtrack.rs +++ b/regex-automata/src/nfa/thompson/backtrack.rs @@ -300,15 +300,6 @@ impl Builder { &self, nfa: NFA, ) -> Result<BoundedBacktracker, BuildError> { - // If the NFA has no captures, then the backtracker doesn't work since - // it relies on them in order to report match locations. However, in - // the special case of an NFA with no patterns, it is allowed, since - // no matches can ever be produced. And importantly, an NFA with no - // patterns has no capturing groups anyway, so this is necessary to - // permit the backtracker to work with regexes with zero patterns. - if !nfa.has_capture() && nfa.pattern_len() > 0 { - return Err(BuildError::missing_captures()); - } nfa.look_set_any().available().map_err(BuildError::word)?; Ok(BoundedBacktracker { config: self.config.clone(), nfa }) } @@ -954,8 +945,14 @@ impl BoundedBacktracker { None => return Ok(None), Some(pid) => pid, }; - let start = slots[0].unwrap().get(); - let end = slots[1].unwrap().get(); + let start = match slots[0] { + None => return Ok(None), + Some(s) => s.get(), + }; + let end = match slots[1] { + None => return Ok(None), + Some(s) => s.get(), + }; return Ok(Some(Match::new(pid, Span { start, end }))); } let ginfo = self.get_nfa().group_info(); @@ -965,8 +962,14 @@ impl BoundedBacktracker { None => return Ok(None), Some(pid) => pid, }; - let start = slots[pid.as_usize() * 2].unwrap().get(); - let end = slots[pid.as_usize() * 2 + 1].unwrap().get(); + let start = match slots[pid.as_usize() * 2] { + None => return Ok(None), + Some(s) => s.get(), + }; + let end = match slots[pid.as_usize() * 2 + 1] { + None => return Ok(None), + Some(s) => s.get(), + }; Ok(Some(Match::new(pid, Span { start, end }))) } diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index
fc3e57710..065e9ef27 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -316,8 +316,8 @@ impl Config { /// # Example /// /// This example demonstrates that some regex engines, like the Pike VM, - /// require capturing groups to be present in the NFA. Building a Pike VM - /// with an NFA without capturing groups will result in an error. + /// require capturing states to be present in the NFA to report match + /// offsets. /// /// (Note that since this method is deprecated, the example below uses /// [`Config::which_captures`] to disable capture states.) @@ -329,10 +329,13 @@ impl Config { /// WhichCaptures, /// }; /// - /// let nfa = NFA::compiler() - /// .configure(NFA::config().which_captures(WhichCaptures::None)) + /// let re = PikeVM::builder() + /// .thompson(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"[a-z]+")?; - /// assert!(PikeVM::new_from_nfa(nfa).is_err()); + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, "abc")); + /// assert_eq!(None, re.find(&mut cache, "abc")); /// /// # Ok::<(), Box>(()) /// ``` @@ -364,8 +367,8 @@ impl Config { /// # Example /// /// This example demonstrates that some regex engines, like the Pike VM, - /// require capturing groups to be present in the NFA. Building a Pike VM - /// with an NFA without capturing groups will result in an error. + /// require capturing states to be present in the NFA to report match + /// offsets. 
/// /// ``` /// use regex_automata::nfa::thompson::{ @@ -374,10 +377,33 @@ impl Config { /// WhichCaptures, /// }; /// - /// let nfa = NFA::compiler() - /// .configure(NFA::config().which_captures(WhichCaptures::None)) + /// let re = PikeVM::builder() + /// .thompson(NFA::config().which_captures(WhichCaptures::None)) + /// .build(r"[a-z]+")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, "abc")); + /// assert_eq!(None, re.find(&mut cache, "abc")); + /// + /// # Ok::<(), Box>(()) + /// ``` + /// + /// The same applies to the bounded backtracker: + /// + /// ``` + /// use regex_automata::nfa::thompson::{ + /// backtrack::BoundedBacktracker, + /// NFA, + /// WhichCaptures, + /// }; + /// + /// let re = BoundedBacktracker::builder() + /// .thompson(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"[a-z]+")?; - /// assert!(PikeVM::new_from_nfa(nfa).is_err()); + /// let mut cache = re.create_cache(); + /// + /// assert!(re.try_is_match(&mut cache, "abc")?); + /// assert_eq!(None, re.try_find(&mut cache, "abc")?); /// /// # Ok::<(), Box>(()) /// ``` diff --git a/regex-automata/src/nfa/thompson/error.rs b/regex-automata/src/nfa/thompson/error.rs index 82648813b..3c2fa8a21 100644 --- a/regex-automata/src/nfa/thompson/error.rs +++ b/regex-automata/src/nfa/thompson/error.rs @@ -68,9 +68,6 @@ enum BuildErrorKind { /// The invalid index that was given. index: u32, }, - /// An error that occurs when one tries to build an NFA simulation (such as - /// the PikeVM) without any capturing groups. - MissingCaptures, /// An error that occurs when one tries to build a reverse NFA with /// captures enabled. Currently, this isn't supported, but we probably /// should support it at some point. 
@@ -126,10 +123,6 @@ impl BuildError { BuildError { kind: BuildErrorKind::InvalidCaptureIndex { index } } } - pub(crate) fn missing_captures() -> BuildError { - BuildError { kind: BuildErrorKind::MissingCaptures } - } - #[cfg(feature = "syntax")] pub(crate) fn unsupported_captures() -> BuildError { BuildError { kind: BuildErrorKind::UnsupportedCaptures } @@ -181,11 +174,6 @@ impl core::fmt::Display for BuildError { "capture group index {} is invalid (too big or discontinuous)", index, ), - BuildErrorKind::MissingCaptures => write!( - f, - "operation requires the NFA to have capturing groups, \ - but the NFA given contains none", - ), #[cfg(feature = "syntax")] BuildErrorKind::UnsupportedCaptures => write!( f, diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 79ce3c60d..f5c0b200e 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -275,15 +275,6 @@ impl Builder { /// construction of the NFA itself will of course be ignored, since the NFA /// given here is already built. pub fn build_from_nfa(&self, nfa: NFA) -> Result<PikeVM, BuildError> { - // If the NFA has no captures, then the PikeVM doesn't work since it - // relies on them in order to report match locations. However, in - // the special case of an NFA with no patterns, it is allowed, since - // no matches can ever be produced. And importantly, an NFA with no - // patterns has no capturing groups anyway, so this is necessary to - // permit the PikeVM to work with regexes with zero patterns.
- if !nfa.has_capture() && nfa.pattern_len() > 0 { - return Err(BuildError::missing_captures()); - } nfa.look_set_any().available().map_err(BuildError::word)?; Ok(PikeVM { config: self.config.clone(), nfa }) } @@ -828,16 +819,16 @@ impl PikeVM { if self.get_nfa().pattern_len() == 1 { let mut slots = [None, None]; let pid = self.search_slots(cache, &input, &mut slots)?; - let start = slots[0].unwrap().get(); - let end = slots[1].unwrap().get(); + let start = slots[0]?.get(); + let end = slots[1]?.get(); return Some(Match::new(pid, Span { start, end })); } let ginfo = self.get_nfa().group_info(); let slots_len = ginfo.implicit_slot_len(); let mut slots = vec![None; slots_len]; let pid = self.search_slots(cache, &input, &mut slots)?; - let start = slots[pid.as_usize() * 2].unwrap().get(); - let end = slots[pid.as_usize() * 2 + 1].unwrap().get(); + let start = slots[pid.as_usize() * 2]?.get(); + let end = slots[pid.as_usize() * 2 + 1]?.get(); Some(Match::new(pid, Span { start, end })) } @@ -1123,15 +1114,15 @@ impl PikeVM { if self.get_nfa().pattern_len() == 1 { let mut enough = [None, None]; let got = self.search_slots_imp(cache, input, &mut enough); - // This is OK because we know `enough_slots` is strictly bigger - // than `slots`, otherwise this special case isn't reached. + // This is OK because we know `enough` is strictly bigger than + // `slots`, otherwise this special case isn't reached. slots.copy_from_slice(&enough[..slots.len()]); return got; } let mut enough = vec![None; min]; let got = self.search_slots_imp(cache, input, &mut enough); - // This is OK because we know `enough_slots` is strictly bigger than - // `slots`, otherwise this special case isn't reached. + // This is OK because we know `enough` is strictly bigger than `slots`, + // otherwise this special case isn't reached. 
slots.copy_from_slice(&enough[..slots.len()]); got } @@ -2108,15 +2099,16 @@ impl SlotTable { // if a 'Captures' has fewer slots, e.g., none at all or only slots // for tracking the overall match instead of all slots for every // group. - self.slots_for_captures = nfa.group_info().slot_len(); + self.slots_for_captures = core::cmp::max( + self.slots_per_state, + nfa.pattern_len().checked_mul(2).unwrap(), + ); let len = nfa .states() .len() - // We add 1 so that our last row is always empty. We use it as - // "scratch" space for computing the epsilon closure off of the - // starting state. - .checked_add(1) - .and_then(|x| x.checked_mul(self.slots_per_state)) + .checked_mul(self.slots_per_state) + // Add space to account for scratch space used during a search. + .and_then(|x| x.checked_add(self.slots_for_captures)) // It seems like this could actually panic on legitimate inputs on // 32-bit targets, and very likely to panic on 16-bit. Should we // somehow convert this to an error? What about something similar @@ -2170,7 +2162,7 @@ impl SlotTable { /// compute an epsilon closure outside of the user supplied regex, and thus /// never want it to have any capturing slots set. fn all_absent(&mut self) -> &mut [Option] { - let i = self.table.len() - self.slots_per_state; + let i = self.table.len() - self.slots_for_captures; &mut self.table[i..i + self.slots_for_captures] } } From 9d86815a7de06c25795537cc0baf17823354d676 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 4 Aug 2023 13:58:30 -0400 Subject: [PATCH 6/8] regex: switch RegexSet to use WhichCaptures::None And this finally resolves the memory usage problem, as the PikeVM cache used by the RegexSet in #1059 no longer allocates MBs of memory because of the existence of impossible-to-use capturing groups. 
Fixes #1059 --- src/builders.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/builders.rs b/src/builders.rs index a0f9b28b5..46c4824c5 100644 --- a/src/builders.rs +++ b/src/builders.rs @@ -107,7 +107,7 @@ impl Builder { .clone() .match_kind(MatchKind::All) .utf8_empty(true) - .which_captures(WhichCaptures::Implicit); + .which_captures(WhichCaptures::None); let syntaxc = self.syntaxc.clone().utf8(true); let patterns = Arc::from(self.pats.as_slice()); meta::Builder::new() @@ -124,7 +124,7 @@ impl Builder { .clone() .match_kind(MatchKind::All) .utf8_empty(false) - .which_captures(WhichCaptures::Implicit); + .which_captures(WhichCaptures::None); let syntaxc = self.syntaxc.clone().utf8(false); let patterns = Arc::from(self.pats.as_slice()); meta::Builder::new() From 3527ee0353a9d103fe9945cf9d2cd7dcba061313 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 5 Aug 2023 10:41:24 -0400 Subject: [PATCH 7/8] automata: add 'is_match' as its own path to meta regex internals I originally prided myself on not having a dedicated `is_match` routine on the meta regex engine's internal `Strategy` trait, and actually spent a fair amount of attention ensuring that `is_match` and `find` always returned the same results. That is, `is_match` returns true if and only if `find` returns a match. But the fix in the previous commits for #1059 means that a `PikeVM` and a `BoundedBacktracker` can be used to run a search with an NFA that has no capture states. Since both engines are implemented to only track offsets via those capture states, it follows that the only thing that can be returned in such cases is whether a match occurs (and if so, which pattern matched). That in turn means that `is_match` can return `true` while `find` can return `None` for the same search. This is because the latter returns `None` even when a match is found but there are no capture states to record the offsets of the match. 
This in theory could be resolved by adding APIs to the `PikeVM` and the `BoundedBacktracker` that return a `HalfMatch` without depending on any capture states at all. Then `is_match` could be implemented in terms of those APIs. That is probably the right path, but it's pretty gnarly to do without breaking changes and I don't want to do any breaking changes right now. So instead, we just add a special path to the meta regex engine for `is_match` and permit some cases to have different results between `is_match` and `find`. Sigh. --- regex-automata/src/meta/regex.rs | 9 ++- regex-automata/src/meta/strategy.rs | 120 +++++++++++++++++++++++++++- regex-automata/src/meta/wrappers.rs | 32 ++++++++ 3 files changed, 158 insertions(+), 3 deletions(-) diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index 0d40eaa40..3a04b14d8 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -529,7 +529,14 @@ impl Regex { #[inline] pub fn is_match<'h, I: Into>>(&self, input: I) -> bool { let input = input.into().earliest(true); - self.search_half(&input).is_some() + if self.imp.info.is_impossible(&input) { + return false; + } + let mut guard = self.pool.get(); + let result = self.imp.strat.is_match(&mut guard, &input); + // See 'Regex::search' for why we put the guard back explicitly. + PoolGuard::put(guard); + result } /// Executes a leftmost search and returns the first match that is found, diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 86610fbea..ea6c6ab57 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -58,6 +58,8 @@ pub(super) trait Strategy: input: &Input<'_>, ) -> Option; + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool; + fn search_slots( &self, cache: &mut Cache, @@ -399,6 +401,10 @@ impl Strategy for Pre

{ self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end())) } + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + self.search(cache, input).is_some() + } + fn search_slots( &self, cache: &mut Cache, @@ -623,6 +629,29 @@ impl Core { } } + fn is_match_nofail(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if let Some(ref e) = self.onepass.get(input) { + trace!( + "using OnePass for is-match search at {:?}", + input.get_span() + ); + e.search_slots(&mut cache.onepass, input, &mut []).is_some() + } else if let Some(ref e) = self.backtrack.get(input) { + trace!( + "using BoundedBacktracker for is-match search at {:?}", + input.get_span() + ); + e.is_match(&mut cache.backtrack, input) + } else { + trace!( + "using PikeVM for is-match search at {:?}", + input.get_span() + ); + let e = self.pikevm.get(); + e.is_match(&mut cache.pikevm, input) + } + } + fn is_capture_search_needed(&self, slots_len: usize) -> bool { slots_len > self.nfa.group_info().implicit_slot_len() } @@ -703,7 +732,7 @@ impl Strategy for Core { // The main difference with 'search' is that if we're using a DFA, we // can use a single forward scan without needing to run the reverse // DFA. 
- return if let Some(e) = self.dfa.get(input) { + if let Some(e) = self.dfa.get(input) { trace!("using full DFA for half search at {:?}", input.get_span()); match e.try_search_half_fwd(input) { Ok(x) => x, @@ -723,7 +752,38 @@ impl Strategy for Core { } } else { self.search_half_nofail(cache, input) - }; + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if let Some(e) = self.dfa.get(input) { + trace!( + "using full DFA for is-match search at {:?}", + input.get_span() + ); + match e.try_search_half_fwd(input) { + Ok(x) => x.is_some(), + Err(_err) => { + trace!("full DFA half search failed: {}", _err); + self.is_match_nofail(cache, input) + } + } + } else if let Some(e) = self.hybrid.get(input) { + trace!( + "using lazy DFA for is-match search at {:?}", + input.get_span() + ); + match e.try_search_half_fwd(&mut cache.hybrid, input) { + Ok(x) => x.is_some(), + Err(_err) => { + trace!("lazy DFA half search failed: {}", _err); + self.is_match_nofail(cache, input) + } + } + } else { + self.is_match_nofail(cache, input) + } } #[cfg_attr(feature = "perf-inline", inline(always))] @@ -983,6 +1043,21 @@ impl Strategy for ReverseAnchored { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if input.get_anchored().is_anchored() { + return self.core.is_match(cache, input); + } + match self.try_search_half_anchored_rev(cache, input) { + Err(_err) => { + trace!("fast reverse anchored search failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Ok(None) => false, + Ok(Some(_)) => true, + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, @@ -1335,6 +1410,28 @@ impl Strategy for ReverseSuffix { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if input.get_anchored().is_anchored() { + return 
self.core.is_match(cache, input); + } + match self.try_search_half_start(cache, input) { + Err(RetryError::Quadratic(_err)) => { + trace!("reverse suffix half optimization failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Err(RetryError::Fail(_err)) => { + trace!( + "reverse suffix reverse fast half search failed: {}", + _err + ); + self.core.is_match_nofail(cache, input) + } + Ok(None) => false, + Ok(Some(_)) => true, + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, @@ -1717,6 +1814,25 @@ impl Strategy for ReverseInner { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if input.get_anchored().is_anchored() { + return self.core.is_match(cache, input); + } + match self.try_search_full(cache, input) { + Err(RetryError::Quadratic(_err)) => { + trace!("reverse inner half optimization failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Err(RetryError::Fail(_err)) => { + trace!("reverse inner fast half search failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Ok(None) => false, + Ok(Some(_)) => true, + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, diff --git a/regex-automata/src/meta/wrappers.rs b/regex-automata/src/meta/wrappers.rs index 8f58363a1..08110d9bb 100644 --- a/regex-automata/src/meta/wrappers.rs +++ b/regex-automata/src/meta/wrappers.rs @@ -87,6 +87,15 @@ impl PikeVMEngine { Ok(PikeVMEngine(engine)) } + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn is_match( + &self, + cache: &mut PikeVMCache, + input: &Input<'_>, + ) -> bool { + self.0.is_match(cache.0.as_mut().unwrap(), input.clone()) + } + #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn search_slots( &self, @@ -212,6 +221,29 @@ impl BoundedBacktrackerEngine { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn is_match( + &self, + cache: 
&mut BoundedBacktrackerCache, + input: &Input<'_>, + ) -> bool { + #[cfg(feature = "nfa-backtrack")] + { + // OK because we only permit access to this engine when we know + // the haystack is short enough for the backtracker to run without + // reporting an error. + self.0 + .try_is_match(cache.0.as_mut().unwrap(), input.clone()) + .unwrap() + } + #[cfg(not(feature = "nfa-backtrack"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn search_slots( &self, From a6948a8975b178bb9c390d0824550be4e8bf0f47 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 5 Aug 2023 13:52:37 -0400 Subject: [PATCH 8/8] automata: add internal HalfMatch APIs for NFA engines Welp, okay, turns out we do need to know at least the end offset of a match even when the NFA has no capture states. This is necessary for correctly handling the case where a regex can match the empty string but the caller has asked that matches not split a codepoint. If we don't know the end offset of a match, then we can't correctly determine whether a match exists or not and are forced to return no match even when a match exists. We can get away with this I think for `find`-style APIs where the caller has specifically requested match offsets while simultaneously configuring the NFA to not track offsets, but with `is_match`-style APIs, we really should be able to handle it correctly. We should eventually just expose the `HalfMatch` APIs on `PikeVM` and `BoundedBacktracker`, but for now we keep them private. 
--- regex-automata/src/nfa/thompson/backtrack.rs | 59 ++++++++----------- regex-automata/src/nfa/thompson/pikevm.rs | 60 +++++++++----------- 2 files changed, 49 insertions(+), 70 deletions(-) diff --git a/regex-automata/src/nfa/thompson/backtrack.rs b/regex-automata/src/nfa/thompson/backtrack.rs index c68f9fa42..eba037c1d 100644 --- a/regex-automata/src/nfa/thompson/backtrack.rs +++ b/regex-automata/src/nfa/thompson/backtrack.rs @@ -19,7 +19,7 @@ use crate::{ empty, iter, prefilter::Prefilter, primitives::{NonMaxUsize, PatternID, SmallIndex, StateID}, - search::{Anchored, Input, Match, MatchError, Span}, + search::{Anchored, HalfMatch, Input, Match, MatchError, Span}, }, }; @@ -1295,12 +1295,14 @@ impl BoundedBacktracker { ) -> Result, MatchError> { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); if !utf8empty { - return self.try_search_slots_imp(cache, input, slots); + let maybe_hm = self.try_search_slots_imp(cache, input, slots)?; + return Ok(maybe_hm.map(|hm| hm.pattern())); } // See PikeVM::try_search_slots for why we do this. let min = self.get_nfa().group_info().implicit_slot_len(); if slots.len() >= min { - return self.try_search_slots_imp(cache, input, slots); + let maybe_hm = self.try_search_slots_imp(cache, input, slots)?; + return Ok(maybe_hm.map(|hm| hm.pattern())); } if self.get_nfa().pattern_len() == 1 { let mut enough = [None, None]; @@ -1308,14 +1310,14 @@ impl BoundedBacktracker { // This is OK because we know `enough_slots` is strictly bigger // than `slots`, otherwise this special case isn't reached. slots.copy_from_slice(&enough[..slots.len()]); - return Ok(got); + return Ok(got.map(|hm| hm.pattern())); } let mut enough = vec![None; min]; let got = self.try_search_slots_imp(cache, input, &mut enough)?; // This is OK because we know `enough_slots` is strictly bigger than // `slots`, otherwise this special case isn't reached. 
slots.copy_from_slice(&enough[..slots.len()]); - Ok(got) + Ok(got.map(|hm| hm.pattern())) } /// This is the actual implementation of `try_search_slots_imp` that @@ -1328,30 +1330,17 @@ impl BoundedBacktracker { cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], - ) -> Result, MatchError> { + ) -> Result, MatchError> { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); - let (pid, end) = match self.search_imp(cache, input, slots)? { + let hm = match self.search_imp(cache, input, slots)? { None => return Ok(None), - Some(pid) if !utf8empty => return Ok(Some(pid)), - Some(pid) => { - let slot_start = pid.as_usize() * 2; - let slot_end = slot_start + 1; - // OK because we know we have a match and we know our caller - // provided slots are big enough (which we make true above if - // the caller didn't). Namely, we're only here when 'utf8empty' - // is true, and when that's true, we require slots for every - // pattern. - (pid, slots[slot_end].unwrap().get()) - } + Some(hm) if !utf8empty => return Ok(Some(hm)), + Some(hm) => hm, }; - empty::skip_splits_fwd(input, pid, end, |input| { - let pid = match self.search_imp(cache, input, slots)? { - None => return Ok(None), - Some(pid) => pid, - }; - let slot_start = pid.as_usize() * 2; - let slot_end = slot_start + 1; - Ok(Some((pid, slots[slot_end].unwrap().get()))) + empty::skip_splits_fwd(input, hm, hm.offset(), |input| { + Ok(self + .search_imp(cache, input, slots)? + .map(|hm| (hm, hm.offset()))) }) } @@ -1367,7 +1356,7 @@ impl BoundedBacktracker { cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], - ) -> Result, MatchError> { + ) -> Result, MatchError> { // Unlike in the PikeVM, we write our capturing group spans directly // into the caller's captures groups. So we have to make sure we're // starting with a blank slate first. 
In the PikeVM, we avoid this @@ -1414,10 +1403,9 @@ impl BoundedBacktracker { Some(ref span) => at = span.start, } } - if let Some(pid) = - self.backtrack(cache, input, at, start_id, slots) + if let Some(hm) = self.backtrack(cache, input, at, start_id, slots) { - return Ok(Some(pid)); + return Ok(Some(hm)); } at += 1; } @@ -1438,14 +1426,13 @@ impl BoundedBacktracker { at: usize, start_id: StateID, slots: &mut [Option], - ) -> Option { + ) -> Option { cache.stack.push(Frame::Step { sid: start_id, at }); while let Some(frame) = cache.stack.pop() { match frame { Frame::Step { sid, at } => { - if let Some(pid) = self.step(cache, input, sid, at, slots) - { - return Some(pid); + if let Some(hm) = self.step(cache, input, sid, at, slots) { + return Some(hm); } } Frame::RestoreCapture { slot, offset } => { @@ -1475,7 +1462,7 @@ impl BoundedBacktracker { mut sid: StateID, mut at: usize, slots: &mut [Option], - ) -> Option { + ) -> Option { loop { if !cache.visited.insert(sid, at - input.start()) { return None; @@ -1558,7 +1545,7 @@ impl BoundedBacktracker { } State::Fail => return None, State::Match { pattern_id } => { - return Some(pattern_id); + return Some(HalfMatch::new(pattern_id, at)); } } } diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index f5c0b200e..0128c151a 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -17,7 +17,9 @@ use crate::{ empty, iter, prefilter::Prefilter, primitives::{NonMaxUsize, PatternID, SmallIndex, StateID}, - search::{Anchored, Input, Match, MatchKind, PatternSet, Span}, + search::{ + Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span, + }, sparse_set::SparseSet, }, }; @@ -1094,7 +1096,8 @@ impl PikeVM { ) -> Option { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); if !utf8empty { - return self.search_slots_imp(cache, input, slots); + let hm = self.search_slots_imp(cache, input, slots)?; + return 
Some(hm.pattern()); } // There is an unfortunate special case where if the regex can // match the empty string and UTF-8 mode is enabled, the search @@ -1109,7 +1112,8 @@ impl PikeVM { // this case. let min = self.get_nfa().group_info().implicit_slot_len(); if slots.len() >= min { - return self.search_slots_imp(cache, input, slots); + let hm = self.search_slots_imp(cache, input, slots)?; + return Some(hm.pattern()); } if self.get_nfa().pattern_len() == 1 { let mut enough = [None, None]; @@ -1117,14 +1121,14 @@ impl PikeVM { // This is OK because we know `enough` is strictly bigger than // `slots`, otherwise this special case isn't reached. slots.copy_from_slice(&enough[..slots.len()]); - return got; + return got.map(|hm| hm.pattern()); } let mut enough = vec![None; min]; let got = self.search_slots_imp(cache, input, &mut enough); // This is OK because we know `enough` is strictly bigger than `slots`, // otherwise this special case isn't reached. slots.copy_from_slice(&enough[..slots.len()]); - got + got.map(|hm| hm.pattern()) } /// This is the actual implementation of `search_slots_imp` that @@ -1137,30 +1141,17 @@ impl PikeVM { cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], - ) -> Option { + ) -> Option { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); - let (pid, end) = match self.search_imp(cache, input, slots) { + let hm = match self.search_imp(cache, input, slots) { None => return None, - Some(pid) if !utf8empty => return Some(pid), - Some(pid) => { - let slot_start = pid.as_usize() * 2; - let slot_end = slot_start + 1; - // OK because we know we have a match and we know our caller - // provided slots are big enough (which we make true above if - // the caller didn't). Namely, we're only here when 'utf8empty' - // is true, and when that's true, we require slots for every - // pattern. 
- (pid, slots[slot_end].unwrap().get()) - } + Some(hm) if !utf8empty => return Some(hm), + Some(hm) => hm, }; - empty::skip_splits_fwd(input, pid, end, |input| { - let pid = match self.search_imp(cache, input, slots) { - None => return Ok(None), - Some(pid) => pid, - }; - let slot_start = pid.as_usize() * 2; - let slot_end = slot_start + 1; - Ok(Some((pid, slots[slot_end].unwrap().get()))) + empty::skip_splits_fwd(input, hm, hm.offset(), |input| { + Ok(self + .search_imp(cache, input, slots) + .map(|hm| (hm, hm.offset()))) }) // OK because the PikeVM never errors. .unwrap() @@ -1235,7 +1226,7 @@ impl PikeVM { cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], - ) -> Option { + ) -> Option { cache.setup_search(slots.len()); if input.is_done() { return None; @@ -1264,7 +1255,7 @@ impl PikeVM { let pre = if anchored { None } else { self.get_config().get_prefilter() }; let Cache { ref mut stack, ref mut curr, ref mut next } = cache; - let mut pid = None; + let mut hm = None; // Yes, our search doesn't end at input.end(), but includes it. This // is necessary because matches are delayed by one byte, just like // how the DFA engines work. The delay is used to handle look-behind @@ -1283,7 +1274,7 @@ impl PikeVM { if curr.set.is_empty() { // We have a match and we haven't been instructed to continue // on even after finding a match, so we can quit. - if pid.is_some() && !allmatches { + if hm.is_some() && !allmatches { break; } // If we're running an anchored search and we've advanced @@ -1353,7 +1344,7 @@ impl PikeVM { // search. If we re-computed it at every position, we would be // simulating an unanchored search when we were tasked to perform // an anchored search. 
- if (!pid.is_some() || allmatches) + if (!hm.is_some() || allmatches) && (!anchored || at == input.start()) { // Since we are adding to the 'curr' active states and since @@ -1372,14 +1363,15 @@ impl PikeVM { let slots = next.slot_table.all_absent(); self.epsilon_closure(stack, slots, curr, input, at, start_id); } - if let Some(x) = self.nexts(stack, curr, next, input, at, slots) { - pid = Some(x); + if let Some(pid) = self.nexts(stack, curr, next, input, at, slots) + { + hm = Some(HalfMatch::new(pid, at)); } // Unless the caller asked us to return early, we need to mush on // to see if we can extend our match. (But note that 'nexts' will // quit right after seeing a match when match_kind==LeftmostFirst, // as is consistent with leftmost-first match priority.) - if input.get_earliest() && pid.is_some() { + if input.get_earliest() && hm.is_some() { break; } core::mem::swap(curr, next); @@ -1387,7 +1379,7 @@ impl PikeVM { at += 1; } instrument!(|c| c.eprint(&self.nfa)); - pid + hm } /// The implementation for the 'which_overlapping_matches' API. Basically,