diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 75ca85e6e..6da865f97 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -1170,7 +1170,10 @@ impl Builder { .clone() // We can always forcefully disable captures because DFAs do not // support them. - .configure(thompson::Config::new().captures(false)) + .configure( + thompson::Config::new() + .which_captures(thompson::WhichCaptures::None), + ) .build_many(patterns) .map_err(BuildError::nfa)?; self.build_from_nfa(&nfa) diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs index 86963248f..67261c1a3 100644 --- a/regex-automata/src/hybrid/dfa.rs +++ b/regex-automata/src/hybrid/dfa.rs @@ -3973,7 +3973,10 @@ impl Builder { .clone() // We can always forcefully disable captures because DFAs do not // support them. - .configure(thompson::Config::new().captures(false)) + .configure( + thompson::Config::new() + .which_captures(thompson::WhichCaptures::None), + ) .build_many(patterns) .map_err(BuildError::nfa)?; self.build_from_nfa(nfa) diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index 6e16ceedb..3a04b14d8 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -16,6 +16,7 @@ use crate::{ strategy::{self, Strategy}, wrappers, }, + nfa::thompson::WhichCaptures, util::{ captures::{Captures, GroupInfo}, iter, @@ -528,7 +529,14 @@ impl Regex { #[inline] pub fn is_match<'h, I: Into>>(&self, input: I) -> bool { let input = input.into().earliest(true); - self.search_half(&input).is_some() + if self.imp.info.is_impossible(&input) { + return false; + } + let mut guard = self.pool.get(); + let result = self.imp.strat.is_match(&mut guard, &input); + // See 'Regex::search' for why we put the guard back explicitly. 
+ PoolGuard::put(guard); + result } /// Executes a leftmost search and returns the first match that is found, @@ -2429,6 +2437,7 @@ pub struct Config { utf8_empty: Option, autopre: Option, pre: Option>, + which_captures: Option, nfa_size_limit: Option>, onepass_size_limit: Option>, hybrid_cache_capacity: Option, @@ -2619,6 +2628,77 @@ impl Config { Config { pre: Some(pre), ..self } } + /// Configures what kinds of groups are compiled as "capturing" in the + /// underlying regex engine. + /// + /// This is set to [`WhichCaptures::All`] by default. Callers may wish to + /// use [`WhichCaptures::Implicit`] in cases where one wants to avoid the + /// overhead of capture states for explicit groups. + /// + /// Note that another approach to avoiding the overhead of capture groups + /// is by using non-capturing groups in the regex pattern. That is, + /// `(?:a)` instead of `(a)`. This option is useful when you can't control + /// the concrete syntax but know that you don't need the underlying capture + /// states. For example, using `WhichCaptures::Implicit` will behave as if + /// all explicit capturing groups in the pattern were non-capturing. + /// + /// Setting this to `WhichCaptures::None` is usually not the right thing to + /// do. When no capture states are compiled, some regex engines (such as + /// the `PikeVM`) won't be able to report match offsets. This will manifest + /// as no match being found. + /// + /// # Example + /// + /// This example demonstrates how the results of capture groups can change + /// based on this option. 
First we show the default (all capture groups in + /// the pattern are capturing): + /// + /// ``` + /// use regex_automata::{meta::Regex, Match, Span}; + /// + /// let re = Regex::new(r"foo([0-9]+)bar")?; + /// let hay = "foo123bar"; + /// + /// let mut caps = re.create_captures(); + /// re.captures(hay, &mut caps); + /// assert_eq!(Some(Span::from(0..9)), caps.get_group(0)); + /// assert_eq!(Some(Span::from(3..6)), caps.get_group(1)); + /// + /// Ok::<(), Box>(()) + /// ``` + /// + /// And now we show the behavior when we only include implicit capture + /// groups. In this case, we can only find the overall match span, but the + /// spans of any other explicit group don't exist because they are treated + /// as non-capturing. (In effect, when `WhichCaptures::Implicit` is used, + /// there is no real point in using [`Regex::captures`] since it will never + /// be able to report more information than [`Regex::find`].) + /// + /// ``` + /// use regex_automata::{ + /// meta::Regex, + /// nfa::thompson::WhichCaptures, + /// Match, + /// Span, + /// }; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().which_captures(WhichCaptures::Implicit)) + /// .build(r"foo([0-9]+)bar")?; + /// let hay = "foo123bar"; + /// + /// let mut caps = re.create_captures(); + /// re.captures(hay, &mut caps); + /// assert_eq!(Some(Span::from(0..9)), caps.get_group(0)); + /// assert_eq!(None, caps.get_group(1)); + /// + /// Ok::<(), Box>(()) + /// ``` + pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config { + self.which_captures = Some(which_captures); + self + } + /// Sets the size limit, in bytes, to enforce on the construction of every /// NFA build by the meta regex engine. /// @@ -2983,6 +3063,14 @@ impl Config { self.pre.as_ref().unwrap_or(&None).as_ref() } + /// Returns the capture configuration, as set by + /// [`Config::which_captures`]. + /// + /// If it was not explicitly set, then a default value is returned. 
+ pub fn get_which_captures(&self) -> WhichCaptures { + self.which_captures.unwrap_or(WhichCaptures::All) + } + /// Returns NFA size limit, as set by [`Config::nfa_size_limit`]. /// /// If it was not explicitly set, then a default value is returned. @@ -3126,6 +3214,7 @@ impl Config { utf8_empty: o.utf8_empty.or(self.utf8_empty), autopre: o.autopre.or(self.autopre), pre: o.pre.or_else(|| self.pre.clone()), + which_captures: o.which_captures.or(self.which_captures), nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit), onepass_size_limit: o .onepass_size_limit diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index aa1d61ef3..ea6c6ab57 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -13,7 +13,7 @@ use crate::{ regex::{Cache, RegexInfo}, reverse_inner, wrappers, }, - nfa::thompson::{self, NFA}, + nfa::thompson::{self, WhichCaptures, NFA}, util::{ captures::{Captures, GroupInfo}, look::LookMatcher, @@ -58,6 +58,8 @@ pub(super) trait Strategy: input: &Input<'_>, ) -> Option; + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool; + fn search_slots( &self, cache: &mut Cache, @@ -399,6 +401,10 @@ impl Strategy for Pre

{ self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end())) } + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + self.search(cache, input).is_some() + } + fn search_slots( &self, cache: &mut Cache, @@ -452,7 +458,7 @@ impl Core { .utf8(info.config().get_utf8_empty()) .nfa_size_limit(info.config().get_nfa_size_limit()) .shrink(false) - .captures(true) + .which_captures(info.config().get_which_captures()) .look_matcher(lookm); let nfa = thompson::Compiler::new() .configure(thompson_config.clone()) @@ -499,7 +505,10 @@ impl Core { // useful with capturing groups in reverse. And of course, // the lazy DFA ignores capturing groups in all cases. .configure( - thompson_config.clone().captures(false).reverse(true), + thompson_config + .clone() + .which_captures(WhichCaptures::None) + .reverse(true), ) .build_many_from_hir(hirs) .map_err(BuildError::nfa)?; @@ -620,6 +629,29 @@ impl Core { } } + fn is_match_nofail(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if let Some(ref e) = self.onepass.get(input) { + trace!( + "using OnePass for is-match search at {:?}", + input.get_span() + ); + e.search_slots(&mut cache.onepass, input, &mut []).is_some() + } else if let Some(ref e) = self.backtrack.get(input) { + trace!( + "using BoundedBacktracker for is-match search at {:?}", + input.get_span() + ); + e.is_match(&mut cache.backtrack, input) + } else { + trace!( + "using PikeVM for is-match search at {:?}", + input.get_span() + ); + let e = self.pikevm.get(); + e.is_match(&mut cache.pikevm, input) + } + } + fn is_capture_search_needed(&self, slots_len: usize) -> bool { slots_len > self.nfa.group_info().implicit_slot_len() } @@ -700,7 +732,7 @@ impl Strategy for Core { // The main difference with 'search' is that if we're using a DFA, we // can use a single forward scan without needing to run the reverse // DFA. 
- return if let Some(e) = self.dfa.get(input) { + if let Some(e) = self.dfa.get(input) { trace!("using full DFA for half search at {:?}", input.get_span()); match e.try_search_half_fwd(input) { Ok(x) => x, @@ -720,7 +752,38 @@ impl Strategy for Core { } } else { self.search_half_nofail(cache, input) - }; + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if let Some(e) = self.dfa.get(input) { + trace!( + "using full DFA for is-match search at {:?}", + input.get_span() + ); + match e.try_search_half_fwd(input) { + Ok(x) => x.is_some(), + Err(_err) => { + trace!("full DFA half search failed: {}", _err); + self.is_match_nofail(cache, input) + } + } + } else if let Some(e) = self.hybrid.get(input) { + trace!( + "using lazy DFA for is-match search at {:?}", + input.get_span() + ); + match e.try_search_half_fwd(&mut cache.hybrid, input) { + Ok(x) => x.is_some(), + Err(_err) => { + trace!("lazy DFA half search failed: {}", _err); + self.is_match_nofail(cache, input) + } + } + } else { + self.is_match_nofail(cache, input) + } } #[cfg_attr(feature = "perf-inline", inline(always))] @@ -980,6 +1043,21 @@ impl Strategy for ReverseAnchored { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if input.get_anchored().is_anchored() { + return self.core.is_match(cache, input); + } + match self.try_search_half_anchored_rev(cache, input) { + Err(_err) => { + trace!("fast reverse anchored search failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Ok(None) => false, + Ok(Some(_)) => true, + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, @@ -1332,6 +1410,28 @@ impl Strategy for ReverseSuffix { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if input.get_anchored().is_anchored() { + return 
self.core.is_match(cache, input); + } + match self.try_search_half_start(cache, input) { + Err(RetryError::Quadratic(_err)) => { + trace!("reverse suffix half optimization failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Err(RetryError::Fail(_err)) => { + trace!( + "reverse suffix reverse fast half search failed: {}", + _err + ); + self.core.is_match_nofail(cache, input) + } + Ok(None) => false, + Ok(Some(_)) => true, + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, @@ -1480,7 +1580,7 @@ impl ReverseInner { .utf8(core.info.config().get_utf8_empty()) .nfa_size_limit(core.info.config().get_nfa_size_limit()) .shrink(false) - .captures(false) + .which_captures(WhichCaptures::None) .look_matcher(lookm); let result = thompson::Compiler::new() .configure(thompson_config) @@ -1714,6 +1814,25 @@ impl Strategy for ReverseInner { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if input.get_anchored().is_anchored() { + return self.core.is_match(cache, input); + } + match self.try_search_full(cache, input) { + Err(RetryError::Quadratic(_err)) => { + trace!("reverse inner half optimization failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Err(RetryError::Fail(_err)) => { + trace!("reverse inner fast half search failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Ok(None) => false, + Ok(Some(_)) => true, + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, diff --git a/regex-automata/src/meta/wrappers.rs b/regex-automata/src/meta/wrappers.rs index 8f58363a1..08110d9bb 100644 --- a/regex-automata/src/meta/wrappers.rs +++ b/regex-automata/src/meta/wrappers.rs @@ -87,6 +87,15 @@ impl PikeVMEngine { Ok(PikeVMEngine(engine)) } + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn is_match( + &self, + cache: &mut PikeVMCache, + input: &Input<'_>, + ) -> bool { + 
self.0.is_match(cache.0.as_mut().unwrap(), input.clone()) + } + #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn search_slots( &self, @@ -212,6 +221,29 @@ impl BoundedBacktrackerEngine { } } + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn is_match( + &self, + cache: &mut BoundedBacktrackerCache, + input: &Input<'_>, + ) -> bool { + #[cfg(feature = "nfa-backtrack")] + { + // OK because we only permit access to this engine when we know + // the haystack is short enough for the backtracker to run without + // reporting an error. + self.0 + .try_is_match(cache.0.as_mut().unwrap(), input.clone()) + .unwrap() + } + #[cfg(not(feature = "nfa-backtrack"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn search_slots( &self, diff --git a/regex-automata/src/nfa/thompson/backtrack.rs b/regex-automata/src/nfa/thompson/backtrack.rs index 75b6c096b..eba037c1d 100644 --- a/regex-automata/src/nfa/thompson/backtrack.rs +++ b/regex-automata/src/nfa/thompson/backtrack.rs @@ -19,7 +19,7 @@ use crate::{ empty, iter, prefilter::Prefilter, primitives::{NonMaxUsize, PatternID, SmallIndex, StateID}, - search::{Anchored, Input, Match, MatchError, Span}, + search::{Anchored, HalfMatch, Input, Match, MatchError, Span}, }, }; @@ -300,15 +300,6 @@ impl Builder { &self, nfa: NFA, ) -> Result { - // If the NFA has no captures, then the backtracker doesn't work since - // it relies on them in order to report match locations. However, in - // the special case of an NFA with no patterns, it is allowed, since - // no matches can ever be produced. And importantly, an NFA with no - // patterns has no capturing groups anyway, so this is necessary to - // permit the backtracker to work with regexes with zero patterns. 
- if !nfa.has_capture() && nfa.pattern_len() > 0 { - return Err(BuildError::missing_captures()); - } nfa.look_set_any().available().map_err(BuildError::word)?; Ok(BoundedBacktracker { config: self.config.clone(), nfa }) } @@ -954,8 +945,14 @@ impl BoundedBacktracker { None => return Ok(None), Some(pid) => pid, }; - let start = slots[0].unwrap().get(); - let end = slots[1].unwrap().get(); + let start = match slots[0] { + None => return Ok(None), + Some(s) => s.get(), + }; + let end = match slots[1] { + None => return Ok(None), + Some(s) => s.get(), + }; return Ok(Some(Match::new(pid, Span { start, end }))); } let ginfo = self.get_nfa().group_info(); @@ -965,8 +962,14 @@ impl BoundedBacktracker { None => return Ok(None), Some(pid) => pid, }; - let start = slots[pid.as_usize() * 2].unwrap().get(); - let end = slots[pid.as_usize() * 2 + 1].unwrap().get(); + let start = match slots[pid.as_usize() * 2] { + None => return Ok(None), + Some(s) => s.get(), + }; + let end = match slots[pid.as_usize() * 2 + 1] { + None => return Ok(None), + Some(s) => s.get(), + }; Ok(Some(Match::new(pid, Span { start, end }))) } @@ -1292,12 +1295,14 @@ impl BoundedBacktracker { ) -> Result, MatchError> { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); if !utf8empty { - return self.try_search_slots_imp(cache, input, slots); + let maybe_hm = self.try_search_slots_imp(cache, input, slots)?; + return Ok(maybe_hm.map(|hm| hm.pattern())); } // See PikeVM::try_search_slots for why we do this. 
let min = self.get_nfa().group_info().implicit_slot_len(); if slots.len() >= min { - return self.try_search_slots_imp(cache, input, slots); + let maybe_hm = self.try_search_slots_imp(cache, input, slots)?; + return Ok(maybe_hm.map(|hm| hm.pattern())); } if self.get_nfa().pattern_len() == 1 { let mut enough = [None, None]; @@ -1305,14 +1310,14 @@ impl BoundedBacktracker { // This is OK because we know `enough_slots` is strictly bigger // than `slots`, otherwise this special case isn't reached. slots.copy_from_slice(&enough[..slots.len()]); - return Ok(got); + return Ok(got.map(|hm| hm.pattern())); } let mut enough = vec![None; min]; let got = self.try_search_slots_imp(cache, input, &mut enough)?; // This is OK because we know `enough_slots` is strictly bigger than // `slots`, otherwise this special case isn't reached. slots.copy_from_slice(&enough[..slots.len()]); - Ok(got) + Ok(got.map(|hm| hm.pattern())) } /// This is the actual implementation of `try_search_slots_imp` that @@ -1325,30 +1330,17 @@ impl BoundedBacktracker { cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], - ) -> Result, MatchError> { + ) -> Result, MatchError> { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); - let (pid, end) = match self.search_imp(cache, input, slots)? { + let hm = match self.search_imp(cache, input, slots)? { None => return Ok(None), - Some(pid) if !utf8empty => return Ok(Some(pid)), - Some(pid) => { - let slot_start = pid.as_usize() * 2; - let slot_end = slot_start + 1; - // OK because we know we have a match and we know our caller - // provided slots are big enough (which we make true above if - // the caller didn't). Namely, we're only here when 'utf8empty' - // is true, and when that's true, we require slots for every - // pattern. 
- (pid, slots[slot_end].unwrap().get()) - } + Some(hm) if !utf8empty => return Ok(Some(hm)), + Some(hm) => hm, }; - empty::skip_splits_fwd(input, pid, end, |input| { - let pid = match self.search_imp(cache, input, slots)? { - None => return Ok(None), - Some(pid) => pid, - }; - let slot_start = pid.as_usize() * 2; - let slot_end = slot_start + 1; - Ok(Some((pid, slots[slot_end].unwrap().get()))) + empty::skip_splits_fwd(input, hm, hm.offset(), |input| { + Ok(self + .search_imp(cache, input, slots)? + .map(|hm| (hm, hm.offset()))) }) } @@ -1364,7 +1356,7 @@ impl BoundedBacktracker { cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], - ) -> Result, MatchError> { + ) -> Result, MatchError> { // Unlike in the PikeVM, we write our capturing group spans directly // into the caller's captures groups. So we have to make sure we're // starting with a blank slate first. In the PikeVM, we avoid this @@ -1411,10 +1403,9 @@ impl BoundedBacktracker { Some(ref span) => at = span.start, } } - if let Some(pid) = - self.backtrack(cache, input, at, start_id, slots) + if let Some(hm) = self.backtrack(cache, input, at, start_id, slots) { - return Ok(Some(pid)); + return Ok(Some(hm)); } at += 1; } @@ -1435,14 +1426,13 @@ impl BoundedBacktracker { at: usize, start_id: StateID, slots: &mut [Option], - ) -> Option { + ) -> Option { cache.stack.push(Frame::Step { sid: start_id, at }); while let Some(frame) = cache.stack.pop() { match frame { Frame::Step { sid, at } => { - if let Some(pid) = self.step(cache, input, sid, at, slots) - { - return Some(pid); + if let Some(hm) = self.step(cache, input, sid, at, slots) { + return Some(hm); } } Frame::RestoreCapture { slot, offset } => { @@ -1472,7 +1462,7 @@ impl BoundedBacktracker { mut sid: StateID, mut at: usize, slots: &mut [Option], - ) -> Option { + ) -> Option { loop { if !cache.visited.insert(sid, at - input.start()) { return None; @@ -1555,7 +1545,7 @@ impl BoundedBacktracker { } State::Fail => return None, State::Match { 
pattern_id } => { - return Some(pattern_id); + return Some(HalfMatch::new(pattern_id, at)); } } } diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 2021d93ea..065e9ef27 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -30,7 +30,7 @@ pub struct Config { reverse: Option, nfa_size_limit: Option>, shrink: Option, - captures: Option, + which_captures: Option, look_matcher: Option, #[cfg(test)] unanchored_prefix: Option, @@ -178,12 +178,15 @@ impl Config { /// ``` /// use regex_automata::{ /// dfa::{self, Automaton}, - /// nfa::thompson::NFA, + /// nfa::thompson::{NFA, WhichCaptures}, /// HalfMatch, Input, /// }; /// /// let dfa = dfa::dense::Builder::new() - /// .thompson(NFA::config().captures(false).reverse(true)) + /// .thompson(NFA::config() + /// .which_captures(WhichCaptures::None) + /// .reverse(true) + /// ) /// .build("baz[0-9]+")?; /// let expected = Some(HalfMatch::must(0, 3)); /// assert_eq!( @@ -277,10 +280,12 @@ impl Config { /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long - /// use regex_automata::nfa::thompson::NFA; + /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; /// /// // Currently we have to disable captures when enabling reverse NFA. - /// let config = NFA::config().captures(false).reverse(true); + /// let config = NFA::config() + /// .which_captures(WhichCaptures::None) + /// .reverse(true); /// let not_shrunk = NFA::compiler() /// .configure(config.clone().shrink(false)) /// .build(r"\w")?; @@ -311,21 +316,99 @@ impl Config { /// # Example /// /// This example demonstrates that some regex engines, like the Pike VM, - /// require capturing groups to be present in the NFA. Building a Pike VM - /// with an NFA without capturing groups will result in an error. + /// require capturing states to be present in the NFA to report match + /// offsets. 
+ /// + /// (Note that since this method is deprecated, the example below uses + /// [`Config::which_captures`] to disable capture states.) /// /// ``` - /// use regex_automata::nfa::thompson::{pikevm::PikeVM, NFA}; + /// use regex_automata::nfa::thompson::{ + /// pikevm::PikeVM, + /// NFA, + /// WhichCaptures, + /// }; /// - /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// let re = PikeVM::builder() + /// .thompson(NFA::config().which_captures(WhichCaptures::None)) + /// .build(r"[a-z]+")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, "abc")); + /// assert_eq!(None, re.find(&mut cache, "abc")); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[deprecated(since = "0.3.5", note = "use which_captures instead")] + pub fn captures(self, yes: bool) -> Config { + self.which_captures(if yes { + WhichCaptures::All + } else { + WhichCaptures::None + }) + } + + /// Configures what kinds of capture groups are compiled into + /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a + /// Thompson NFA. + /// + /// Currently, using any option except for [`WhichCaptures::None`] requires + /// disabling the [`reverse`](Config::reverse) setting. If both are + /// enabled, then the compiler will return an error. It is expected that + /// this limitation will be lifted in the future. + /// + /// This is set to [`WhichCaptures::All`] by default. Callers may wish to + /// use [`WhichCaptures::Implicit`] in cases where one wants to avoid the + /// overhead of capture states for explicit groups. Usually this occurs + /// when one wants to use the `PikeVM` only for determining the overall + /// match. Otherwise, the `PikeVM` could use much more memory than is + /// necessary. + /// + /// # Example + /// + /// This example demonstrates that some regex engines, like the Pike VM, + /// require capturing states to be present in the NFA to report match + /// offsets. 
+ /// + /// ``` + /// use regex_automata::nfa::thompson::{ + /// pikevm::PikeVM, + /// NFA, + /// WhichCaptures, + /// }; + /// + /// let re = PikeVM::builder() + /// .thompson(NFA::config().which_captures(WhichCaptures::None)) + /// .build(r"[a-z]+")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, "abc")); + /// assert_eq!(None, re.find(&mut cache, "abc")); + /// + /// # Ok::<(), Box>(()) + /// ``` + /// + /// The same applies to the bounded backtracker: + /// + /// ``` + /// use regex_automata::nfa::thompson::{ + /// backtrack::BoundedBacktracker, + /// NFA, + /// WhichCaptures, + /// }; + /// + /// let re = BoundedBacktracker::builder() + /// .thompson(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"[a-z]+")?; - /// assert!(PikeVM::new_from_nfa(nfa).is_err()); + /// let mut cache = re.create_cache(); + /// + /// assert!(re.try_is_match(&mut cache, "abc")?); + /// assert_eq!(None, re.try_find(&mut cache, "abc")?); /// /// # Ok::<(), Box>(()) /// ``` - pub fn captures(mut self, yes: bool) -> Config { - self.captures = Some(yes); + pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config { + self.which_captures = Some(which_captures); self } @@ -405,8 +488,14 @@ impl Config { } /// Return whether NFA compilation is configured to produce capture states. + #[deprecated(since = "0.3.5", note = "use get_which_captures instead")] pub fn get_captures(&self) -> bool { - self.captures.unwrap_or(true) + self.get_which_captures().is_any() + } + + /// Return what kinds of capture states will be compiled into an NFA. + pub fn get_which_captures(&self) -> WhichCaptures { + self.which_captures.unwrap_or(WhichCaptures::All) } /// Return the look-around matcher for this NFA. 
@@ -439,7 +528,7 @@ impl Config { reverse: o.reverse.or(self.reverse), nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit), shrink: o.shrink.or(self.shrink), - captures: o.captures.or(self.captures), + which_captures: o.which_captures.or(self.which_captures), look_matcher: o.look_matcher.or_else(|| self.look_matcher.clone()), #[cfg(test)] unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix), @@ -447,6 +536,57 @@ impl Config { } } +/// A configuration indicating which kinds of +/// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include. +/// +/// This configuration can be used with [`Config::which_captures`] to control +/// which capture states are compiled into a Thompson NFA. +/// +/// The default configuration is [`WhichCaptures::All`]. +#[derive(Clone, Copy, Debug)] +pub enum WhichCaptures { + /// All capture states, including those corresponding to both implicit and + /// explicit capture groups, are included in the Thompson NFA. + All, + /// Only capture states corresponding to implicit capture groups are + /// included. Implicit capture groups appear in every pattern implicitly + /// and correspond to the overall match of a pattern. + /// + /// This is useful when one only cares about the overall match of a + /// pattern. By excluding capture states from explicit capture groups, + /// one might be able to reduce the memory usage of a multi-pattern regex + /// substantially if it was otherwise written to have many explicit capture + /// groups. + Implicit, + /// No capture states are compiled into the Thompson NFA. + /// + /// This is useful when capture states are either not needed (for example, + /// if one is only trying to build a DFA) or if they aren't supported (for + /// example, a reverse NFA). 
+ None, +} + +impl Default for WhichCaptures { + fn default() -> WhichCaptures { + WhichCaptures::All + } +} + +impl WhichCaptures { + /// Returns true if this configuration indicates that no capture states + /// should be produced in an NFA. + pub fn is_none(&self) -> bool { + matches!(*self, WhichCaptures::None) + } + + /// Returns true if this configuration indicates that some capture states + /// should be added to an NFA. Note that this might only include capture + /// states for implicit capture groups. + pub fn is_any(&self) -> bool { + !self.is_none() + } +} + /* This compiler below uses Thompson's construction algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph @@ -800,7 +940,9 @@ impl Compiler { if exprs.len() > PatternID::LIMIT { return Err(BuildError::too_many_patterns(exprs.len())); } - if self.config.get_reverse() && self.config.get_captures() { + if self.config.get_reverse() + && self.config.get_which_captures().is_any() + { return Err(BuildError::unsupported_captures()); } @@ -978,8 +1120,13 @@ impl Compiler { name: Option<&str>, expr: &Hir, ) -> Result { - if !self.config.get_captures() { - return self.c(expr); + match self.config.get_which_captures() { + // No capture states means we always skip them. + WhichCaptures::None => return self.c(expr), + // Implicit capture states means we only add when index==0 since + // index==0 implies the group is implicit. 
+ WhichCaptures::Implicit if index > 0 => return self.c(expr), + _ => {} } let start = self.add_capture_start(index, name)?; @@ -1725,12 +1872,18 @@ mod tests { use crate::{ nfa::thompson::{SparseTransitions, State, Transition, NFA}, - util::primitives::{PatternID, StateID}, + util::primitives::{PatternID, SmallIndex, StateID}, }; + use super::*; + fn build(pattern: &str) -> NFA { NFA::compiler() - .configure(NFA::config().captures(false).unanchored_prefix(false)) + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false), + ) .build(pattern) .unwrap() } @@ -1781,6 +1934,15 @@ mod tests { } } + fn s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State { + State::Capture { + next: sid(next), + pattern_id: pid(pattern), + group_index: SmallIndex::new(index).unwrap(), + slot: SmallIndex::new(slot).unwrap(), + } + } + fn s_fail() -> State { State::Fail } @@ -1794,7 +1956,7 @@ mod tests { #[test] fn compile_unanchored_prefix() { let nfa = NFA::compiler() - .configure(NFA::config().captures(false)) + .configure(NFA::config().which_captures(WhichCaptures::None)) .build(r"a") .unwrap(); assert_eq!( @@ -1827,7 +1989,11 @@ mod tests { // Check that non-UTF-8 literals work. 
let nfa = NFA::compiler() - .configure(NFA::config().captures(false).unanchored_prefix(false)) + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false), + ) .syntax(crate::util::syntax::Config::new().utf8(false)) .build(r"(?-u)\xFF") .unwrap(); @@ -1937,7 +2103,7 @@ mod tests { let nfa = NFA::compiler() .configure( NFA::config() - .captures(false) + .which_captures(WhichCaptures::None) .reverse(true) .shrink(false) .unanchored_prefix(false), @@ -1965,7 +2131,11 @@ mod tests { #[test] fn compile_many_start_pattern() { let nfa = NFA::compiler() - .configure(NFA::config().captures(false).unanchored_prefix(false)) + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false), + ) .build_many(&["a", "b"]) .unwrap(); assert_eq!( @@ -1993,7 +2163,9 @@ mod tests { use regex_syntax::hir::{Class, ClassBytes, Hir}; let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![]))); - let config = NFA::config().captures(false).unanchored_prefix(false); + let config = NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false); let nfa = NFA::compiler().configure(config).build_from_hir(&hir).unwrap(); assert_eq!(nfa.states(), &[s_fail(), s_match(0)]); @@ -2005,9 +2177,81 @@ mod tests { use regex_syntax::hir::{Class, ClassUnicode, Hir}; let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![]))); - let config = NFA::config().captures(false).unanchored_prefix(false); + let config = NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false); let nfa = NFA::compiler().configure(config).build_from_hir(&hir).unwrap(); assert_eq!(nfa.states(), &[s_fail(), s_match(0)]); } + + #[test] + fn compile_captures_all() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .unanchored_prefix(false) + .which_captures(WhichCaptures::All), + ) + .build("a(b)c") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_cap(1, 0, 0, 0), + s_byte(b'a', 2), + s_cap(3, 0, 1, 2), + 
s_byte(b'b', 4), + s_cap(5, 0, 1, 3), + s_byte(b'c', 6), + s_cap(7, 0, 0, 1), + s_match(0) + ] + ); + let ginfo = nfa.group_info(); + assert_eq!(2, ginfo.all_group_len()); + } + + #[test] + fn compile_captures_implicit() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .unanchored_prefix(false) + .which_captures(WhichCaptures::Implicit), + ) + .build("a(b)c") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_cap(1, 0, 0, 0), + s_byte(b'a', 2), + s_byte(b'b', 3), + s_byte(b'c', 4), + s_cap(5, 0, 0, 1), + s_match(0) + ] + ); + let ginfo = nfa.group_info(); + assert_eq!(1, ginfo.all_group_len()); + } + + #[test] + fn compile_captures_none() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .unanchored_prefix(false) + .which_captures(WhichCaptures::None), + ) + .build("a(b)c") + .unwrap(); + assert_eq!( + nfa.states(), + &[s_byte(b'a', 1), s_byte(b'b', 2), s_byte(b'c', 3), s_match(0)] + ); + let ginfo = nfa.group_info(); + assert_eq!(0, ginfo.all_group_len()); + } } diff --git a/regex-automata/src/nfa/thompson/error.rs b/regex-automata/src/nfa/thompson/error.rs index 82648813b..3c2fa8a21 100644 --- a/regex-automata/src/nfa/thompson/error.rs +++ b/regex-automata/src/nfa/thompson/error.rs @@ -68,9 +68,6 @@ enum BuildErrorKind { /// The invalid index that was given. index: u32, }, - /// An error that occurs when one tries to build an NFA simulation (such as - /// the PikeVM) without any capturing groups. - MissingCaptures, /// An error that occurs when one tries to build a reverse NFA with /// captures enabled. Currently, this isn't supported, but we probably /// should support it at some point. 
@@ -126,10 +123,6 @@ impl BuildError { BuildError { kind: BuildErrorKind::InvalidCaptureIndex { index } } } - pub(crate) fn missing_captures() -> BuildError { - BuildError { kind: BuildErrorKind::MissingCaptures } - } - #[cfg(feature = "syntax")] pub(crate) fn unsupported_captures() -> BuildError { BuildError { kind: BuildErrorKind::UnsupportedCaptures } @@ -181,11 +174,6 @@ impl core::fmt::Display for BuildError { "capture group index {} is invalid (too big or discontinuous)", index, ), - BuildErrorKind::MissingCaptures => write!( - f, - "operation requires the NFA to have capturing groups, \ - but the NFA given contains none", - ), #[cfg(feature = "syntax")] BuildErrorKind::UnsupportedCaptures => write!( f, diff --git a/regex-automata/src/nfa/thompson/mod.rs b/regex-automata/src/nfa/thompson/mod.rs index 3581d738c..cf426736d 100644 --- a/regex-automata/src/nfa/thompson/mod.rs +++ b/regex-automata/src/nfa/thompson/mod.rs @@ -78,4 +78,4 @@ pub use self::{ }, }; #[cfg(feature = "syntax")] -pub use compiler::{Compiler, Config}; +pub use compiler::{Compiler, Config, WhichCaptures}; diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 86131406c..2108fa338 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -453,10 +453,10 @@ impl NFA { /// predict the anchored starting state. /// /// ``` - /// use regex_automata::nfa::thompson::{NFA, State}; + /// use regex_automata::nfa::thompson::{NFA, State, WhichCaptures}; /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build("a")?; /// let state = nfa.state(nfa.start_anchored()); /// match *state { @@ -711,7 +711,7 @@ impl NFA { /// or not. /// /// ``` - /// use regex_automata::nfa::thompson::NFA; + /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; /// /// // Obviously has capture states. 
/// let nfa = NFA::new("(a)")?; @@ -733,7 +733,7 @@ impl NFA { /// // Notice that 'has_capture' is false here even when we have an /// // explicit capture group in the pattern. /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build("(a)")?; /// assert!(!nfa.has_capture()); /// diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 79ce3c60d..0128c151a 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -17,7 +17,9 @@ use crate::{ empty, iter, prefilter::Prefilter, primitives::{NonMaxUsize, PatternID, SmallIndex, StateID}, - search::{Anchored, Input, Match, MatchKind, PatternSet, Span}, + search::{ + Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span, + }, sparse_set::SparseSet, }, }; @@ -275,15 +277,6 @@ impl Builder { /// construction of the NFA itself will of course be ignored, since the NFA /// given here is already built. pub fn build_from_nfa(&self, nfa: NFA) -> Result { - // If the NFA has no captures, then the PikeVM doesn't work since it - // relies on them in order to report match locations. However, in - // the special case of an NFA with no patterns, it is allowed, since - // no matches can ever be produced. And importantly, an NFA with no - // patterns has no capturing groups anyway, so this is necessary to - // permit the PikeVM to work with regexes with zero patterns. 
- if !nfa.has_capture() && nfa.pattern_len() > 0 { - return Err(BuildError::missing_captures()); - } nfa.look_set_any().available().map_err(BuildError::word)?; Ok(PikeVM { config: self.config.clone(), nfa }) } @@ -828,16 +821,16 @@ impl PikeVM { if self.get_nfa().pattern_len() == 1 { let mut slots = [None, None]; let pid = self.search_slots(cache, &input, &mut slots)?; - let start = slots[0].unwrap().get(); - let end = slots[1].unwrap().get(); + let start = slots[0]?.get(); + let end = slots[1]?.get(); return Some(Match::new(pid, Span { start, end })); } let ginfo = self.get_nfa().group_info(); let slots_len = ginfo.implicit_slot_len(); let mut slots = vec![None; slots_len]; let pid = self.search_slots(cache, &input, &mut slots)?; - let start = slots[pid.as_usize() * 2].unwrap().get(); - let end = slots[pid.as_usize() * 2 + 1].unwrap().get(); + let start = slots[pid.as_usize() * 2]?.get(); + let end = slots[pid.as_usize() * 2 + 1]?.get(); Some(Match::new(pid, Span { start, end })) } @@ -1103,7 +1096,8 @@ impl PikeVM { ) -> Option { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); if !utf8empty { - return self.search_slots_imp(cache, input, slots); + let hm = self.search_slots_imp(cache, input, slots)?; + return Some(hm.pattern()); } // There is an unfortunate special case where if the regex can // match the empty string and UTF-8 mode is enabled, the search @@ -1118,22 +1112,23 @@ impl PikeVM { // this case. let min = self.get_nfa().group_info().implicit_slot_len(); if slots.len() >= min { - return self.search_slots_imp(cache, input, slots); + let hm = self.search_slots_imp(cache, input, slots)?; + return Some(hm.pattern()); } if self.get_nfa().pattern_len() == 1 { let mut enough = [None, None]; let got = self.search_slots_imp(cache, input, &mut enough); - // This is OK because we know `enough_slots` is strictly bigger - // than `slots`, otherwise this special case isn't reached. 
+ // This is OK because we know `enough` is strictly bigger than + // `slots`, otherwise this special case isn't reached. slots.copy_from_slice(&enough[..slots.len()]); - return got; + return got.map(|hm| hm.pattern()); } let mut enough = vec![None; min]; let got = self.search_slots_imp(cache, input, &mut enough); - // This is OK because we know `enough_slots` is strictly bigger than - // `slots`, otherwise this special case isn't reached. + // This is OK because we know `enough` is strictly bigger than `slots`, + // otherwise this special case isn't reached. slots.copy_from_slice(&enough[..slots.len()]); - got + got.map(|hm| hm.pattern()) } /// This is the actual implementation of `search_slots_imp` that @@ -1146,30 +1141,17 @@ impl PikeVM { cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], - ) -> Option { + ) -> Option { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); - let (pid, end) = match self.search_imp(cache, input, slots) { + let hm = match self.search_imp(cache, input, slots) { None => return None, - Some(pid) if !utf8empty => return Some(pid), - Some(pid) => { - let slot_start = pid.as_usize() * 2; - let slot_end = slot_start + 1; - // OK because we know we have a match and we know our caller - // provided slots are big enough (which we make true above if - // the caller didn't). Namely, we're only here when 'utf8empty' - // is true, and when that's true, we require slots for every - // pattern. 
- (pid, slots[slot_end].unwrap().get()) - } + Some(hm) if !utf8empty => return Some(hm), + Some(hm) => hm, }; - empty::skip_splits_fwd(input, pid, end, |input| { - let pid = match self.search_imp(cache, input, slots) { - None => return Ok(None), - Some(pid) => pid, - }; - let slot_start = pid.as_usize() * 2; - let slot_end = slot_start + 1; - Ok(Some((pid, slots[slot_end].unwrap().get()))) + empty::skip_splits_fwd(input, hm, hm.offset(), |input| { + Ok(self + .search_imp(cache, input, slots) + .map(|hm| (hm, hm.offset()))) }) // OK because the PikeVM never errors. .unwrap() @@ -1244,7 +1226,7 @@ impl PikeVM { cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], - ) -> Option { + ) -> Option { cache.setup_search(slots.len()); if input.is_done() { return None; @@ -1273,7 +1255,7 @@ impl PikeVM { let pre = if anchored { None } else { self.get_config().get_prefilter() }; let Cache { ref mut stack, ref mut curr, ref mut next } = cache; - let mut pid = None; + let mut hm = None; // Yes, our search doesn't end at input.end(), but includes it. This // is necessary because matches are delayed by one byte, just like // how the DFA engines work. The delay is used to handle look-behind @@ -1292,7 +1274,7 @@ impl PikeVM { if curr.set.is_empty() { // We have a match and we haven't been instructed to continue // on even after finding a match, so we can quit. - if pid.is_some() && !allmatches { + if hm.is_some() && !allmatches { break; } // If we're running an anchored search and we've advanced @@ -1362,7 +1344,7 @@ impl PikeVM { // search. If we re-computed it at every position, we would be // simulating an unanchored search when we were tasked to perform // an anchored search. 
- if (!pid.is_some() || allmatches) + if (!hm.is_some() || allmatches) && (!anchored || at == input.start()) { // Since we are adding to the 'curr' active states and since @@ -1381,14 +1363,15 @@ impl PikeVM { let slots = next.slot_table.all_absent(); self.epsilon_closure(stack, slots, curr, input, at, start_id); } - if let Some(x) = self.nexts(stack, curr, next, input, at, slots) { - pid = Some(x); + if let Some(pid) = self.nexts(stack, curr, next, input, at, slots) + { + hm = Some(HalfMatch::new(pid, at)); } // Unless the caller asked us to return early, we need to mush on // to see if we can extend our match. (But note that 'nexts' will // quit right after seeing a match when match_kind==LeftmostFirst, // as is consistent with leftmost-first match priority.) - if input.get_earliest() && pid.is_some() { + if input.get_earliest() && hm.is_some() { break; } core::mem::swap(curr, next); @@ -1396,7 +1379,7 @@ impl PikeVM { at += 1; } instrument!(|c| c.eprint(&self.nfa)); - pid + hm } /// The implementation for the 'which_overlapping_matches' API. Basically, @@ -2108,15 +2091,16 @@ impl SlotTable { // if a 'Captures' has fewer slots, e.g., none at all or only slots // for tracking the overall match instead of all slots for every // group. - self.slots_for_captures = nfa.group_info().slot_len(); + self.slots_for_captures = core::cmp::max( + self.slots_per_state, + nfa.pattern_len().checked_mul(2).unwrap(), + ); let len = nfa .states() .len() - // We add 1 so that our last row is always empty. We use it as - // "scratch" space for computing the epsilon closure off of the - // starting state. - .checked_add(1) - .and_then(|x| x.checked_mul(self.slots_per_state)) + .checked_mul(self.slots_per_state) + // Add space to account for scratch space used during a search. + .and_then(|x| x.checked_add(self.slots_for_captures)) // It seems like this could actually panic on legitimate inputs on // 32-bit targets, and very likely to panic on 16-bit. 
Should we // somehow convert this to an error? What about something similar @@ -2170,7 +2154,7 @@ impl SlotTable { /// compute an epsilon closure outside of the user supplied regex, and thus /// never want it to have any capturing slots set. fn all_absent(&mut self) -> &mut [Option] { - let i = self.table.len() - self.slots_per_state; + let i = self.table.len() - self.slots_for_captures; &mut self.table[i..i + self.slots_for_captures] } } diff --git a/regex-automata/src/util/captures.rs b/regex-automata/src/util/captures.rs index c6517348d..cd3a5f8f7 100644 --- a/regex-automata/src/util/captures.rs +++ b/regex-automata/src/util/captures.rs @@ -1810,10 +1810,10 @@ impl GroupInfo { /// panic even if captures aren't enabled on this NFA: /// /// ``` - /// use regex_automata::nfa::thompson::NFA; + /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build_many(&[ /// r"(?Pa)", /// r"a", @@ -1958,7 +1958,7 @@ impl GroupInfo { /// for different patterns and NFA configurations. /// /// ``` - /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// use regex_automata::{nfa::thompson::{NFA, WhichCaptures}, PatternID}; /// /// let nfa = NFA::new(r"(a)(b)(c)")?; /// // There are 3 explicit groups in the pattern's concrete syntax and @@ -1970,13 +1970,13 @@ impl GroupInfo { /// assert_eq!(1, nfa.group_info().group_len(PatternID::ZERO)); /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"abc")?; /// // We disabled capturing groups, so there are none. 
/// assert_eq!(0, nfa.group_info().group_len(PatternID::ZERO)); /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"(a)(b)(c)")?; /// // We disabled capturing groups, so there are none, even if there are /// // explicit groups in the concrete syntax. @@ -2000,7 +2000,7 @@ impl GroupInfo { /// for different patterns and NFA configurations. /// /// ``` - /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// use regex_automata::{nfa::thompson::{NFA, WhichCaptures}, PatternID}; /// /// let nfa = NFA::new(r"(a)(b)(c)")?; /// // There are 3 explicit groups in the pattern's concrete syntax and @@ -2017,13 +2017,13 @@ impl GroupInfo { /// assert_eq!(5, nfa.group_info().all_group_len()); /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"abc")?; /// // We disabled capturing groups, so there are none. /// assert_eq!(0, nfa.group_info().all_group_len()); /// /// let nfa = NFA::compiler() - /// .configure(NFA::config().captures(false)) + /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build(r"(a)(b)(c)")?; /// // We disabled capturing groups, so there are none, even if there are /// // explicit groups in the concrete syntax. 
diff --git a/src/builders.rs b/src/builders.rs index d19a0ffe2..46c4824c5 100644 --- a/src/builders.rs +++ b/src/builders.rs @@ -28,7 +28,9 @@ use alloc::{ vec::Vec, }; -use regex_automata::{meta, util::syntax, MatchKind}; +use regex_automata::{ + meta, nfa::thompson::WhichCaptures, util::syntax, MatchKind, +}; use crate::error::Error; @@ -100,8 +102,12 @@ impl Builder { } fn build_many_string(&self) -> Result { - let metac = - self.metac.clone().match_kind(MatchKind::All).utf8_empty(true); + let metac = self + .metac + .clone() + .match_kind(MatchKind::All) + .utf8_empty(true) + .which_captures(WhichCaptures::None); let syntaxc = self.syntaxc.clone().utf8(true); let patterns = Arc::from(self.pats.as_slice()); meta::Builder::new() @@ -113,8 +119,12 @@ impl Builder { } fn build_many_bytes(&self) -> Result { - let metac = - self.metac.clone().match_kind(MatchKind::All).utf8_empty(false); + let metac = self + .metac + .clone() + .match_kind(MatchKind::All) + .utf8_empty(false) + .which_captures(WhichCaptures::None); let syntaxc = self.syntaxc.clone().utf8(false); let patterns = Arc::from(self.pats.as_slice()); meta::Builder::new()