Skip to content

Commit 6b66a15

Browse files
committed
Store look-behind offsets separately
1 parent bac64f4 commit 6b66a15

File tree

4 files changed

+241
-162
lines changed

4 files changed

+241
-162
lines changed

regex-automata/src/nfa/thompson/builder.rs

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use alloc::{sync::Arc, vec, vec::Vec};
55
use crate::{
66
nfa::thompson::{
77
error::BuildError,
8-
nfa::{self, LookBehindInfo, SparseTransitions, Transition, NFA},
8+
nfa::{self, LookBehindTree, SparseTransitions, Transition, NFA},
99
},
1010
util::{
1111
look::{Look, LookMatcher},
@@ -340,11 +340,9 @@ pub struct Builder {
340340
/// contains a single regex, then `start_pattern[0]` and `start_anchored`
341341
/// are always equivalent.
342342
start_pattern: Vec<StateID>,
343-
/// A vector of meta-data information about each look-behind in this NFA.
344-
///
345-
/// Must be stored in a depth-first pre-order with regards to the nesting
346-
/// of look-behinds.
347-
lookbehinds: Vec<LookBehindInfo>,
343+
/// A vector of look-behinds appearing in the regex. Order reflects the
344+
/// order in the regex.
345+
lookbehinds: Vec<LookBehindTree>,
348346
/// A map from pattern ID to capture group index to name. (If no name
349347
/// exists, then a None entry is present. Thus, all capturing groups are
350348
/// present in this mapping.)
@@ -719,14 +717,21 @@ impl Builder {
719717
/// starts.
720718
///
721719
/// Look-behinds must be started in a depth-first pre-order fashion with
722-
/// regards to the nesting of look-behinds.
720+
/// regards to the nesting of look-behinds. The nesting path is stored
721+
/// as indices in `path`.
723722
pub fn start_lookbehind(
724723
&mut self,
725724
start_id: StateID,
726725
offset_from_start: Option<usize>,
726+
path: &[usize],
727727
) {
728-
self.lookbehinds
729-
.push(LookBehindInfo::new(start_id, offset_from_start));
728+
let mut current = &mut self.lookbehinds;
729+
730+
for index in path {
731+
current = current[*index].children_mut();
732+
}
733+
734+
current.push(LookBehindTree::new(start_id, offset_from_start));
730735
}
731736

732737
/// Add an "empty" NFA state.

regex-automata/src/nfa/thompson/compiler.rs

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -718,6 +718,8 @@ pub struct Compiler {
718718
/// the current look-behind expression. When `None`, the distance can be
719719
/// seen as infinity.
720720
current_lookbehind_offset_from_start: RefCell<Option<usize>>,
721+
/// The current path of look-behind nesting.
722+
lookbehind_nesting_path: RefCell<Vec<usize>>,
721723
}
722724

723725
impl Compiler {
@@ -732,6 +734,7 @@ impl Compiler {
732734
utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
733735
lookaround_index: RefCell::new(SmallIndex::ZERO),
734736
current_lookbehind_offset_from_start: RefCell::new(Some(0)),
737+
lookbehind_nesting_path: RefCell::new(vec![0]),
735738
}
736739
}
737740

@@ -972,6 +975,8 @@ impl Compiler {
972975
.borrow_mut()
973976
.set_size_limit(self.config.get_nfa_size_limit())?;
974977
*self.lookaround_index.borrow_mut() = SmallIndex::ZERO;
978+
*self.lookbehind_nesting_path.borrow_mut() = vec![0];
979+
*self.current_lookbehind_offset_from_start.borrow_mut() = Some(0);
975980

976981
// We always add an unanchored prefix unless we were specifically told
977982
// not to (for tests only), or if we know that the regex is anchored
@@ -1059,15 +1064,22 @@ impl Compiler {
10591064

10601065
let unanchored =
10611066
self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?;
1062-
self.builder
1063-
.borrow_mut()
1064-
.start_lookbehind(unanchored.start, start_offset);
1067+
self.builder.borrow_mut().start_lookbehind(
1068+
unanchored.start,
1069+
start_offset,
1070+
self.lookbehind_nesting_path.borrow().split_last().unwrap().1,
1071+
);
10651072

10661073
// When compiling the subexpression we temporarily change the starting
10671074
// offset and restore it after. This way, the subexpression is relativized
1068-
// to our current offset.
1075+
// to our current offset. We also update the path to the current lookbehind
1076+
// expression.
1077+
self.lookbehind_nesting_path.borrow_mut().push(0);
10691078
*self.current_lookbehind_offset_from_start.borrow_mut() = start_offset;
10701079
let sub = self.c(lookaround.sub())?;
1080+
let mut path = self.lookbehind_nesting_path.borrow_mut();
1081+
path.pop();
1082+
*path.last_mut().unwrap() += 1;
10711083
*self.current_lookbehind_offset_from_start.borrow_mut() =
10721084
relative_start;
10731085

regex-automata/src/nfa/thompson/nfa.rs

Lines changed: 70 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1108,7 +1108,7 @@ impl NFA {
11081108

11091109
/// Returns the starting states for initializing look-behind evaluation.
11101110
#[inline]
1111-
pub fn lookbehinds(&self) -> &Vec<LookBehindInfo> {
1111+
pub fn lookbehinds(&self) -> &[LookBehindTree] {
11121112
&self.0.lookbehinds
11131113
}
11141114

@@ -1277,46 +1277,83 @@ pub(super) struct Inner {
12771277
/// This is needed to initialize the table for storing the result of
12781278
/// look-around evaluation.
12791279
lookaround_count: usize,
1280-
/// A vector of meta-data information about each look-behind in this NFA.
1281-
///
1282-
/// Must be stored in a depth-first pre-order with regards to the nesting
1283-
/// of look-behinds.
1284-
lookbehinds: Vec<LookBehindInfo>,
1280+
/// A vector of look-behinds appearing in the regex. Order reflects the
1281+
/// order in the regex.
1282+
lookbehinds: Vec<LookBehindTree>,
12851283
/// Heap memory used indirectly by NFA states and other things (like the
12861284
/// various capturing group representations above). Since each state
12871285
/// might use a different amount of heap, we need to keep track of this
12881286
/// incrementally.
12891287
memory_extra: usize,
12901288
}
12911289

1292-
/// Information about a look-behind needed for execution.
1293-
#[derive(Clone, Copy, Debug)]
1294-
pub struct LookBehindInfo {
1295-
/// The id of the start state of the look-behind subexpression.
1290+
/// Information about a look-behind needed for execution. It preserves the
1291+
/// nesting structure of look-behinds.
1292+
#[derive(Clone, Debug)]
1293+
pub struct LookBehindTree {
12961294
start_id: StateID,
1297-
/// The offset (in bytes) from the beginning of the main regex that a
1298-
/// look-behind starts at. If `None`, the offset is unbounded.
12991295
offset_from_start: Option<usize>,
1296+
children: Vec<LookBehindTree>,
13001297
}
13011298

1302-
impl LookBehindInfo {
1303-
pub(super) fn new(
1304-
start_id: StateID,
1305-
offset_from_start: Option<usize>,
1306-
) -> Self {
1307-
Self { start_id, offset_from_start }
1299+
impl LookBehindTree {
1300+
pub fn new(start_id: StateID, offset_from_start: Option<usize>) -> Self {
1301+
Self { start_id, offset_from_start, children: Vec::new() }
13081302
}
13091303

1310-
/// Start states of the look-behind subexpression.
1311-
pub(super) fn start_state(&self) -> StateID {
1304+
/// The id of the start state of the look-behind subexpression.
1305+
pub fn start_id(&self) -> StateID {
13121306
self.start_id
13131307
}
13141308

13151309
/// The offset (in bytes) from the beginning of the main regex that a
13161310
/// look-behind starts at. If `None`, the offset is unbounded.
1317-
pub(super) fn offset_from_start(&self) -> Option<usize> {
1311+
pub fn offset_from_start(&self) -> Option<usize> {
13181312
self.offset_from_start
13191313
}
1314+
1315+
/// The look-behinds this look-behind contains. Order reflects the order
1316+
/// in the regex.
1317+
pub fn children(&self) -> &[LookBehindTree] {
1318+
&self.children
1319+
}
1320+
1321+
/// Calls `fun` on this look-behind tree and all of its descendants in pre-order.
1322+
/// `fun` should return `true` if the traversal should continue and `false`
1323+
/// if it should stop.
1324+
///
1325+
/// Returns `true` if the traversal ran to completion and `false` if it was stopped at any point.
1326+
pub fn preorder(&self, fun: &impl Fn(&LookBehindTree) -> bool) -> bool {
1327+
if !fun(self) {
1328+
return false;
1329+
}
1330+
for child in &self.children {
1331+
if !child.preorder(fun) {
1332+
return false;
1333+
}
1334+
}
1335+
true
1336+
}
1337+
1338+
/// Like [`Self::preorder`], but allows mutating the nodes.
1339+
pub fn preorder_mut(
1340+
&mut self,
1341+
fun: &impl Fn(&mut LookBehindTree) -> bool,
1342+
) -> bool {
1343+
if !fun(self) {
1344+
return false;
1345+
}
1346+
for child in &mut self.children {
1347+
if !child.preorder_mut(fun) {
1348+
return false;
1349+
}
1350+
}
1351+
true
1352+
}
1353+
1354+
pub fn children_mut(&mut self) -> &mut Vec<LookBehindTree> {
1355+
&mut self.children
1356+
}
13201357
}
13211358

13221359
impl Inner {
@@ -1465,7 +1502,7 @@ impl Inner {
14651502
///
14661503
/// The slice must be in a depth-first pre-order with regards to the
14671504
/// nesting of look-behinds.
1468-
pub(super) fn set_lookbehinds(&mut self, lookbehinds: &[LookBehindInfo]) {
1505+
pub(super) fn set_lookbehinds(&mut self, lookbehinds: &[LookBehindTree]) {
14691506
self.lookbehinds = lookbehinds.to_vec();
14701507
}
14711508

@@ -1522,9 +1559,12 @@ impl Inner {
15221559
for id in self.start_pattern.iter_mut() {
15231560
*id = old_to_new[*id];
15241561
}
1525-
for LookBehindInfo { start_id: id, .. } in self.lookbehinds.iter_mut()
1526-
{
1527-
*id = old_to_new[*id];
1562+
1563+
for lbs in self.lookbehinds.iter_mut() {
1564+
lbs.preorder_mut(&|e| {
1565+
e.start_id = old_to_new[e.start_id];
1566+
true
1567+
});
15281568
}
15291569
}
15301570
}
@@ -1537,7 +1577,11 @@ impl fmt::Debug for Inner {
15371577
'^'
15381578
} else if sid == self.start_unanchored {
15391579
'>'
1540-
} else if self.lookbehinds.iter().any(|i| i.start_state() == sid) {
1580+
} else if self
1581+
.lookbehinds
1582+
.iter()
1583+
.any(|i| !i.preorder(&|e| e.start_id() != sid))
1584+
{
15411585
'<'
15421586
} else {
15431587
' '

0 commit comments

Comments
 (0)