From 1a7304b8a42a37366946de00f449b1b6a041ebee Mon Sep 17 00:00:00 2001 From: Travis Cross Date: Thu, 15 May 2025 15:00:56 +0000 Subject: [PATCH] Parse grammar without regexes We'd been parsing the grammar with a combination of recursive descent and regular expression matchers. This combination has its merits, and it's done tastefully here, but it seems maybe more straightforward to do the parsing entirely with recursive descent. Among other things, doing it this way allows us to provide more precise error reporting on malformed inputs. The cost, in terms of lines of code, of doing this is rather modest, and the result seems at least as clear -- there's some mental cost to code switching between the two worlds. So let's make the switch and parse the grammar without regular expressions. We verified that the rendered output of the Reference is byte identical before and after this change. --- mdbook-spec/src/grammar/parser.rs | 116 +++++++++++++++++------------- 1 file changed, 68 insertions(+), 48 deletions(-) diff --git a/mdbook-spec/src/grammar/parser.rs b/mdbook-spec/src/grammar/parser.rs index 7a92f4772..631fb7ae8 100644 --- a/mdbook-spec/src/grammar/parser.rs +++ b/mdbook-spec/src/grammar/parser.rs @@ -1,11 +1,9 @@ //! A parser of the ENBF-like grammar. use super::{Characters, Expression, ExpressionKind, Grammar, Production}; -use regex::{Captures, Regex}; use std::fmt; use std::fmt::Display; use std::path::Path; -use std::sync::LazyLock; struct Parser<'a> { input: &'a str, @@ -76,18 +74,6 @@ impl Parser<'_> { &self.input[i..i + upper] } - /// If the input matches the given regex, it is returned and the head is moved forward. - /// - /// Note that regexes must start with `^`. - fn take_re(&mut self, re: &Regex) -> Option> { - if let Some(cap) = re.captures(&self.input[self.index..]) { - self.index += cap[0].len(); - Some(cap) - } else { - None - } - } - /// Returns whether or not the given string is next, and advances the head if it is. 
fn take_str(&mut self, s: &str) -> bool { if self.input[self.index..].starts_with(s) { @@ -168,13 +154,12 @@ impl Parser<'_> { } fn parse_expression(&mut self) -> Result<Option<Expression>> { - static ALT_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^ *\| *").unwrap()); - let mut es = Vec::new(); loop { let Some(e) = self.parse_seq()? else { break }; es.push(e); - if self.take_re(&ALT_RE).is_none() { + _ = self.space0(); + if !self.take_str("|") { break; } } @@ -268,13 +253,20 @@ impl Parser<'_> { Some(ExpressionKind::Nt(nt)) } + /// Parse terminal within backticks. fn parse_terminal(&mut self) -> Result<ExpressionKind> { - static TERMINAL_RE: LazyLock<Regex> = - LazyLock::new(|| Regex::new(r"^`([^`\n]+)`").unwrap()); - match self.take_re(&TERMINAL_RE) { - Some(cap) => Ok(ExpressionKind::Terminal(cap[1].to_string())), - None => bail!(self, "unterminated terminal, expected closing backtick"), + Ok(ExpressionKind::Terminal(self.parse_terminal_str()?)) + } + + /// Parse string within backticks. + fn parse_terminal_str(&mut self) -> Result<String> { + self.expect("`", "expected opening backtick")?; + let term = self.take_while(&|x| !['\n', '`'].contains(&x)).to_string(); + if term.is_empty() { + bail!(self, "expected terminal"); } + self.expect("`", "expected closing backtick")?; + Ok(term) } fn parse_charset(&mut self) -> Result<ExpressionKind> { @@ -282,7 +274,7 @@ impl Parser<'_> { let mut characters = Vec::new(); loop { self.space0(); - let Some(ch) = self.parse_characters() else { + let Some(ch) = self.parse_characters()?
else { break; }; characters.push(ch); @@ -295,27 +287,48 @@ impl Parser<'_> { Ok(ExpressionKind::Charset(characters)) } - fn parse_characters(&mut self) -> Option<Characters> { - static RANGE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^`(.)`-`(.)`").unwrap()); - static TERMINAL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new("^`([^`\n]+)`").unwrap()); - if let Some(cap) = self.take_re(&RANGE_RE) { - let a = cap[1].chars().next().unwrap(); - let b = cap[2].chars().next().unwrap(); - Some(Characters::Range(a, b)) - } else if let Some(cap) = self.take_re(&TERMINAL_RE) { - Some(Characters::Terminal(cap[1].to_string())) + /// Parse an element of a character class, e.g. + /// `` `a`-`b` `` | `` `term` `` | `` NonTerminal ``. + fn parse_characters(&mut self) -> Result<Option<Characters>> { + if let Some(b'`') = self.peek() { + let recov = self.index; + let a = self.parse_terminal_str()?; + if self.take_str("-") { + //~^ Parse `` `a`-`b` `` character range. + if a.chars().count() > 1 { + self.index = recov + 1; + bail!(self, "invalid start terminal in range"); + } + let recov = self.index; + let b = self.parse_terminal_str()?; + if b.chars().count() > 1 { + self.index = recov + 1; + bail!(self, "invalid end terminal in range"); + } + let a = a.chars().next().unwrap(); + let b = b.chars().next().unwrap(); + Ok(Some(Characters::Range(a, b))) + } else { + //~^ Parse terminal in backticks. + Ok(Some(Characters::Terminal(a))) + } + } else if let Some(name) = self.parse_name() { + //~^ Parse nonterminal identifier. + Ok(Some(Characters::Named(name))) } else { - let name = self.parse_name()?; - Some(Characters::Named(name)) + Ok(None) } } + /// Parse e.g. `<prose text>`.
fn parse_prose(&mut self) -> Result<ExpressionKind> { - static PROSE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^<([^>\n]+)>").unwrap()); - match self.take_re(&PROSE_RE) { - Some(cap) => Ok(ExpressionKind::Prose(cap[1].to_string())), - None => bail!(self, "unterminated prose, expected closing `>`"), + self.expect("<", "expected opening `<`")?; + let text = self.take_while(&|x| !['\n', '>'].contains(&x)).to_string(); + if text.is_empty() { + bail!(self, "expected prose text"); } + self.expect(">", "expected closing `>`")?; + Ok(ExpressionKind::Prose(text)) } fn parse_grouped(&mut self) -> Result<ExpressionKind> { @@ -344,13 +357,19 @@ impl Parser<'_> { Ok(ExpressionKind::NegExpression(box_kind(kind))) } + /// Parse e.g. `F00F` after `U+`. fn parse_unicode(&mut self) -> Result<ExpressionKind> { - static UNICODE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[A-Z0-9]{4}").unwrap()); - - match self.take_re(&UNICODE_RE) { - Some(s) => Ok(ExpressionKind::Unicode(s[0].to_string())), - None => bail!(self, "expected 4 hexadecimal uppercase digits after U+"), + let mut xs = Vec::with_capacity(4); + for _ in 0..4 { + match self.peek() { + Some(x @ (b'0'..=b'9' | b'A'..=b'F')) => { + xs.push(x); + self.index += 1; + } + _ => bail!(self, "expected 4 uppercase hexadecimal digits after `U+`"), + } } + Ok(ExpressionKind::Unicode(String::from_utf8(xs).unwrap())) } /// Parse `?` after expression. @@ -428,16 +447,17 @@ impl Parser<'_> { Ok(Some(self.input[start..self.index - 1].to_string())) } + /// Parse footnote reference, e.g. `[^id]`.
fn parse_footnote(&mut self) -> Result<Option<String>> { - static FOOTNOTE_RE: LazyLock<Regex> = - LazyLock::new(|| Regex::new(r"^([^\]\n]+)]").unwrap()); if !self.take_str("[^") { return Ok(None); } - match self.take_re(&FOOTNOTE_RE) { - Some(cap) => Ok(Some(cap[1].to_string())), - None => bail!(self, "unterminated footnote, expected closing `]`"), + let id = self.take_while(&|x| !['\n', ']'].contains(&x)).to_string(); + if id.is_empty() { + bail!(self, "expected footnote id"); } + self.expect("]", "expected closing `]`")?; + Ok(Some(id)) } }