From d904a1bbdbdc8261bcf4bd59322b8c8e2269abac Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Wed, 31 Jul 2024 20:27:50 +0100 Subject: [PATCH 1/5] custom postgreSQL precedence --- src/dialect/postgresql.rs | 110 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs index 8254e807b..8ac9c2072 100644 --- a/src/dialect/postgresql.rs +++ b/src/dialect/postgresql.rs @@ -9,6 +9,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +use log::debug; use crate::ast::{CommentObject, Statement}; use crate::dialect::Dialect; @@ -20,6 +21,24 @@ use crate::tokenizer::Token; #[derive(Debug)] pub struct PostgreSqlDialect {} + +// based on https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-PRECEDENCE +const DOUBLE_COLON_PREC: u8 = 140; +const BRACKET_PREC: u8 = 130; +const COLLATE_PREC: u8 = 120; +const AT_TZ_PREC: u8 = 110; +const CARET_PREC: u8 = 100; +const MUL_DIV_MOD_OP_PREC: u8 = 90; +const PLUS_MINUS_PREC: u8 = 80; +const PG_OTHER_PREC: u8 = 70; +const BETWEEN_LIKE_PREC: u8 = 60; +const EQ_PREC: u8 = 50; +const IS_PREC: u8 = 40; +const NOT_PREC: u8 = 30; +const AND_PREC: u8 = 20; +const OR_PREC: u8 = 10; +const UNKNOWN_PREC: u8 = 0; + impl Dialect for PostgreSqlDialect { fn identifier_quote_style(&self, _identifier: &str) -> Option { Some('"') @@ -67,6 +86,97 @@ impl Dialect for PostgreSqlDialect { ) } + fn get_next_precedence(&self, parser: &Parser) -> Option> { + // return None to fall back to the default behavior + + let token = parser.peek_token(); + debug!("get_next_precedence() {:?}", token); + let precedence = match token.token { + Token::Word(w) if w.keyword == Keyword::OR => OR_PREC, + Token::Word(w) if w.keyword == Keyword::AND => AND_PREC, + Token::Word(w) if w.keyword == Keyword::AT => { + match (parser.peek_nth_token(1).token, 
parser.peek_nth_token(2).token) { + (Token::Word(w), Token::Word(w2)) + if w.keyword == Keyword::TIME && w2.keyword == Keyword::ZONE => + { + AT_TZ_PREC + } + _ => UNKNOWN_PREC, + } + } + + Token::Word(w) if w.keyword == Keyword::NOT => match parser.peek_nth_token(1).token { + // The precedence of NOT varies depending on keyword that + // follows it. If it is followed by IN, BETWEEN, or LIKE, + // it takes on the precedence of those tokens. Otherwise, it + // is not an infix operator, and therefore has zero + // precedence. + Token::Word(w) if w.keyword == Keyword::IN => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::BETWEEN => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::LIKE => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::ILIKE => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::RLIKE => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::REGEXP => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::SIMILAR => BETWEEN_LIKE_PREC, + _ => NOT_PREC, + }, + Token::Word(w) if w.keyword == Keyword::IS => IS_PREC, + Token::Word(w) if w.keyword == Keyword::IN => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::BETWEEN => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::LIKE => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::ILIKE => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::RLIKE => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::REGEXP => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::SIMILAR => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::OPERATOR => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::DIV => MUL_DIV_MOD_OP_PREC, + Token::Word(w) if w.keyword == Keyword::COLLATE => COLLATE_PREC, + Token::Eq + | Token::Lt + | Token::LtEq + | Token::Neq + | Token::Gt + | Token::GtEq + | Token::DoubleEq + | Token::Tilde + | Token::TildeAsterisk + | Token::ExclamationMarkTilde + | 
Token::ExclamationMarkTildeAsterisk + | Token::DoubleTilde + | Token::DoubleTildeAsterisk + | Token::ExclamationMarkDoubleTilde + | Token::ExclamationMarkDoubleTildeAsterisk + | Token::Spaceship => EQ_PREC, + Token::Caret => CARET_PREC, + Token::Plus | Token::Minus => PLUS_MINUS_PREC, + Token::Mul | Token::Div | Token::Mod => MUL_DIV_MOD_OP_PREC, + Token::DoubleColon => DOUBLE_COLON_PREC, + Token::LBracket => BRACKET_PREC, + Token::Arrow + | Token::LongArrow + | Token::HashArrow + | Token::HashLongArrow + | Token::AtArrow + | Token::ArrowAt + | Token::HashMinus + | Token::AtQuestion + | Token::AtAt + | Token::Question + | Token::QuestionAnd + | Token::QuestionPipe + | Token::ExclamationMark + | Token::Overlap + | Token::CaretAt + | Token::StringConcat + | Token::Sharp + | Token::ShiftRight + | Token::ShiftLeft + | Token::CustomBinaryOperator(_) => PG_OTHER_PREC, + _ => UNKNOWN_PREC, + }; + Some(Ok(precedence)) + } + fn parse_statement(&self, parser: &mut Parser) -> Option> { if parser.parse_keyword(Keyword::COMMENT) { Some(parse_comment(parser)) From 278571bdb9f5a9178ae6eb805af8e952e950cafc Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Thu, 1 Aug 2024 01:08:19 +0100 Subject: [PATCH 2/5] correct postgres lexical precedence --- src/ast/operator.rs | 2 +- src/dialect/mod.rs | 6 +- src/dialect/postgresql.rs | 128 ++++++++++++++++---------- src/parser/mod.rs | 185 ++++++++++++++++++++++++-------------- 4 files changed, 203 insertions(+), 118 deletions(-) diff --git a/src/ast/operator.rs b/src/ast/operator.rs index e70df344a..db6ed0564 100644 --- a/src/ast/operator.rs +++ b/src/ast/operator.rs @@ -151,7 +151,7 @@ pub enum BinaryOperator { Arrow, /// The `->>` operator. /// - /// On PostgreSQL, this operator that extracts a JSON object field or JSON + /// On PostgreSQL, this operator extracts a JSON object field or JSON /// array element and converts it to text, for example `'{"a":"b"}'::json /// ->> 'a'` or `[1, 2, 3]'::json ->> 2`. 
/// diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 22e0baeb2..066bf37e3 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -44,7 +44,7 @@ pub use self::redshift::RedshiftSqlDialect; pub use self::snowflake::SnowflakeDialect; pub use self::sqlite::SQLiteDialect; pub use crate::keywords; -use crate::parser::{Parser, ParserError}; +use crate::parser::{Parser, ParserError, Precedence}; #[cfg(not(feature = "std"))] use alloc::boxed::Box; @@ -305,6 +305,10 @@ pub trait Dialect: Debug + Any { // return None to fall back to the default behavior None } + + fn precedence_numeric(&self, p: Precedence) -> u8 { + p.numeric() + } } impl dyn Dialect { diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs index 8ac9c2072..9d11d5ab8 100644 --- a/src/dialect/postgresql.rs +++ b/src/dialect/postgresql.rs @@ -14,30 +14,15 @@ use log::debug; use crate::ast::{CommentObject, Statement}; use crate::dialect::Dialect; use crate::keywords::Keyword; -use crate::parser::{Parser, ParserError}; +use crate::parser::{Parser, ParserError, Precedence}; use crate::tokenizer::Token; /// A [`Dialect`] for [PostgreSQL](https://www.postgresql.org/) #[derive(Debug)] pub struct PostgreSqlDialect {} - -// based on https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-PRECEDENCE -const DOUBLE_COLON_PREC: u8 = 140; const BRACKET_PREC: u8 = 130; const COLLATE_PREC: u8 = 120; -const AT_TZ_PREC: u8 = 110; -const CARET_PREC: u8 = 100; -const MUL_DIV_MOD_OP_PREC: u8 = 90; -const PLUS_MINUS_PREC: u8 = 80; -const PG_OTHER_PREC: u8 = 70; -const BETWEEN_LIKE_PREC: u8 = 60; -const EQ_PREC: u8 = 50; -const IS_PREC: u8 = 40; -const NOT_PREC: u8 = 30; -const AND_PREC: u8 = 20; -const OR_PREC: u8 = 10; -const UNKNOWN_PREC: u8 = 0; impl Dialect for PostgreSqlDialect { fn identifier_quote_style(&self, _identifier: &str) -> Option { @@ -87,21 +72,28 @@ impl Dialect for PostgreSqlDialect { } fn get_next_precedence(&self, parser: &Parser) -> Option> { - // return None to fall back 
to the default behavior - let token = parser.peek_token(); debug!("get_next_precedence() {:?}", token); + + macro_rules! p { + ($precedence:ident) => {self.precedence_numeric(Precedence::$precedence)}; + } + let precedence = match token.token { - Token::Word(w) if w.keyword == Keyword::OR => OR_PREC, - Token::Word(w) if w.keyword == Keyword::AND => AND_PREC, + Token::Word(w) if w.keyword == Keyword::OR => p!(Or), + Token::Word(w) if w.keyword == Keyword::XOR => p!(Xor), + Token::Word(w) if w.keyword == Keyword::AND => p!(And), Token::Word(w) if w.keyword == Keyword::AT => { - match (parser.peek_nth_token(1).token, parser.peek_nth_token(2).token) { + match ( + parser.peek_nth_token(1).token, + parser.peek_nth_token(2).token, + ) { (Token::Word(w), Token::Word(w2)) if w.keyword == Keyword::TIME && w2.keyword == Keyword::ZONE => { - AT_TZ_PREC + p!(AtTz) } - _ => UNKNOWN_PREC, + _ => p!(Unknown), } } @@ -111,25 +103,25 @@ impl Dialect for PostgreSqlDialect { // it takes on the precedence of those tokens. Otherwise, it // is not an infix operator, and therefore has zero // precedence. 
- Token::Word(w) if w.keyword == Keyword::IN => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::BETWEEN => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::LIKE => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::ILIKE => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::RLIKE => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::REGEXP => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::SIMILAR => BETWEEN_LIKE_PREC, - _ => NOT_PREC, + Token::Word(w) if w.keyword == Keyword::IN => p!(Between), + Token::Word(w) if w.keyword == Keyword::BETWEEN => p!(Between), + Token::Word(w) if w.keyword == Keyword::LIKE => p!(Between), + Token::Word(w) if w.keyword == Keyword::ILIKE => p!(Between), + Token::Word(w) if w.keyword == Keyword::RLIKE => p!(Between), + Token::Word(w) if w.keyword == Keyword::REGEXP => p!(Between), + Token::Word(w) if w.keyword == Keyword::SIMILAR => p!(Between), + _ => p!(Unknown), }, - Token::Word(w) if w.keyword == Keyword::IS => IS_PREC, - Token::Word(w) if w.keyword == Keyword::IN => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::BETWEEN => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::LIKE => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::ILIKE => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::RLIKE => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::REGEXP => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::SIMILAR => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::OPERATOR => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::DIV => MUL_DIV_MOD_OP_PREC, + Token::Word(w) if w.keyword == Keyword::IS => p!(Is), + Token::Word(w) if w.keyword == Keyword::IN => p!(Between), + Token::Word(w) if w.keyword == Keyword::BETWEEN => p!(Between), + Token::Word(w) if w.keyword == Keyword::LIKE => p!(Between), + Token::Word(w) if w.keyword == Keyword::ILIKE => p!(Between), + Token::Word(w) 
if w.keyword == Keyword::RLIKE => p!(Between), + Token::Word(w) if w.keyword == Keyword::REGEXP => p!(Between), + Token::Word(w) if w.keyword == Keyword::SIMILAR => p!(Between), + Token::Word(w) if w.keyword == Keyword::OPERATOR => p!(Between), + Token::Word(w) if w.keyword == Keyword::DIV => p!(MulDivModOp), Token::Word(w) if w.keyword == Keyword::COLLATE => COLLATE_PREC, Token::Eq | Token::Lt @@ -146,11 +138,13 @@ impl Dialect for PostgreSqlDialect { | Token::DoubleTildeAsterisk | Token::ExclamationMarkDoubleTilde | Token::ExclamationMarkDoubleTildeAsterisk - | Token::Spaceship => EQ_PREC, - Token::Caret => CARET_PREC, - Token::Plus | Token::Minus => PLUS_MINUS_PREC, - Token::Mul | Token::Div | Token::Mod => MUL_DIV_MOD_OP_PREC, - Token::DoubleColon => DOUBLE_COLON_PREC, + | Token::Spaceship => p!(Eq), + Token::Pipe => p!(Pipe), + Token::Caret => p!(Caret), + Token::Ampersand => p!(Ampersand), + Token::Plus | Token::Minus => p!(PlusMinus), + Token::Mul | Token::Div | Token::Mod => p!(MulDivModOp), + Token::DoubleColon => p!(DoubleColon), Token::LBracket => BRACKET_PREC, Token::Arrow | Token::LongArrow @@ -171,8 +165,8 @@ impl Dialect for PostgreSqlDialect { | Token::Sharp | Token::ShiftRight | Token::ShiftLeft - | Token::CustomBinaryOperator(_) => PG_OTHER_PREC, - _ => UNKNOWN_PREC, + | Token::CustomBinaryOperator(_) => p!(PgOther), + _ => p!(Unknown), }; Some(Ok(precedence)) } @@ -192,6 +186,44 @@ impl Dialect for PostgreSqlDialect { fn supports_group_by_expr(&self) -> bool { true } + + /* + const DOUBLE_COLON_PREC: u8 = 140; + const BRACKET_PREC: u8 = 130; + const COLLATE_PREC: u8 = 120; + const AT_TZ_PREC: u8 = 110; + const CARET_PREC: u8 = 100; + const MUL_DIV_MOD_OP_PREC: u8 = 90; + const PLUS_MINUS_PREC: u8 = 80; + const PG_OTHER_PREC: u8 = 70; + const BETWEEN_LIKE_PREC: u8 = 60; + const EQ_PREC: u8 = 50; + const IS_PREC: u8 = 40; + const NOT_PREC: u8 = 30; + const AND_PREC: u8 = 20; + const OR_PREC: u8 = 10; + const UNKNOWN_PREC: u8 = 0; + */ + /// based 
on https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-PRECEDENCE + fn precedence_numeric(&self, p: Precedence) -> u8 { + match p { + Precedence::DoubleColon => 140, + Precedence::AtTz => 110, + Precedence::MulDivModOp => 90, + Precedence::PlusMinus => 80, + Precedence::Caret => 110, + Precedence::Between => 60, + Precedence::Eq => 50, + Precedence::Like => 60, + Precedence::Is => 40, + Precedence::PgOther | Precedence::Pipe | Precedence::Ampersand => 70, + Precedence::UnaryNot => 30, + Precedence::And => 20, + Precedence::Xor => 79, + Precedence::Or => 10, + Precedence::Unknown => 0, + } + } } pub fn parse_comment(parser: &mut Parser) -> Result { diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 725e24bfb..3b9e8692c 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1075,7 +1075,7 @@ impl<'a> Parser<'a> { self.parse_bigquery_struct_literal() } Keyword::PRIOR if matches!(self.state, ParserState::ConnectBy) => { - let expr = self.parse_subexpr(Self::PLUS_MINUS_PREC)?; + let expr = self.parse_subexpr(self.prec(Precedence::PlusMinus))?; Ok(Expr::Prior(Box::new(expr))) } Keyword::MAP if self.peek_token() == Token::LBrace && self.dialect.support_map_literal_syntax() => { @@ -1163,7 +1163,7 @@ impl<'a> Parser<'a> { }; Ok(Expr::UnaryOp { op, - expr: Box::new(self.parse_subexpr(Self::MUL_DIV_MOD_OP_PREC)?), + expr: Box::new(self.parse_subexpr(self.prec(Precedence::MulDivModOp))?), }) } tok @ Token::DoubleExclamationMark @@ -1183,7 +1183,7 @@ impl<'a> Parser<'a> { }; Ok(Expr::UnaryOp { op, - expr: Box::new(self.parse_subexpr(Self::PLUS_MINUS_PREC)?), + expr: Box::new(self.parse_subexpr(self.prec(Precedence::PlusMinus))?), }) } Token::EscapedStringLiteral(_) if dialect_of!(self is PostgreSqlDialect | GenericDialect) => @@ -1712,12 +1712,13 @@ impl<'a> Parser<'a> { } pub fn parse_position_expr(&mut self, ident: Ident) -> Result { + let between_prec = self.prec(Precedence::Between); let position_expr = self.maybe_parse(|p| { // PARSE SELECT 
POSITION('@' in field) p.expect_token(&Token::LParen)?; // Parse the subexpr till the IN keyword - let expr = p.parse_subexpr(Self::BETWEEN_PREC)?; + let expr = p.parse_subexpr(between_prec)?; p.expect_keyword(Keyword::IN)?; let from = p.parse_expr()?; p.expect_token(&Token::RParen)?; @@ -1967,12 +1968,12 @@ impl<'a> Parser<'a> { } _ => Ok(Expr::UnaryOp { op: UnaryOperator::Not, - expr: Box::new(self.parse_subexpr(Self::UNARY_NOT_PREC)?), + expr: Box::new(self.parse_subexpr(self.prec(Precedence::UnaryNot))?), }), }, _ => Ok(Expr::UnaryOp { op: UnaryOperator::Not, - expr: Box::new(self.parse_subexpr(Self::UNARY_NOT_PREC)?), + expr: Box::new(self.parse_subexpr(self.prec(Precedence::UnaryNot))?), }), } } @@ -2648,7 +2649,7 @@ impl<'a> Parser<'a> { Ok(Expr::RLike { negated, expr: Box::new(expr), - pattern: Box::new(self.parse_subexpr(Self::LIKE_PREC)?), + pattern: Box::new(self.parse_subexpr(self.prec(Precedence::Like))?), regexp, }) } else if self.parse_keyword(Keyword::IN) { @@ -2659,21 +2660,21 @@ impl<'a> Parser<'a> { Ok(Expr::Like { negated, expr: Box::new(expr), - pattern: Box::new(self.parse_subexpr(Self::LIKE_PREC)?), + pattern: Box::new(self.parse_subexpr(self.prec(Precedence::Like))?), escape_char: self.parse_escape_char()?, }) } else if self.parse_keyword(Keyword::ILIKE) { Ok(Expr::ILike { negated, expr: Box::new(expr), - pattern: Box::new(self.parse_subexpr(Self::LIKE_PREC)?), + pattern: Box::new(self.parse_subexpr(self.prec(Precedence::Like))?), escape_char: self.parse_escape_char()?, }) } else if self.parse_keywords(&[Keyword::SIMILAR, Keyword::TO]) { Ok(Expr::SimilarTo { negated, expr: Box::new(expr), - pattern: Box::new(self.parse_subexpr(Self::LIKE_PREC)?), + pattern: Box::new(self.parse_subexpr(self.prec(Precedence::Like))?), escape_char: self.parse_escape_char()?, }) } else { @@ -2948,9 +2949,9 @@ impl<'a> Parser<'a> { pub fn parse_between(&mut self, expr: Expr, negated: bool) -> Result { // Stop parsing subexpressions for and on tokens with // 
precedence lower than that of `BETWEEN`, such as `AND`, `IS`, etc. - let low = self.parse_subexpr(Self::BETWEEN_PREC)?; + let low = self.parse_subexpr(self.prec(Precedence::Between))?; self.expect_keyword(Keyword::AND)?; - let high = self.parse_subexpr(Self::BETWEEN_PREC)?; + let high = self.parse_subexpr(self.prec(Precedence::Between))?; Ok(Expr::Between { expr: Box::new(expr), negated, @@ -2969,24 +2970,6 @@ impl<'a> Parser<'a> { }) } - // Use https://www.postgresql.org/docs/7.0/operators.htm#AEN2026 as a reference - // higher number = higher precedence - // - // NOTE: The pg documentation is incomplete, e.g. the AT TIME ZONE operator - // actually has higher precedence than addition. - // See https://postgrespro.com/list/thread-id/2673331. - const AT_TZ_PREC: u8 = 41; - const MUL_DIV_MOD_OP_PREC: u8 = 40; - const PLUS_MINUS_PREC: u8 = 30; - const XOR_PREC: u8 = 24; - const BETWEEN_PREC: u8 = 20; - const LIKE_PREC: u8 = 19; - const IS_PREC: u8 = 17; - const PG_OTHER_PREC: u8 = 16; - const UNARY_NOT_PREC: u8 = 15; - const AND_PREC: u8 = 10; - const OR_PREC: u8 = 5; - /// Get the precedence of the next token pub fn get_next_precedence(&self) -> Result { // allow the dialect to override precedence logic @@ -2994,23 +2977,25 @@ impl<'a> Parser<'a> { return precedence; } + macro_rules! 
p { + ($precedence:ident) => {self.prec(Precedence::$precedence)}; + } + let token = self.peek_token(); debug!("get_next_precedence() {:?}", token); - let [token_0, token_1, token_2] = self.peek_tokens_with_location(); - debug!("0: {token_0} 1: {token_1} 2: {token_2}"); match token.token { - Token::Word(w) if w.keyword == Keyword::OR => Ok(Self::OR_PREC), - Token::Word(w) if w.keyword == Keyword::AND => Ok(Self::AND_PREC), - Token::Word(w) if w.keyword == Keyword::XOR => Ok(Self::XOR_PREC), + Token::Word(w) if w.keyword == Keyword::OR => Ok(p!(Or)), + Token::Word(w) if w.keyword == Keyword::AND => Ok(p!(And)), + Token::Word(w) if w.keyword == Keyword::XOR => Ok(p!(Xor)), Token::Word(w) if w.keyword == Keyword::AT => { match (self.peek_nth_token(1).token, self.peek_nth_token(2).token) { (Token::Word(w), Token::Word(w2)) if w.keyword == Keyword::TIME && w2.keyword == Keyword::ZONE => { - Ok(Self::AT_TZ_PREC) + Ok(p!(AtTz)) } - _ => Ok(0), + _ => Ok(p!(Unknown)), } } @@ -3020,25 +3005,25 @@ impl<'a> Parser<'a> { // it takes on the precedence of those tokens. Otherwise, it // is not an infix operator, and therefore has zero // precedence. 
- Token::Word(w) if w.keyword == Keyword::IN => Ok(Self::BETWEEN_PREC), - Token::Word(w) if w.keyword == Keyword::BETWEEN => Ok(Self::BETWEEN_PREC), - Token::Word(w) if w.keyword == Keyword::LIKE => Ok(Self::LIKE_PREC), - Token::Word(w) if w.keyword == Keyword::ILIKE => Ok(Self::LIKE_PREC), - Token::Word(w) if w.keyword == Keyword::RLIKE => Ok(Self::LIKE_PREC), - Token::Word(w) if w.keyword == Keyword::REGEXP => Ok(Self::LIKE_PREC), - Token::Word(w) if w.keyword == Keyword::SIMILAR => Ok(Self::LIKE_PREC), - _ => Ok(0), + Token::Word(w) if w.keyword == Keyword::IN => Ok(p!(Between)), + Token::Word(w) if w.keyword == Keyword::BETWEEN => Ok(p!(Between)), + Token::Word(w) if w.keyword == Keyword::LIKE => Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::ILIKE => Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::RLIKE => Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::REGEXP => Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::SIMILAR => Ok(p!(Like)), + _ => Ok(p!(Unknown)), }, - Token::Word(w) if w.keyword == Keyword::IS => Ok(Self::IS_PREC), - Token::Word(w) if w.keyword == Keyword::IN => Ok(Self::BETWEEN_PREC), - Token::Word(w) if w.keyword == Keyword::BETWEEN => Ok(Self::BETWEEN_PREC), - Token::Word(w) if w.keyword == Keyword::LIKE => Ok(Self::LIKE_PREC), - Token::Word(w) if w.keyword == Keyword::ILIKE => Ok(Self::LIKE_PREC), - Token::Word(w) if w.keyword == Keyword::RLIKE => Ok(Self::LIKE_PREC), - Token::Word(w) if w.keyword == Keyword::REGEXP => Ok(Self::LIKE_PREC), - Token::Word(w) if w.keyword == Keyword::SIMILAR => Ok(Self::LIKE_PREC), - Token::Word(w) if w.keyword == Keyword::OPERATOR => Ok(Self::BETWEEN_PREC), - Token::Word(w) if w.keyword == Keyword::DIV => Ok(Self::MUL_DIV_MOD_OP_PREC), + Token::Word(w) if w.keyword == Keyword::IS => Ok(p!(Is)), + Token::Word(w) if w.keyword == Keyword::IN => Ok(p!(Between)), + Token::Word(w) if w.keyword == Keyword::BETWEEN => Ok(p!(Between)), + Token::Word(w) if w.keyword == Keyword::LIKE => 
Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::ILIKE => Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::RLIKE => Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::REGEXP => Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::SIMILAR => Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::OPERATOR => Ok(p!(Between)), + Token::Word(w) if w.keyword == Keyword::DIV => Ok(p!(MulDivModOp)), Token::Eq | Token::Lt | Token::LtEq @@ -3054,18 +3039,22 @@ impl<'a> Parser<'a> { | Token::DoubleTildeAsterisk | Token::ExclamationMarkDoubleTilde | Token::ExclamationMarkDoubleTildeAsterisk - | Token::Spaceship => Ok(20), - Token::Pipe => Ok(21), - Token::Caret | Token::Sharp | Token::ShiftRight | Token::ShiftLeft => Ok(22), - Token::Ampersand => Ok(23), - Token::Plus | Token::Minus => Ok(Self::PLUS_MINUS_PREC), - Token::Mul | Token::Div | Token::DuckIntDiv | Token::Mod | Token::StringConcat => { - Ok(Self::MUL_DIV_MOD_OP_PREC) + | Token::Spaceship => Ok(p!(Eq)), + Token::Pipe => Ok(p!(Pipe)), + Token::Caret | Token::Sharp | Token::ShiftRight | Token::ShiftLeft => { + Ok(p!(Caret)) } - Token::DoubleColon => Ok(50), - Token::Colon if dialect_of!(self is SnowflakeDialect) => Ok(50), - Token::ExclamationMark => Ok(50), - Token::LBracket | Token::Overlap | Token::CaretAt => Ok(50), + Token::Ampersand => Ok(p!(Ampersand)), + Token::Plus | Token::Minus => Ok(p!(PlusMinus)), + Token::Mul | Token::Div | Token::DuckIntDiv | Token::Mod | Token::StringConcat => { + Ok(p!(MulDivModOp)) + } + Token::DoubleColon + | Token::ExclamationMark + | Token::LBracket + | Token::Overlap + | Token::CaretAt => Ok(p!(DoubleColon)), + Token::Colon if dialect_of!(self is SnowflakeDialect) => Ok(p!(DoubleColon)), Token::Arrow | Token::LongArrow | Token::HashArrow @@ -3078,11 +3067,15 @@ impl<'a> Parser<'a> { | Token::Question | Token::QuestionAnd | Token::QuestionPipe - | Token::CustomBinaryOperator(_) => Ok(Self::PG_OTHER_PREC), - _ => Ok(0), + | Token::CustomBinaryOperator(_) => 
Ok(p!(PgOther)), + _ => Ok(p!(Unknown)), } } + fn prec(&self, p: Precedence) -> u8 { + self.dialect.precedence_numeric(p) + } + /// Return the first non-whitespace token that has not yet been processed /// (or None if reached end-of-file) pub fn peek_token(&self) -> TokenWithLocation { @@ -11399,6 +11392,62 @@ impl<'a> Parser<'a> { } } + +/// Use to define the lexical Precedence of operators. +/// +/// Numeric values of enum members are used to define the default precedence of the operators. +/// +/// Uses (APPROXIMATELY) <https://www.postgresql.org/docs/7.0/operators.htm#AEN2026> as a reference +/// higher number = higher precedence +/// +/// NOTE: The pg documentation is incomplete, e.g. the AT TIME ZONE operator +/// actually has higher precedence than addition. +/// See <https://postgrespro.com/list/thread-id/2673331>. +#[derive(Debug, Clone, Copy)] +#[repr(u8)] +pub enum Precedence { + DoubleColon, + AtTz, + MulDivModOp, + PlusMinus, + Xor, + Ampersand, + Caret, + Pipe, + Between, + Eq, + Like, + Is, + PgOther, + UnaryNot, + And, + Or, + Unknown, +} + +impl Precedence { + pub fn numeric(&self) -> u8 { + match self { + Precedence::DoubleColon => 50, + Precedence::AtTz => 41, + Precedence::MulDivModOp => 40, + Precedence::PlusMinus => 30, + Precedence::Xor => 24, + Precedence::Ampersand => 23, + Precedence::Caret => 22, + Precedence::Pipe => 21, + Precedence::Between | Precedence::Eq => 20, + Precedence::Like => 19, + Precedence::Is => 17, + Precedence::PgOther => 16, + Precedence::UnaryNot => 15, + Precedence::And => 10, + Precedence::Or => 5, + Precedence::Unknown => 0, + } + } +} + impl Word { pub fn to_ident(&self) -> Ident { Ident { From ac53f97b9b73876c018d73d2be7eac9852f04f64 Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Thu, 1 Aug 2024 16:43:18 +0100 Subject: [PATCH 3/5] simplify precedence logic, add tests --- src/dialect/mod.rs | 162 +++++++++++++++++++++++++++++- src/dialect/postgresql.rs | 138 ++++++++++++------------- src/dialect/snowflake.rs | 9 ++ src/parser/mod.rs | 195 ++++-------------------------------- tests/sqlparser_postgres.rs | 112 
+++++++++++++++++++++ 5 files changed, 364 insertions(+), 252 deletions(-) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 066bf37e3..d47770857 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -44,10 +44,13 @@ pub use self::redshift::RedshiftSqlDialect; pub use self::snowflake::SnowflakeDialect; pub use self::sqlite::SQLiteDialect; pub use crate::keywords; -use crate::parser::{Parser, ParserError, Precedence}; +use crate::parser::{Parser, ParserError}; +use crate::keywords::Keyword; +use crate::tokenizer::Token; #[cfg(not(feature = "std"))] use alloc::boxed::Box; +use log::debug; /// Convenience check if a [`Parser`] uses a certain dialect. /// @@ -300,17 +303,170 @@ pub trait Dialect: Debug + Any { // return None to fall back to the default behavior None } + + /// Get the precedence of the next token + /// + /// Higher number => higher precedence + fn get_next_precedence_full(&self, parser: &Parser) -> Result { + if let Some(precedence) = self.get_next_precedence(parser) { + return precedence; + } + + let token = parser.peek_token(); + debug!("get_next_precedence() {:?}", token); + match token.token { + Token::Word(w) if w.keyword == Keyword::OR => Ok(OR_PREC), + Token::Word(w) if w.keyword == Keyword::AND => Ok(AND_PREC), + Token::Word(w) if w.keyword == Keyword::XOR => Ok(XOR_PREC), + + Token::Word(w) if w.keyword == Keyword::AT => { + match ( + parser.peek_nth_token(1).token, + parser.peek_nth_token(2).token, + ) { + (Token::Word(w), Token::Word(w2)) + if w.keyword == Keyword::TIME && w2.keyword == Keyword::ZONE => + { + Ok(AT_TZ_PREC) + } + _ => Ok(UNKNOWN_PREC), + } + } + + Token::Word(w) if w.keyword == Keyword::NOT => match parser.peek_nth_token(1).token { + // The precedence of NOT varies depending on keyword that + // follows it. If it is followed by IN, BETWEEN, or LIKE, + // it takes on the precedence of those tokens. Otherwise, it + // is not an infix operator, and therefore has zero + // precedence. 
+ Token::Word(w) if w.keyword == Keyword::IN => Ok(BETWEEN_PREC), + Token::Word(w) if w.keyword == Keyword::BETWEEN => Ok(BETWEEN_PREC), + Token::Word(w) if w.keyword == Keyword::LIKE => Ok(LIKE_PREC), + Token::Word(w) if w.keyword == Keyword::ILIKE => Ok(LIKE_PREC), + Token::Word(w) if w.keyword == Keyword::RLIKE => Ok(LIKE_PREC), + Token::Word(w) if w.keyword == Keyword::REGEXP => Ok(LIKE_PREC), + Token::Word(w) if w.keyword == Keyword::SIMILAR => Ok(LIKE_PREC), + _ => Ok(UNKNOWN_PREC), + }, + Token::Word(w) if w.keyword == Keyword::IS => Ok(IS_PREC), + Token::Word(w) if w.keyword == Keyword::IN => Ok(BETWEEN_PREC), + Token::Word(w) if w.keyword == Keyword::BETWEEN => Ok(BETWEEN_PREC), + Token::Word(w) if w.keyword == Keyword::LIKE => Ok(LIKE_PREC), + Token::Word(w) if w.keyword == Keyword::ILIKE => Ok(LIKE_PREC), + Token::Word(w) if w.keyword == Keyword::RLIKE => Ok(LIKE_PREC), + Token::Word(w) if w.keyword == Keyword::REGEXP => Ok(LIKE_PREC), + Token::Word(w) if w.keyword == Keyword::SIMILAR => Ok(LIKE_PREC), + Token::Word(w) if w.keyword == Keyword::OPERATOR => Ok(BETWEEN_PREC), + Token::Word(w) if w.keyword == Keyword::DIV => Ok(MUL_DIV_MOD_OP_PREC), + Token::Eq + | Token::Lt + | Token::LtEq + | Token::Neq + | Token::Gt + | Token::GtEq + | Token::DoubleEq + | Token::Tilde + | Token::TildeAsterisk + | Token::ExclamationMarkTilde + | Token::ExclamationMarkTildeAsterisk + | Token::DoubleTilde + | Token::DoubleTildeAsterisk + | Token::ExclamationMarkDoubleTilde + | Token::ExclamationMarkDoubleTildeAsterisk + | Token::Spaceship => Ok(EQ_PREC), + Token::Pipe => Ok(PIPE_PREC), + Token::Caret | Token::Sharp | Token::ShiftRight | Token::ShiftLeft => Ok(CARET_PREC), + Token::Ampersand => Ok(AMPERSAND_PREC), + Token::Plus | Token::Minus => Ok(PLUS_MINUS_PREC), + Token::Mul | Token::Div | Token::DuckIntDiv | Token::Mod | Token::StringConcat => { + Ok(MUL_DIV_MOD_OP_PREC) + } + Token::DoubleColon + | Token::ExclamationMark + | Token::LBracket + | Token::Overlap + | 
Token::CaretAt => Ok(DOUBLE_COLON_PREC), + // Token::Colon if (self as dyn Dialect).is::() => Ok(DOUBLE_COLON_PREC), + Token::Arrow + | Token::LongArrow + | Token::HashArrow + | Token::HashLongArrow + | Token::AtArrow + | Token::ArrowAt + | Token::HashMinus + | Token::AtQuestion + | Token::AtAt + | Token::Question + | Token::QuestionAnd + | Token::QuestionPipe + | Token::CustomBinaryOperator(_) => Ok(PG_OTHER_PREC), + _ => Ok(UNKNOWN_PREC), + } + } + /// Dialect-specific statement parser override fn parse_statement(&self, _parser: &mut Parser) -> Option> { // return None to fall back to the default behavior None } - fn precedence_numeric(&self, p: Precedence) -> u8 { - p.numeric() + /// The following precedence values are used directly by `Parse` or in dialects, + /// so have to be made public by the dialect. + fn prec_double_colon(&self) -> u8 { + DOUBLE_COLON_PREC + } + + fn prec_mul_div_mod_op(&self) -> u8 { + MUL_DIV_MOD_OP_PREC + } + + fn prec_plus_minus(&self) -> u8 { + PLUS_MINUS_PREC + } + + fn prec_between(&self) -> u8 { + BETWEEN_PREC + } + + fn prec_like(&self) -> u8 { + LIKE_PREC + } + + fn prec_unary_not(&self) -> u8 { + UNARY_NOT_PREC + } + + fn prec_unknown(&self) -> u8 { + UNKNOWN_PREC } } +// Define the lexical Precedence of operators. +// +// Uses (APPROXIMATELY) <https://www.postgresql.org/docs/7.0/operators.htm#AEN2026> as a reference +// higher number = higher precedence +// +// NOTE: The pg documentation is incomplete, e.g. the AT TIME ZONE operator +// actually has higher precedence than addition. +// See <https://postgrespro.com/list/thread-id/2673331>. 
+const DOUBLE_COLON_PREC: u8 = 50; +const AT_TZ_PREC: u8 = 41; +const MUL_DIV_MOD_OP_PREC: u8 = 40; +const PLUS_MINUS_PREC: u8 = 30; +const XOR_PREC: u8 = 24; +const AMPERSAND_PREC: u8 = 23; +const CARET_PREC: u8 = 22; +const PIPE_PREC: u8 = 21; +const BETWEEN_PREC: u8 = 20; +const EQ_PREC: u8 = 20; +const LIKE_PREC: u8 = 19; +const IS_PREC: u8 = 17; +const PG_OTHER_PREC: u8 = 16; +const UNARY_NOT_PREC: u8 = 15; +const AND_PREC: u8 = 10; +const OR_PREC: u8 = 5; +const UNKNOWN_PREC: u8 = 0; + impl dyn Dialect { #[inline] pub fn is(&self) -> bool { diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs index 9d11d5ab8..293fb9e7d 100644 --- a/src/dialect/postgresql.rs +++ b/src/dialect/postgresql.rs @@ -14,15 +14,29 @@ use log::debug; use crate::ast::{CommentObject, Statement}; use crate::dialect::Dialect; use crate::keywords::Keyword; -use crate::parser::{Parser, ParserError, Precedence}; +use crate::parser::{Parser, ParserError}; use crate::tokenizer::Token; /// A [`Dialect`] for [PostgreSQL](https://www.postgresql.org/) #[derive(Debug)] pub struct PostgreSqlDialect {} +const DOUBLE_COLON_PREC: u8 = 140; const BRACKET_PREC: u8 = 130; const COLLATE_PREC: u8 = 120; +const AT_TZ_PREC: u8 = 110; +const CARET_PREC: u8 = 100; +const MUL_DIV_MOD_OP_PREC: u8 = 90; +const PLUS_MINUS_PREC: u8 = 80; +// there's no XOR operator in PostgreSQL, but support it here to avoid breaking tests +const XOR_PREC: u8 = 75; +const PG_OTHER_PREC: u8 = 70; +const BETWEEN_LIKE_PREC: u8 = 60; +const EQ_PREC: u8 = 50; +const IS_PREC: u8 = 40; +const NOT_PREC: u8 = 30; +const AND_PREC: u8 = 20; +const OR_PREC: u8 = 10; impl Dialect for PostgreSqlDialect { fn identifier_quote_style(&self, _identifier: &str) -> Option { @@ -75,14 +89,10 @@ impl Dialect for PostgreSqlDialect { let token = parser.peek_token(); debug!("get_next_precedence() {:?}", token); - macro_rules! 
p { - ($precedence:ident) => {self.precedence_numeric(Precedence::$precedence)}; - } - let precedence = match token.token { - Token::Word(w) if w.keyword == Keyword::OR => p!(Or), - Token::Word(w) if w.keyword == Keyword::XOR => p!(Xor), - Token::Word(w) if w.keyword == Keyword::AND => p!(And), + Token::Word(w) if w.keyword == Keyword::OR => OR_PREC, + Token::Word(w) if w.keyword == Keyword::XOR => XOR_PREC, + Token::Word(w) if w.keyword == Keyword::AND => AND_PREC, Token::Word(w) if w.keyword == Keyword::AT => { match ( parser.peek_nth_token(1).token, @@ -91,9 +101,9 @@ impl Dialect for PostgreSqlDialect { (Token::Word(w), Token::Word(w2)) if w.keyword == Keyword::TIME && w2.keyword == Keyword::ZONE => { - p!(AtTz) + AT_TZ_PREC } - _ => p!(Unknown), + _ => self.prec_unknown(), } } @@ -103,25 +113,25 @@ impl Dialect for PostgreSqlDialect { // it takes on the precedence of those tokens. Otherwise, it // is not an infix operator, and therefore has zero // precedence. - Token::Word(w) if w.keyword == Keyword::IN => p!(Between), - Token::Word(w) if w.keyword == Keyword::BETWEEN => p!(Between), - Token::Word(w) if w.keyword == Keyword::LIKE => p!(Between), - Token::Word(w) if w.keyword == Keyword::ILIKE => p!(Between), - Token::Word(w) if w.keyword == Keyword::RLIKE => p!(Between), - Token::Word(w) if w.keyword == Keyword::REGEXP => p!(Between), - Token::Word(w) if w.keyword == Keyword::SIMILAR => p!(Between), - _ => p!(Unknown), + Token::Word(w) if w.keyword == Keyword::IN => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::BETWEEN => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::LIKE => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::ILIKE => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::RLIKE => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::REGEXP => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::SIMILAR => BETWEEN_LIKE_PREC, + _ => self.prec_unknown(), }, - Token::Word(w) if 
w.keyword == Keyword::IS => p!(Is), - Token::Word(w) if w.keyword == Keyword::IN => p!(Between), - Token::Word(w) if w.keyword == Keyword::BETWEEN => p!(Between), - Token::Word(w) if w.keyword == Keyword::LIKE => p!(Between), - Token::Word(w) if w.keyword == Keyword::ILIKE => p!(Between), - Token::Word(w) if w.keyword == Keyword::RLIKE => p!(Between), - Token::Word(w) if w.keyword == Keyword::REGEXP => p!(Between), - Token::Word(w) if w.keyword == Keyword::SIMILAR => p!(Between), - Token::Word(w) if w.keyword == Keyword::OPERATOR => p!(Between), - Token::Word(w) if w.keyword == Keyword::DIV => p!(MulDivModOp), + Token::Word(w) if w.keyword == Keyword::IS => IS_PREC, + Token::Word(w) if w.keyword == Keyword::IN => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::BETWEEN => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::LIKE => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::ILIKE => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::RLIKE => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::REGEXP => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::SIMILAR => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::OPERATOR => BETWEEN_LIKE_PREC, + Token::Word(w) if w.keyword == Keyword::DIV => MUL_DIV_MOD_OP_PREC, Token::Word(w) if w.keyword == Keyword::COLLATE => COLLATE_PREC, Token::Eq | Token::Lt @@ -138,13 +148,11 @@ impl Dialect for PostgreSqlDialect { | Token::DoubleTildeAsterisk | Token::ExclamationMarkDoubleTilde | Token::ExclamationMarkDoubleTildeAsterisk - | Token::Spaceship => p!(Eq), - Token::Pipe => p!(Pipe), - Token::Caret => p!(Caret), - Token::Ampersand => p!(Ampersand), - Token::Plus | Token::Minus => p!(PlusMinus), - Token::Mul | Token::Div | Token::Mod => p!(MulDivModOp), - Token::DoubleColon => p!(DoubleColon), + | Token::Spaceship => EQ_PREC, + Token::Caret => CARET_PREC, + Token::Plus | Token::Minus => PLUS_MINUS_PREC, + Token::Mul | Token::Div | Token::Mod => 
MUL_DIV_MOD_OP_PREC, + Token::DoubleColon => DOUBLE_COLON_PREC, Token::LBracket => BRACKET_PREC, Token::Arrow | Token::LongArrow @@ -165,8 +173,10 @@ impl Dialect for PostgreSqlDialect { | Token::Sharp | Token::ShiftRight | Token::ShiftLeft - | Token::CustomBinaryOperator(_) => p!(PgOther), - _ => p!(Unknown), + | Token::Pipe + | Token::Ampersand + | Token::CustomBinaryOperator(_) => PG_OTHER_PREC, + _ => self.prec_unknown(), }; Some(Ok(precedence)) } @@ -187,42 +197,24 @@ impl Dialect for PostgreSqlDialect { true } - /* - const DOUBLE_COLON_PREC: u8 = 140; - const BRACKET_PREC: u8 = 130; - const COLLATE_PREC: u8 = 120; - const AT_TZ_PREC: u8 = 110; - const CARET_PREC: u8 = 100; - const MUL_DIV_MOD_OP_PREC: u8 = 90; - const PLUS_MINUS_PREC: u8 = 80; - const PG_OTHER_PREC: u8 = 70; - const BETWEEN_LIKE_PREC: u8 = 60; - const EQ_PREC: u8 = 50; - const IS_PREC: u8 = 40; - const NOT_PREC: u8 = 30; - const AND_PREC: u8 = 20; - const OR_PREC: u8 = 10; - const UNKNOWN_PREC: u8 = 0; - */ - /// based on https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-PRECEDENCE - fn precedence_numeric(&self, p: Precedence) -> u8 { - match p { - Precedence::DoubleColon => 140, - Precedence::AtTz => 110, - Precedence::MulDivModOp => 90, - Precedence::PlusMinus => 80, - Precedence::Caret => 110, - Precedence::Between => 60, - Precedence::Eq => 50, - Precedence::Like => 60, - Precedence::Is => 40, - Precedence::PgOther | Precedence::Pipe | Precedence::Ampersand => 70, - Precedence::UnaryNot => 30, - Precedence::And => 20, - Precedence::Xor => 79, - Precedence::Or => 10, - Precedence::Unknown => 0, - } + fn prec_mul_div_mod_op(&self) -> u8 { + MUL_DIV_MOD_OP_PREC + } + + fn prec_plus_minus(&self) -> u8 { + PLUS_MINUS_PREC + } + + fn prec_between(&self) -> u8 { + BETWEEN_LIKE_PREC + } + + fn prec_like(&self) -> u8 { + BETWEEN_LIKE_PREC + } + + fn prec_unary_not(&self) -> u8 { + NOT_PREC } } diff --git a/src/dialect/snowflake.rs b/src/dialect/snowflake.rs index 
9f1d7f27b..fe35d8da3 100644 --- a/src/dialect/snowflake.rs +++ b/src/dialect/snowflake.rs @@ -145,6 +145,15 @@ impl Dialect for SnowflakeDialect { None } + + fn get_next_precedence(&self, parser: &Parser) -> Option> { + let token = parser.peek_token(); + // Snowflake supports the `:` cast operator unlike other dialects + match token.token { + Token::Colon => Some(Ok(self.prec_double_colon())), + _ => None, + } + } } /// Parse snowflake create table statement. diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 3b9e8692c..5d1f1f575 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -871,7 +871,7 @@ impl<'a> Parser<'a> { /// Parse a new expression. pub fn parse_expr(&mut self) -> Result { let _guard = self.recursion_counter.try_decrease()?; - self.parse_subexpr(0) + self.parse_subexpr(self.dialect.prec_unknown()) } /// Parse tokens until the precedence changes. @@ -893,7 +893,7 @@ impl<'a> Parser<'a> { } pub fn parse_interval_expr(&mut self) -> Result { - let precedence = 0; + let precedence = self.dialect.prec_unknown(); let mut expr = self.parse_prefix()?; loop { @@ -914,9 +914,9 @@ impl<'a> Parser<'a> { let token = self.peek_token(); match token.token { - Token::Word(w) if w.keyword == Keyword::AND => Ok(0), - Token::Word(w) if w.keyword == Keyword::OR => Ok(0), - Token::Word(w) if w.keyword == Keyword::XOR => Ok(0), + Token::Word(w) if w.keyword == Keyword::AND => Ok(self.dialect.prec_unknown()), + Token::Word(w) if w.keyword == Keyword::OR => Ok(self.dialect.prec_unknown()), + Token::Word(w) if w.keyword == Keyword::XOR => Ok(self.dialect.prec_unknown()), _ => self.get_next_precedence(), } } @@ -1075,7 +1075,7 @@ impl<'a> Parser<'a> { self.parse_bigquery_struct_literal() } Keyword::PRIOR if matches!(self.state, ParserState::ConnectBy) => { - let expr = self.parse_subexpr(self.prec(Precedence::PlusMinus))?; + let expr = self.parse_subexpr(self.dialect.prec_plus_minus())?; Ok(Expr::Prior(Box::new(expr))) } Keyword::MAP if self.peek_token() == 
Token::LBrace && self.dialect.support_map_literal_syntax() => { @@ -1163,7 +1163,7 @@ impl<'a> Parser<'a> { }; Ok(Expr::UnaryOp { op, - expr: Box::new(self.parse_subexpr(self.prec(Precedence::MulDivModOp))?), + expr: Box::new(self.parse_subexpr(self.dialect.prec_mul_div_mod_op())?), }) } tok @ Token::DoubleExclamationMark @@ -1183,7 +1183,7 @@ impl<'a> Parser<'a> { }; Ok(Expr::UnaryOp { op, - expr: Box::new(self.parse_subexpr(self.prec(Precedence::PlusMinus))?), + expr: Box::new(self.parse_subexpr(self.dialect.prec_plus_minus())?), }) } Token::EscapedStringLiteral(_) if dialect_of!(self is PostgreSqlDialect | GenericDialect) => @@ -1712,7 +1712,7 @@ impl<'a> Parser<'a> { } pub fn parse_position_expr(&mut self, ident: Ident) -> Result { - let between_prec = self.prec(Precedence::Between); + let between_prec = self.dialect.prec_between(); let position_expr = self.maybe_parse(|p| { // PARSE SELECT POSITION('@' in field) p.expect_token(&Token::LParen)?; @@ -1968,12 +1968,12 @@ impl<'a> Parser<'a> { } _ => Ok(Expr::UnaryOp { op: UnaryOperator::Not, - expr: Box::new(self.parse_subexpr(self.prec(Precedence::UnaryNot))?), + expr: Box::new(self.parse_subexpr(self.dialect.prec_unary_not())?), }), }, _ => Ok(Expr::UnaryOp { op: UnaryOperator::Not, - expr: Box::new(self.parse_subexpr(self.prec(Precedence::UnaryNot))?), + expr: Box::new(self.parse_subexpr(self.dialect.prec_unary_not())?), }), } } @@ -2649,7 +2649,7 @@ impl<'a> Parser<'a> { Ok(Expr::RLike { negated, expr: Box::new(expr), - pattern: Box::new(self.parse_subexpr(self.prec(Precedence::Like))?), + pattern: Box::new(self.parse_subexpr(self.dialect.prec_like())?), regexp, }) } else if self.parse_keyword(Keyword::IN) { @@ -2660,21 +2660,21 @@ impl<'a> Parser<'a> { Ok(Expr::Like { negated, expr: Box::new(expr), - pattern: Box::new(self.parse_subexpr(self.prec(Precedence::Like))?), + pattern: Box::new(self.parse_subexpr(self.dialect.prec_like())?), escape_char: self.parse_escape_char()?, }) } else if 
self.parse_keyword(Keyword::ILIKE) { Ok(Expr::ILike { negated, expr: Box::new(expr), - pattern: Box::new(self.parse_subexpr(self.prec(Precedence::Like))?), + pattern: Box::new(self.parse_subexpr(self.dialect.prec_like())?), escape_char: self.parse_escape_char()?, }) } else if self.parse_keywords(&[Keyword::SIMILAR, Keyword::TO]) { Ok(Expr::SimilarTo { negated, expr: Box::new(expr), - pattern: Box::new(self.parse_subexpr(self.prec(Precedence::Like))?), + pattern: Box::new(self.parse_subexpr(self.dialect.prec_like())?), escape_char: self.parse_escape_char()?, }) } else { @@ -2949,9 +2949,9 @@ impl<'a> Parser<'a> { pub fn parse_between(&mut self, expr: Expr, negated: bool) -> Result { // Stop parsing subexpressions for and on tokens with // precedence lower than that of `BETWEEN`, such as `AND`, `IS`, etc. - let low = self.parse_subexpr(self.prec(Precedence::Between))?; + let low = self.parse_subexpr(self.dialect.prec_between())?; self.expect_keyword(Keyword::AND)?; - let high = self.parse_subexpr(self.prec(Precedence::Between))?; + let high = self.parse_subexpr(self.dialect.prec_between())?; Ok(Expr::Between { expr: Box::new(expr), negated, @@ -2972,108 +2972,7 @@ impl<'a> Parser<'a> { /// Get the precedence of the next token pub fn get_next_precedence(&self) -> Result { - // allow the dialect to override precedence logic - if let Some(precedence) = self.dialect.get_next_precedence(self) { - return precedence; - } - - macro_rules! 
p { - ($precedence:ident) => {self.prec(Precedence::$precedence)}; - } - - let token = self.peek_token(); - debug!("get_next_precedence() {:?}", token); - match token.token { - Token::Word(w) if w.keyword == Keyword::OR => Ok(p!(Or)), - Token::Word(w) if w.keyword == Keyword::AND => Ok(p!(And)), - Token::Word(w) if w.keyword == Keyword::XOR => Ok(p!(Xor)), - - Token::Word(w) if w.keyword == Keyword::AT => { - match (self.peek_nth_token(1).token, self.peek_nth_token(2).token) { - (Token::Word(w), Token::Word(w2)) - if w.keyword == Keyword::TIME && w2.keyword == Keyword::ZONE => - { - Ok(p!(AtTz)) - } - _ => Ok(p!(Unknown)), - } - } - - Token::Word(w) if w.keyword == Keyword::NOT => match self.peek_nth_token(1).token { - // The precedence of NOT varies depending on keyword that - // follows it. If it is followed by IN, BETWEEN, or LIKE, - // it takes on the precedence of those tokens. Otherwise, it - // is not an infix operator, and therefore has zero - // precedence. - Token::Word(w) if w.keyword == Keyword::IN => Ok(p!(Between)), - Token::Word(w) if w.keyword == Keyword::BETWEEN => Ok(p!(Between)), - Token::Word(w) if w.keyword == Keyword::LIKE => Ok(p!(Like)), - Token::Word(w) if w.keyword == Keyword::ILIKE => Ok(p!(Like)), - Token::Word(w) if w.keyword == Keyword::RLIKE => Ok(p!(Like)), - Token::Word(w) if w.keyword == Keyword::REGEXP => Ok(p!(Like)), - Token::Word(w) if w.keyword == Keyword::SIMILAR => Ok(p!(Like)), - _ => Ok(p!(Unknown)), - }, - Token::Word(w) if w.keyword == Keyword::IS => Ok(p!(Is)), - Token::Word(w) if w.keyword == Keyword::IN => Ok(p!(Between)), - Token::Word(w) if w.keyword == Keyword::BETWEEN => Ok(p!(Between)), - Token::Word(w) if w.keyword == Keyword::LIKE => Ok(p!(Like)), - Token::Word(w) if w.keyword == Keyword::ILIKE => Ok(p!(Like)), - Token::Word(w) if w.keyword == Keyword::RLIKE => Ok(p!(Like)), - Token::Word(w) if w.keyword == Keyword::REGEXP => Ok(p!(Like)), - Token::Word(w) if w.keyword == Keyword::SIMILAR => Ok(p!(Like)), - 
Token::Word(w) if w.keyword == Keyword::OPERATOR => Ok(p!(Between)), - Token::Word(w) if w.keyword == Keyword::DIV => Ok(p!(MulDivModOp)), - Token::Eq - | Token::Lt - | Token::LtEq - | Token::Neq - | Token::Gt - | Token::GtEq - | Token::DoubleEq - | Token::Tilde - | Token::TildeAsterisk - | Token::ExclamationMarkTilde - | Token::ExclamationMarkTildeAsterisk - | Token::DoubleTilde - | Token::DoubleTildeAsterisk - | Token::ExclamationMarkDoubleTilde - | Token::ExclamationMarkDoubleTildeAsterisk - | Token::Spaceship => Ok(p!(Eq)), - Token::Pipe => Ok(p!(Pipe)), - Token::Caret | Token::Sharp | Token::ShiftRight | Token::ShiftLeft => { - Ok(p!(Caret)) - } - Token::Ampersand => Ok(p!(Ampersand)), - Token::Plus | Token::Minus => Ok(p!(PlusMinus)), - Token::Mul | Token::Div | Token::DuckIntDiv | Token::Mod | Token::StringConcat => { - Ok(p!(MulDivModOp)) - } - Token::DoubleColon - | Token::ExclamationMark - | Token::LBracket - | Token::Overlap - | Token::CaretAt => Ok(p!(DoubleColon)), - Token::Colon if dialect_of!(self is SnowflakeDialect) => Ok(p!(DoubleColon)), - Token::Arrow - | Token::LongArrow - | Token::HashArrow - | Token::HashLongArrow - | Token::AtArrow - | Token::ArrowAt - | Token::HashMinus - | Token::AtQuestion - | Token::AtAt - | Token::Question - | Token::QuestionAnd - | Token::QuestionPipe - | Token::CustomBinaryOperator(_) => Ok(p!(PgOther)), - _ => Ok(p!(Unknown)), - } - } - - fn prec(&self, p: Precedence) -> u8 { - self.dialect.precedence_numeric(p) + self.dialect.get_next_precedence_full(self) } /// Return the first non-whitespace token that has not yet been processed @@ -8040,7 +7939,7 @@ impl<'a> Parser<'a> { format_clause: None, }) } else { - let body = self.parse_boxed_query_body(0)?; + let body = self.parse_boxed_query_body(self.dialect.prec_unknown())?; let order_by = if self.parse_keywords(&[Keyword::ORDER, Keyword::BY]) { let order_by_exprs = self.parse_comma_separated(Parser::parse_order_by_expr)?; @@ -11392,62 +11291,6 @@ impl<'a> Parser<'a> { 
} } - -/// Use to define the lexical Precedence of operators. -/// -/// Numeric values of enum members are used to define the default precedence of the operators. -/// -/// Uses (APPROXIMATELY) as a reference -/// higher number = higher precedence -/// -/// NOTE: The pg documentation is incomplete, e.g. the AT TIME ZONE operator -/// actually has higher precedence than addition. -/// See . -#[derive(Debug, Clone, Copy)] -#[repr(u8)] -pub enum Precedence { - DoubleColon, - AtTz, - MulDivModOp, - PlusMinus, - Xor, - Ampersand, - Caret, - Pipe, - Between, - Eq, - Like, - Is, - PgOther, - UnaryNot, - And, - Or, - Unknown, -} - -impl Precedence { - pub fn numeric(&self) -> u8 { - match self { - Precedence::DoubleColon => 50, - Precedence::AtTz => 41, - Precedence::MulDivModOp => 40, - Precedence::PlusMinus => 30, - Precedence::Xor => 24, - Precedence::Ampersand => 23, - Precedence::Caret => 22, - Precedence::Pipe => 21, - Precedence::Between | Precedence::Eq => 20, - Precedence::Like => 19, - Precedence::Is => 17, - Precedence::PgOther => 16, - Precedence::UnaryNot => 15, - Precedence::And => 10, - Precedence::Or => 5, - Precedence::Unknown => 0, - } - } -} - impl Word { pub fn to_ident(&self) -> Ident { Ident { diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs index 7406bdd74..150f06913 100644 --- a/tests/sqlparser_postgres.rs +++ b/tests/sqlparser_postgres.rs @@ -4476,3 +4476,115 @@ fn test_unicode_string_literal() { } } } + +fn check_arrow_precedence(sql: &str, arrow_operator: BinaryOperator) { + assert_eq!( + pg().verified_stmt(sql), + Statement::Query(Box::new(Query { + with: None, + body: Box::new(SetExpr::Select(Box::new(Select { + distinct: None, + top: None, + projection: vec![SelectItem::UnnamedExpr(Expr::BinaryOp { + left: Box::new(Expr::BinaryOp { + left: Box::new(Expr::Identifier(Ident { + value: "foo".to_string(), + quote_style: None, + })), + op: arrow_operator, + right: 
Box::new(Expr::Value(Value::SingleQuotedString("bar".to_string()))), + }), + op: BinaryOperator::Eq, + right: Box::new(Expr::Value(Value::SingleQuotedString("spam".to_string()))), + })], + into: None, + from: vec![], + lateral_views: vec![], + prewhere: None, + selection: None, + group_by: GroupByExpr::Expressions(vec![], vec![]), + cluster_by: vec![], + distribute_by: vec![], + sort_by: vec![], + having: None, + named_window: vec![], + qualify: None, + window_before_qualify: false, + value_table_mode: None, + connect_by: None, + }))), + order_by: None, + limit: None, + limit_by: vec![], + offset: None, + fetch: None, + locks: vec![], + for_clause: None, + settings: None, + format_clause: None, + })) + ) +} + +#[test] +fn arrow_precedence() { + check_arrow_precedence("SELECT foo -> 'bar' = 'spam'", BinaryOperator::Arrow); +} + +#[test] +fn long_arrow_precedence() { + check_arrow_precedence("SELECT foo ->> 'bar' = 'spam'", BinaryOperator::LongArrow); +} + +#[test] +fn arrow_cast_precedence() { + // check this matches postgres where you would need `(foo -> 'bar')::TEXT` + let stmt = pg().verified_stmt("SELECT foo -> 'bar'::TEXT"); + assert_eq!( + stmt, + Statement::Query(Box::new(Query { + with: None, + body: Box::new(SetExpr::Select(Box::new(Select { + distinct: None, + top: None, + projection: vec![SelectItem::UnnamedExpr(Expr::BinaryOp { + left: Box::new(Expr::Identifier(Ident { + value: "foo".to_string(), + quote_style: None, + })), + op: BinaryOperator::Arrow, + right: Box::new(Expr::Cast { + kind: CastKind::DoubleColon, + expr: Box::new(Expr::Value(Value::SingleQuotedString("bar".to_string()))), + data_type: DataType::Text, + format: None, + }), + })], + into: None, + from: vec![], + lateral_views: vec![], + prewhere: None, + selection: None, + group_by: GroupByExpr::Expressions(vec![], vec![]), + cluster_by: vec![], + distribute_by: vec![], + sort_by: vec![], + having: None, + named_window: vec![], + qualify: None, + window_before_qualify: false, + 
value_table_mode: None, + connect_by: None, + }))), + order_by: None, + limit: None, + limit_by: vec![], + offset: None, + fetch: None, + locks: vec![], + for_clause: None, + settings: None, + format_clause: None, + })) + ) +} From f485bca236cf75a17de89ddafb061b704c224904 Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Thu, 1 Aug 2024 16:50:37 +0100 Subject: [PATCH 4/5] cleanup --- src/dialect/mod.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index d47770857..c19213223 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -24,12 +24,13 @@ mod redshift; mod snowflake; mod sqlite; -use crate::ast::{Expr, Statement}; use core::any::{Any, TypeId}; use core::fmt::Debug; use core::iter::Peekable; use core::str::Chars; +use log::debug; + pub use self::ansi::AnsiDialect; pub use self::bigquery::BigQueryDialect; pub use self::clickhouse::ClickHouseDialect; @@ -44,13 +45,13 @@ pub use self::redshift::RedshiftSqlDialect; pub use self::snowflake::SnowflakeDialect; pub use self::sqlite::SQLiteDialect; pub use crate::keywords; +use crate::ast::{Expr, Statement}; use crate::parser::{Parser, ParserError}; - use crate::keywords::Keyword; use crate::tokenizer::Token; + #[cfg(not(feature = "std"))] use alloc::boxed::Box; -use log::debug; /// Convenience check if a [`Parser`] uses a certain dialect. /// @@ -304,7 +305,9 @@ pub trait Dialect: Debug + Any { None } - /// Get the precedence of the next token + /// Get the precedence of the next token. This "full" method means all precedence logic can remain + /// in the dialect, while still allowing the `get_next_precedence` method to be overridden, with the option to + /// fall back to the default behavior.
/// /// Higher number => higher precedence fn get_next_precedence_full(&self, parser: &Parser) -> Result { From 96f1245ed797f3bdc8d568ebead107c31c4dd13f Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Thu, 1 Aug 2024 22:20:32 +0100 Subject: [PATCH 5/5] fmt --- src/dialect/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index c19213223..fc45545d4 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -44,10 +44,10 @@ pub use self::postgresql::PostgreSqlDialect; pub use self::redshift::RedshiftSqlDialect; pub use self::snowflake::SnowflakeDialect; pub use self::sqlite::SQLiteDialect; -pub use crate::keywords; use crate::ast::{Expr, Statement}; -use crate::parser::{Parser, ParserError}; +pub use crate::keywords; use crate::keywords::Keyword; +use crate::parser::{Parser, ParserError}; use crate::tokenizer::Token; #[cfg(not(feature = "std"))]