From 3f295b8be9d955047b71e1d361f184d9bb6a4de6 Mon Sep 17 00:00:00 2001 From: "aleksei.p" Date: Tue, 3 Dec 2024 12:21:58 +0100 Subject: [PATCH 01/12] Redshift: Fix parsing for quoted numbered columns --- src/dialect/redshift.rs | 10 +++++++++- tests/sqlparser_redshift.rs | 12 ++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/dialect/redshift.rs b/src/dialect/redshift.rs index 48eb00ab1..cea3c1bb4 100644 --- a/src/dialect/redshift.rs +++ b/src/dialect/redshift.rs @@ -41,10 +41,18 @@ impl Dialect for RedshiftSqlDialect { /// treating them as json path. If there is identifier then we assume /// there is no json path. fn is_proper_identifier_inside_quotes(&self, mut chars: Peekable>) -> bool { + // PartiQL uses square bracket as a start character and a quote is a beginning of quoted identifier + if let Some(quote_start) = chars.peek() { + if *quote_start == '"' { + return true; + } + }; chars.next(); let mut not_white_chars = chars.skip_while(|ch| ch.is_whitespace()).peekable(); if let Some(&ch) = not_white_chars.peek() { - return self.is_identifier_start(ch); + // PartiQL uses single quote as starting identification inside a quote + // It is a normal identifier if it has no single quote at the beginning + return ch != '\'' && self.is_identifier_start(ch); } false } diff --git a/tests/sqlparser_redshift.rs b/tests/sqlparser_redshift.rs index f0c1f0c74..ad32a01ef 100644 --- a/tests/sqlparser_redshift.rs +++ b/tests/sqlparser_redshift.rs @@ -353,3 +353,15 @@ fn test_parse_json_path_from() { _ => panic!(), } } + +#[test] +fn test_parse_select_numbered_columns() { + redshift_and_generic().verified_stmt(r#"SELECT 1 AS "1" FROM a"#); +} + +#[test] +fn test_parse_create_numbered_columns() { + redshift_and_generic().verified_stmt( + r#"CREATE TABLE test_table_1 ("1" INT, "d" VARCHAR(155), "2" DOUBLE PRECISION)"#, + ); +} From 00455a81f48f39614f230f95226da4c592d53072 Mon Sep 17 00:00:00 2001 From: "aleksei.p" Date: Tue, 3 Dec 2024 13:00:41 +0100 
Subject: [PATCH 02/12] Update --- src/dialect/redshift.rs | 7 ++++--- tests/sqlparser_redshift.rs | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/dialect/redshift.rs b/src/dialect/redshift.rs index cea3c1bb4..cc22a7198 100644 --- a/src/dialect/redshift.rs +++ b/src/dialect/redshift.rs @@ -41,7 +41,7 @@ impl Dialect for RedshiftSqlDialect { /// treating them as json path. If there is identifier then we assume /// there is no json path. fn is_proper_identifier_inside_quotes(&self, mut chars: Peekable>) -> bool { - // PartiQL uses square bracket as a start character and a quote is a beginning of quoted identifier + // PartiQL (used as json path query language in Redshift) uses square bracket as a start character and a quote is a beginning of quoted identifier if let Some(quote_start) = chars.peek() { if *quote_start == '"' { return true; @@ -51,8 +51,9 @@ impl Dialect for RedshiftSqlDialect { let mut not_white_chars = chars.skip_while(|ch| ch.is_whitespace()).peekable(); if let Some(&ch) = not_white_chars.peek() { // PartiQL uses single quote as starting identification inside a quote - // It is a normal identifier if it has no single quote at the beginning - return ch != '\'' && self.is_identifier_start(ch); + // It is a normal identifier if it has no single quote at the beginning. + // Additionally square bracket can contain quoted identifier. 
+ return ch == '"' || ch != '\'' && self.is_identifier_start(ch); } false } diff --git a/tests/sqlparser_redshift.rs b/tests/sqlparser_redshift.rs index ad32a01ef..63cf0b13b 100644 --- a/tests/sqlparser_redshift.rs +++ b/tests/sqlparser_redshift.rs @@ -279,6 +279,31 @@ fn test_redshift_json_path() { }, expr_from_projection(only(&select.projection)) ); + + let sql = r#"SELECT db1.sc1.tbl1.col1[0]."id" FROM customer_orders_lineitem"#; + let select = dialects.verified_only_select(sql); + assert_eq!( + &Expr::JsonAccess { + value: Box::new(Expr::CompoundIdentifier(vec![ + Ident::new("db1"), + Ident::new("sc1"), + Ident::new("tbl1"), + Ident::new("col1") + ])), + path: JsonPath { + path: vec![ + JsonPathElem::Bracket { + key: Expr::Value(Value::Number("0".parse().unwrap(), false)) + }, + JsonPathElem::Dot { + key: "id".to_string(), + quoted: true, + } + ] + } + }, + expr_from_projection(only(&select.projection)) + ); } #[test] @@ -357,6 +382,8 @@ fn test_parse_json_path_from() { #[test] fn test_parse_select_numbered_columns() { redshift_and_generic().verified_stmt(r#"SELECT 1 AS "1" FROM a"#); + // RedShift specific case - quoted identifier inside square bracket + redshift().verified_stmt(r#"SELECT 1 AS ["1"] FROM a"#); } #[test] From 5e55dfe8d2f69e0aa3e2066a26490d28b4a44cd7 Mon Sep 17 00:00:00 2001 From: "aleksei.p" Date: Tue, 3 Dec 2024 15:07:01 +0100 Subject: [PATCH 03/12] update --- src/dialect/redshift.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dialect/redshift.rs b/src/dialect/redshift.rs index cc22a7198..c5244956c 100644 --- a/src/dialect/redshift.rs +++ b/src/dialect/redshift.rs @@ -53,7 +53,7 @@ impl Dialect for RedshiftSqlDialect { // PartiQL uses single quote as starting identification inside a quote // It is a normal identifier if it has no single quote at the beginning. // Additionally square bracket can contain quoted identifier. 
- return ch == '"' || ch != '\'' && self.is_identifier_start(ch); + return ch == '"' || self.is_identifier_start(ch); } false } From 9d5d71b5c83b8a58d06107ec38c53b2a272d85a1 Mon Sep 17 00:00:00 2001 From: "aleksei.p" Date: Tue, 3 Dec 2024 17:10:03 +0100 Subject: [PATCH 04/12] update --- src/dialect/redshift.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/dialect/redshift.rs b/src/dialect/redshift.rs index c5244956c..764e043a6 100644 --- a/src/dialect/redshift.rs +++ b/src/dialect/redshift.rs @@ -41,7 +41,11 @@ impl Dialect for RedshiftSqlDialect { /// treating them as json path. If there is identifier then we assume /// there is no json path. fn is_proper_identifier_inside_quotes(&self, mut chars: Peekable>) -> bool { - // PartiQL (used as json path query language in Redshift) uses square bracket as a start character and a quote is a beginning of quoted identifier + // PartiQL (used as json path query language in Redshift) uses square bracket as + // a start character and a quote is a beginning of quoted identifier. + // Skipping analyzing token such as `"a"` and analyze only token that + // can be part of json path potentially. + // For ex., `[0]`, `['a']` (seems part of json path) or `["a"]` (normal quoted identifier) if let Some(quote_start) = chars.peek() { if *quote_start == '"' { return true; @@ -52,7 +56,8 @@ impl Dialect for RedshiftSqlDialect { if let Some(&ch) = not_white_chars.peek() { // PartiQL uses single quote as starting identification inside a quote // It is a normal identifier if it has no single quote at the beginning. - // Additionally square bracket can contain quoted identifier. + // Square bracket can contain quoted identifier. + // For ex., `["a"]`, but this is not a part of json path, and it is a normal quoted identifier. 
return ch == '"' || self.is_identifier_start(ch); } false From 77f2b2cd036d4c8572f4042ec53dfbc66a161d64 Mon Sep 17 00:00:00 2001 From: "aleksei.p" Date: Thu, 5 Dec 2024 01:42:16 +0100 Subject: [PATCH 05/12] update --- src/dialect/mod.rs | 9 ++++++ src/dialect/redshift.rs | 22 +++++++++++++- src/tokenizer.rs | 58 ++++++++++++++++++++++++++++++------- tests/sqlparser_redshift.rs | 3 ++ 4 files changed, 80 insertions(+), 12 deletions(-) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index a8993e685..a43cbbc51 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -138,6 +138,15 @@ pub trait Dialect: Debug + Any { true } + /// Determine if nested quote start is presented and return it + fn nested_quote_start( + &self, + _quote_start: char, + mut _chars: Peekable>, + ) -> Option { + None + } + /// Determine if a character is a valid start character for an unquoted identifier fn is_identifier_start(&self, ch: char) -> bool; diff --git a/src/dialect/redshift.rs b/src/dialect/redshift.rs index 764e043a6..8dc83bc69 100644 --- a/src/dialect/redshift.rs +++ b/src/dialect/redshift.rs @@ -45,7 +45,7 @@ impl Dialect for RedshiftSqlDialect { // a start character and a quote is a beginning of quoted identifier. // Skipping analyzing token such as `"a"` and analyze only token that // can be part of json path potentially. - // For ex., `[0]`, `['a']` (seems part of json path) or `["a"]` (normal quoted identifier) + // For ex., `[0]` (seems part of json path) or `["a"]` (normal quoted identifier) if let Some(quote_start) = chars.peek() { if *quote_start == '"' { return true; @@ -63,6 +63,26 @@ impl Dialect for RedshiftSqlDialect { false } + /// RedShift support nested quoted identifier like `["a"]`. + /// Determine if nested quote started and return it. 
+ fn nested_quote_start( + &self, + quote_start: char, + mut chars: Peekable>, + ) -> Option { + if quote_start != '[' { + return None; + } + + chars.next(); // skip opening quote start + + if chars.skip_while(|ch| ch.is_whitespace()).peekable().peek() == Some(&'"') { + Some('"') + } else { + None + } + } + fn is_identifier_start(&self, ch: char) -> bool { // Extends Postgres dialect with sharp PostgreSqlDialect {}.is_identifier_start(ch) || ch == '#' diff --git a/src/tokenizer.rs b/src/tokenizer.rs index aacfc16fa..a00db81be 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1081,19 +1081,34 @@ impl<'a> Tokenizer<'a> { .dialect .is_proper_identifier_inside_quotes(chars.peekable.clone()) => { - let error_loc = chars.location(); - chars.next(); // consume the opening quote - let quote_end = Word::matching_end_quote(quote_start); - let (s, last_char) = self.parse_quoted_ident(chars, quote_end); + let word = if let Some(nested_quote_start) = self + .dialect + .nested_quote_start(quote_start, chars.peekable.clone()) + { + chars.next(); // consume the opening quote + + let quote_end = Word::matching_end_quote(quote_start); + let error_loc = chars.location(); + + peeking_take_while(chars, |ch| ch.is_whitespace()); + let nested_word = + self.tokenize_quoted_identifier(nested_quote_start, chars)?; + peeking_take_while(chars, |ch| ch.is_whitespace()); + + if chars.peek() != Some("e_end) { + return self.tokenizer_error( + error_loc, + format!("Expected close delimiter '{quote_end}' before EOF."), + ); + } + + chars.next(); // consume the closing nested quote - if last_char == Some(quote_end) { - Ok(Some(Token::make_word(&s, Some(quote_start)))) + format!("{nested_quote_start}{nested_word}{nested_quote_start}") } else { - self.tokenizer_error( - error_loc, - format!("Expected close delimiter '{quote_end}' before EOF."), - ) - } + self.tokenize_quoted_identifier(quote_start, chars)? 
+ }; + Ok(Some(Token::make_word(&word, Some(quote_start)))) } // numbers and period '0'..='9' | '.' => { @@ -1597,6 +1612,27 @@ impl<'a> Tokenizer<'a> { s } + /// Tokenize an identifier or keyword, after the first char is already consumed. + fn tokenize_quoted_identifier( + &self, + quote_start: char, + chars: &mut State, + ) -> Result { + let error_loc = chars.location(); + chars.next(); // consume the opening quote + let quote_end = Word::matching_end_quote(quote_start); + let (s, last_char) = self.parse_quoted_ident(chars, quote_end); + + if last_char == Some(quote_end) { + Ok(s) + } else { + self.tokenizer_error( + error_loc, + format!("Expected close delimiter '{quote_end}' before EOF."), + ) + } + } + /// Read a single quoted string, starting with the opening quote. fn tokenize_escaped_single_quoted_string( &self, diff --git a/tests/sqlparser_redshift.rs b/tests/sqlparser_redshift.rs index 63cf0b13b..05090ac58 100644 --- a/tests/sqlparser_redshift.rs +++ b/tests/sqlparser_redshift.rs @@ -384,6 +384,9 @@ fn test_parse_select_numbered_columns() { redshift_and_generic().verified_stmt(r#"SELECT 1 AS "1" FROM a"#); // RedShift specific case - quoted identifier inside square bracket redshift().verified_stmt(r#"SELECT 1 AS ["1"] FROM a"#); + redshift().verified_stmt(r#"SELECT 1 AS ["[="] FROM a"#); + redshift().verified_stmt(r#"SELECT 1 AS ["=]"] FROM a"#); + redshift().verified_stmt(r#"SELECT 1 AS ["a[b]"] FROM a"#); } #[test] From 3868d24a9624640fd743cbd46f3095a0a2573d9b Mon Sep 17 00:00:00 2001 From: "aleksei.p" Date: Thu, 5 Dec 2024 01:49:24 +0100 Subject: [PATCH 06/12] update --- src/tokenizer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index a00db81be..59fefe531 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1612,7 +1612,7 @@ impl<'a> Tokenizer<'a> { s } - /// Tokenize an identifier or keyword, after the first char is already consumed. 
+ /// Read a quoted identifier fn tokenize_quoted_identifier( &self, quote_start: char, From 799ededf08e29c5678f35001abca77fe922c326f Mon Sep 17 00:00:00 2001 From: "aleksei.p" Date: Thu, 5 Dec 2024 20:56:44 +0100 Subject: [PATCH 07/12] update --- src/dialect/mod.rs | 18 +++++--------- src/dialect/redshift.rs | 55 ++++++++++++----------------------------- src/tokenizer.rs | 55 +++++++++++++++++++++-------------------- 3 files changed, 50 insertions(+), 78 deletions(-) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index a43cbbc51..437958885 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -133,17 +133,11 @@ pub trait Dialect: Debug + Any { None } - /// Determine if quoted characters are proper for identifier - fn is_proper_identifier_inside_quotes(&self, mut _chars: Peekable>) -> bool { - true - } - - /// Determine if nested quote start is presented and return it - fn nested_quote_start( + /// Determine if special way quoted characters are presented + fn special_delimited_identifier_start( &self, - _quote_start: char, mut _chars: Peekable>, - ) -> Option { + ) -> Option<(char, Option)> { None } @@ -865,11 +859,11 @@ mod tests { self.0.supports_string_literal_backslash_escape() } - fn is_proper_identifier_inside_quotes( + fn special_delimited_identifier_start( &self, chars: std::iter::Peekable>, - ) -> bool { - self.0.is_proper_identifier_inside_quotes(chars) + ) -> Option<(char, Option)> { + self.0.special_delimited_identifier_start(chars) } fn supports_filter_during_aggregation(&self) -> bool { diff --git a/src/dialect/redshift.rs b/src/dialect/redshift.rs index 8dc83bc69..27c1b0e1d 100644 --- a/src/dialect/redshift.rs +++ b/src/dialect/redshift.rs @@ -32,55 +32,32 @@ pub struct RedshiftSqlDialect {} // in the Postgres dialect, the query will be parsed as an array, while in the Redshift dialect it will // be a json path impl Dialect for RedshiftSqlDialect { - fn is_delimited_identifier_start(&self, ch: char) -> bool { - ch == '"' || ch == '[' - } 
- - /// Determine if quoted characters are proper for identifier + /// Determine if quoted characters are looks like special case of quotation begining with `[`. /// It's needed to distinguish treating square brackets as quotes from /// treating them as json path. If there is identifier then we assume /// there is no json path. - fn is_proper_identifier_inside_quotes(&self, mut chars: Peekable>) -> bool { - // PartiQL (used as json path query language in Redshift) uses square bracket as - // a start character and a quote is a beginning of quoted identifier. - // Skipping analyzing token such as `"a"` and analyze only token that - // can be part of json path potentially. - // For ex., `[0]` (seems part of json path) or `["a"]` (normal quoted identifier) - if let Some(quote_start) = chars.peek() { - if *quote_start == '"' { - return true; - } - }; - chars.next(); - let mut not_white_chars = chars.skip_while(|ch| ch.is_whitespace()).peekable(); - if let Some(&ch) = not_white_chars.peek() { - // PartiQL uses single quote as starting identification inside a quote - // It is a normal identifier if it has no single quote at the beginning. - // Square bracket can contain quoted identifier. - // For ex., `["a"]`, but this is not a part of json path, and it is a normal quoted identifier. - return ch == '"' || self.is_identifier_start(ch); - } - false - } - - /// RedShift support nested quoted identifier like `["a"]`. - /// Determine if nested quote started and return it. 
- fn nested_quote_start( + fn special_delimited_identifier_start( &self, - quote_start: char, mut chars: Peekable>, - ) -> Option { - if quote_start != '[' { + ) -> Option<(char, Option)> { + if chars.peek() != Some(&'[') { return None; } - chars.next(); // skip opening quote start + chars.next(); + + let mut not_white_chars = chars.skip_while(|ch| ch.is_whitespace()).peekable(); - if chars.skip_while(|ch| ch.is_whitespace()).peekable().peek() == Some(&'"') { - Some('"') - } else { - None + if let Some(&ch) = not_white_chars.peek() { + if ch == '"' { + return Some(('[', Some('"'))); + } + if self.is_identifier_start(ch) { + return Some(('[', None)); + } } + + None } fn is_identifier_start(&self, ch: char) -> bool { diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 59fefe531..762d62214 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1075,40 +1075,41 @@ impl<'a> Tokenizer<'a> { Ok(Some(Token::DoubleQuotedString(s))) } // delimited (quoted) identifier - quote_start - if self.dialect.is_delimited_identifier_start(ch) - && self - .dialect - .is_proper_identifier_inside_quotes(chars.peekable.clone()) => + quote_start if self.dialect.is_delimited_identifier_start(ch) => { + let word = self.tokenize_quoted_identifier(quote_start, chars)?; + Ok(Some(Token::make_word(&word, Some(quote_start)))) + } + // special (quoted) identifier + _ if self + .dialect + .special_delimited_identifier_start(chars.peekable.clone()) + .is_some() => { - let word = if let Some(nested_quote_start) = self + let (quote_start, nested_delimiter) = self .dialect - .nested_quote_start(quote_start, chars.peekable.clone()) - { - chars.next(); // consume the opening quote + .special_delimited_identifier_start(chars.peekable.clone()) + .unwrap(); - let quote_end = Word::matching_end_quote(quote_start); - let error_loc = chars.location(); + let mut word = vec![]; - peeking_take_while(chars, |ch| ch.is_whitespace()); - let nested_word = - self.tokenize_quoted_identifier(nested_quote_start, 
chars)?; - peeking_take_while(chars, |ch| ch.is_whitespace()); + let identifier_quote_start = if let Some(nested_delimiter) = nested_delimiter { + chars.next(); // skip the first delimiter + word.push(peeking_take_while(chars, |ch| ch.is_whitespace())); + word.push(format!("{nested_delimiter}")); + nested_delimiter + } else { + quote_start + }; - if chars.peek() != Some("e_end) { - return self.tokenizer_error( - error_loc, - format!("Expected close delimiter '{quote_end}' before EOF."), - ); - } + word.push(self.tokenize_quoted_identifier(identifier_quote_start, chars)?); - chars.next(); // consume the closing nested quote + if let Some(nested_delimiter) = nested_delimiter { + word.push(format!("{}", Word::matching_end_quote(nested_delimiter))); + word.push(peeking_take_while(chars, |ch| ch.is_whitespace())); + chars.next(); // skip close of first delimiter + } - format!("{nested_quote_start}{nested_word}{nested_quote_start}") - } else { - self.tokenize_quoted_identifier(quote_start, chars)? - }; - Ok(Some(Token::make_word(&word, Some(quote_start)))) + Ok(Some(Token::make_word(&word.concat(), Some(quote_start)))) } // numbers and period '0'..='9' | '.' 
=> { From d6fe579129c46a21dae46f8e1e869d46599f712c Mon Sep 17 00:00:00 2001 From: "aleksei.p" Date: Thu, 5 Dec 2024 21:17:02 +0100 Subject: [PATCH 08/12] update --- src/tokenizer.rs | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 762d62214..8429e84c3 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1085,29 +1085,40 @@ impl<'a> Tokenizer<'a> { .special_delimited_identifier_start(chars.peekable.clone()) .is_some() => { - let (quote_start, nested_delimiter) = self + let (quote_start, nested_quote_start) = self .dialect .special_delimited_identifier_start(chars.peekable.clone()) .unwrap(); - let mut word = vec![]; - - let identifier_quote_start = if let Some(nested_delimiter) = nested_delimiter { - chars.next(); // skip the first delimiter - word.push(peeking_take_while(chars, |ch| ch.is_whitespace())); - word.push(format!("{nested_delimiter}")); - nested_delimiter - } else { - quote_start + let Some(nested_quote_start) = nested_quote_start else { + let word = self.tokenize_quoted_identifier(quote_start, chars)?; + return Ok(Some(Token::make_word(&word, Some(quote_start)))); }; - word.push(self.tokenize_quoted_identifier(identifier_quote_start, chars)?); - - if let Some(nested_delimiter) = nested_delimiter { - word.push(format!("{}", Word::matching_end_quote(nested_delimiter))); - word.push(peeking_take_while(chars, |ch| ch.is_whitespace())); - chars.next(); // skip close of first delimiter + let mut word = vec![]; + let quote_end = Word::matching_end_quote(quote_start); + let nested_quote_end = Word::matching_end_quote(nested_quote_start); + let error_loc = chars.location(); + + chars.next(); // skip the first delimiter + word.push(peeking_take_while(chars, |ch| ch.is_whitespace())); + if chars.peek() != Some(&nested_quote_start) { + return self.tokenizer_error( + error_loc, + format!("Expected nested delimiter '{nested_quote_start}' before EOF."), + ); + 
} + word.push(format!("{nested_quote_start}")); + word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?); + word.push(format!("{nested_quote_end}")); + word.push(peeking_take_while(chars, |ch| ch.is_whitespace())); + if chars.peek() != Some("e_end) { + return self.tokenizer_error( + error_loc, + format!("Expected close delimiter '{quote_end}' before EOF."), + ); } + chars.next(); // skip close delimiter Ok(Some(Token::make_word(&word.concat(), Some(quote_start)))) } From 9aeaf1c4f3d984c49871f0e967c3d58190e33f6e Mon Sep 17 00:00:00 2001 From: "aleksei.p" Date: Thu, 5 Dec 2024 21:57:55 +0100 Subject: [PATCH 09/12] update --- src/dialect/mod.rs | 34 ++++++++++++++++++++++------------ src/dialect/redshift.rs | 6 +++++- src/tokenizer.rs | 14 +++++++++----- 3 files changed, 36 insertions(+), 18 deletions(-) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 437958885..106cdcb81 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -128,19 +128,25 @@ pub trait Dialect: Debug + Any { ch == '"' || ch == '`' } - /// Return the character used to quote identifiers. - fn identifier_quote_style(&self, _identifier: &str) -> Option { - None + /// Determine if a character starts a potential nested quoted identifier. + /// RedShift support old way of quotation with `[` and it can cover even nested quoted identifier. + fn is_nested_delimited_identifier_start(&self, _ch: char) -> bool { + false } - /// Determine if special way quoted characters are presented - fn special_delimited_identifier_start( + /// Determine if nested quoted characters are presented + fn nested_delimited_identifier( &self, mut _chars: Peekable>, ) -> Option<(char, Option)> { None } + /// Return the character used to quote identifiers. 
+ fn identifier_quote_style(&self, _identifier: &str) -> Option { + None + } + /// Determine if a character is a valid start character for an unquoted identifier fn is_identifier_start(&self, ch: char) -> bool; @@ -851,6 +857,17 @@ mod tests { self.0.is_delimited_identifier_start(ch) } + fn is_nested_delimited_identifier_start(&self, ch: char) -> bool { + self.0.is_nested_delimited_identifier_start(ch) + } + + fn nested_delimited_identifier( + &self, + chars: std::iter::Peekable>, + ) -> Option<(char, Option)> { + self.0.nested_delimited_identifier(chars) + } + fn identifier_quote_style(&self, identifier: &str) -> Option { self.0.identifier_quote_style(identifier) } @@ -859,13 +876,6 @@ mod tests { self.0.supports_string_literal_backslash_escape() } - fn special_delimited_identifier_start( - &self, - chars: std::iter::Peekable>, - ) -> Option<(char, Option)> { - self.0.special_delimited_identifier_start(chars) - } - fn supports_filter_during_aggregation(&self) -> bool { self.0.supports_filter_during_aggregation() } diff --git a/src/dialect/redshift.rs b/src/dialect/redshift.rs index 27c1b0e1d..2637c3f81 100644 --- a/src/dialect/redshift.rs +++ b/src/dialect/redshift.rs @@ -32,11 +32,15 @@ pub struct RedshiftSqlDialect {} // in the Postgres dialect, the query will be parsed as an array, while in the Redshift dialect it will // be a json path impl Dialect for RedshiftSqlDialect { + fn is_nested_delimited_identifier_start(&self, ch: char) -> bool { + ch == '[' + } + /// Determine if quoted characters are looks like special case of quotation begining with `[`. /// It's needed to distinguish treating square brackets as quotes from /// treating them as json path. If there is identifier then we assume /// there is no json path. 
- fn special_delimited_identifier_start( + fn nested_delimited_identifier( &self, mut chars: Peekable>, ) -> Option<(char, Option)> { diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 8429e84c3..cf1b138fe 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1080,14 +1080,18 @@ impl<'a> Tokenizer<'a> { Ok(Some(Token::make_word(&word, Some(quote_start)))) } // special (quoted) identifier - _ if self - .dialect - .special_delimited_identifier_start(chars.peekable.clone()) - .is_some() => + quote_start + if self + .dialect + .is_nested_delimited_identifier_start(quote_start) + && self + .dialect + .nested_delimited_identifier(chars.peekable.clone()) + .is_some() => { let (quote_start, nested_quote_start) = self .dialect - .special_delimited_identifier_start(chars.peekable.clone()) + .nested_delimited_identifier(chars.peekable.clone()) .unwrap(); let Some(nested_quote_start) = nested_quote_start else { From ab4d2de058dc481139bbc860a4985c82384ab691 Mon Sep 17 00:00:00 2001 From: "aleksei.p" Date: Fri, 6 Dec 2024 14:57:41 +0100 Subject: [PATCH 10/12] update --- src/dialect/mod.rs | 25 ++++++++++++++++++++----- src/dialect/redshift.rs | 24 +++++++++++++++++++----- src/tokenizer.rs | 19 ++++++++++++------- tests/sqlparser_redshift.rs | 33 +++++++++++++++++++-------------- 4 files changed, 70 insertions(+), 31 deletions(-) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 106cdcb81..78b013604 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -129,13 +129,28 @@ pub trait Dialect: Debug + Any { } /// Determine if a character starts a potential nested quoted identifier. - /// RedShift support old way of quotation with `[` and it can cover even nested quoted identifier. 
+ /// Example: RedShift supports the following quote styles to all mean the same thing: + /// ```sql + /// SELECT 1 AS foo; + /// SELECT 1 AS "foo"; + /// SELECT 1 AS [foo]; + /// SELECT 1 AS ["foo"]; + /// ``` fn is_nested_delimited_identifier_start(&self, _ch: char) -> bool { false } - /// Determine if nested quoted characters are presented - fn nested_delimited_identifier( + /// Only applicable whenever [`Self::is_nested_delimited_identifier_start`] returns true + /// If the next sequence of tokens potentially represent a nested identifier, then this method + /// returns a tuple containing the outer quote style, and if present, the inner (nested) quote style. + /// + /// Example (Redshift): + /// ```text + /// `["foo"]` => (Some(`[`), Some(`"`)) + /// `[foo]` => (Some(`[`), None) + /// `"foo"` => None + /// ``` + fn peek_nested_delimited_identifier_quotes( &self, mut _chars: Peekable>, ) -> Option<(char, Option)> { @@ -861,11 +876,11 @@ mod tests { self.0.is_nested_delimited_identifier_start(ch) } - fn nested_delimited_identifier( + fn peek_nested_delimited_identifier_quotes( &self, chars: std::iter::Peekable>, ) -> Option<(char, Option)> { - self.0.nested_delimited_identifier(chars) + self.0.peek_nested_delimited_identifier_quotes(chars) } fn identifier_quote_style(&self, identifier: &str) -> Option { diff --git a/src/dialect/redshift.rs b/src/dialect/redshift.rs index 2637c3f81..c3eac8a75 100644 --- a/src/dialect/redshift.rs +++ b/src/dialect/redshift.rs @@ -32,15 +32,29 @@ pub struct RedshiftSqlDialect {} // in the Postgres dialect, the query will be parsed as an array, while in the Redshift dialect it will // be a json path impl Dialect for RedshiftSqlDialect { + /// Determine if a character starts a potential nested quoted identifier. 
+ /// Example: RedShift supports the following quote styles to all mean the same thing: + /// ```sql + /// SELECT 1 AS foo; + /// SELECT 1 AS "foo"; + /// SELECT 1 AS [foo]; + /// SELECT 1 AS ["foo"]; + /// ``` fn is_nested_delimited_identifier_start(&self, ch: char) -> bool { ch == '[' } - /// Determine if quoted characters are looks like special case of quotation begining with `[`. - /// It's needed to distinguish treating square brackets as quotes from - /// treating them as json path. If there is identifier then we assume - /// there is no json path. - fn nested_delimited_identifier( + /// Only applicable whenever [`Self::is_nested_delimited_identifier_start`] returns true + /// If the next sequence of tokens potentially represent a nested identifier, then this method + /// returns a tuple containing the outer quote style, and if present, the inner (nested) quote style. + /// + /// Example (Redshift): + /// ```text + /// `["foo"]` => (Some(`[`), Some(`"`)) + /// `[foo]` => (Some(`[`), None) + /// `"foo"` => None + /// ``` + fn peek_nested_delimited_identifier_quotes( &self, mut chars: Peekable>, ) -> Option<(char, Option)> { diff --git a/src/tokenizer.rs b/src/tokenizer.rs index cf1b138fe..aef81cad3 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1079,20 +1079,25 @@ impl<'a> Tokenizer<'a> { let word = self.tokenize_quoted_identifier(quote_start, chars)?; Ok(Some(Token::make_word(&word, Some(quote_start)))) } - // special (quoted) identifier + // Potentially nested delimited (quoted) identifier quote_start if self .dialect .is_nested_delimited_identifier_start(quote_start) && self .dialect - .nested_delimited_identifier(chars.peekable.clone()) + .peek_nested_delimited_identifier_quotes(chars.peekable.clone()) .is_some() => { - let (quote_start, nested_quote_start) = self + let Some((quote_start, nested_quote_start)) = self .dialect - .nested_delimited_identifier(chars.peekable.clone()) - .unwrap(); + 
.peek_nested_delimited_identifier_quotes(chars.peekable.clone()) + else { + return self.tokenizer_error( + chars.location(), + format!("Expected nested delimiter '{quote_start}' before EOF."), + ); + }; let Some(nested_quote_start) = nested_quote_start else { let word = self.tokenize_quoted_identifier(quote_start, chars)?; @@ -1112,9 +1117,9 @@ impl<'a> Tokenizer<'a> { format!("Expected nested delimiter '{nested_quote_start}' before EOF."), ); } - word.push(format!("{nested_quote_start}")); + word.push(nested_quote_start.into()); word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?); - word.push(format!("{nested_quote_end}")); + word.push(nested_quote_end.into()); word.push(peeking_take_while(chars, |ch| ch.is_whitespace())); if chars.peek() != Some("e_end) { return self.tokenizer_error( diff --git a/tests/sqlparser_redshift.rs b/tests/sqlparser_redshift.rs index 05090ac58..dc435368a 100644 --- a/tests/sqlparser_redshift.rs +++ b/tests/sqlparser_redshift.rs @@ -176,6 +176,8 @@ fn parse_delimited_identifiers() { } redshift().verified_stmt(r#"CREATE TABLE "foo" ("bar" "int")"#); + // An alias starting with a number + redshift().verified_stmt(r#"CREATE TABLE "foo" ("1" INT)"#); redshift().verified_stmt(r#"ALTER TABLE foo ADD CONSTRAINT "bar" PRIMARY KEY (baz)"#); //TODO verified_stmt(r#"UPDATE foo SET "bar" = 5"#); } @@ -222,7 +224,7 @@ fn test_redshift_json_path() { path: JsonPath { path: vec![ JsonPathElem::Bracket { - key: Expr::Value(Value::Number("0".parse().unwrap(), false)) + key: Expr::Value(number("0")) }, JsonPathElem::Dot { key: "o_orderkey".to_string(), @@ -245,7 +247,7 @@ fn test_redshift_json_path() { path: JsonPath { path: vec![ JsonPathElem::Bracket { - key: Expr::Value(Value::Number("0".parse().unwrap(), false)) + key: Expr::Value(number("0")) }, JsonPathElem::Bracket { key: Expr::Value(Value::SingleQuotedString("id".to_owned())) @@ -269,7 +271,7 @@ fn test_redshift_json_path() { path: JsonPath { path: vec![ JsonPathElem::Bracket { - 
key: Expr::Value(Value::Number("0".parse().unwrap(), false)) + key: Expr::Value(number("0")) }, JsonPathElem::Bracket { key: Expr::Value(Value::SingleQuotedString("id".to_owned())) @@ -293,7 +295,7 @@ fn test_redshift_json_path() { path: JsonPath { path: vec![ JsonPathElem::Bracket { - key: Expr::Value(Value::Number("0".parse().unwrap(), false)) + key: Expr::Value(number("0")) }, JsonPathElem::Dot { key: "id".to_string(), @@ -320,7 +322,7 @@ fn test_parse_json_path_from() { &Some(JsonPath { path: vec![ JsonPathElem::Bracket { - key: Expr::Value(Value::Number("0".parse().unwrap(), false)) + key: Expr::Value(number("0")) }, JsonPathElem::Dot { key: "a".to_string(), @@ -344,7 +346,7 @@ fn test_parse_json_path_from() { &Some(JsonPath { path: vec![ JsonPathElem::Bracket { - key: Expr::Value(Value::Number("0".parse().unwrap(), false)) + key: Expr::Value(number("0")) }, JsonPathElem::Dot { key: "a".to_string(), @@ -381,17 +383,20 @@ fn test_parse_json_path_from() { #[test] fn test_parse_select_numbered_columns() { + // An alias starting with a number redshift_and_generic().verified_stmt(r#"SELECT 1 AS "1" FROM a"#); - // RedShift specific case - quoted identifier inside square bracket + redshift_and_generic().verified_stmt(r#"SELECT 1 AS "1abc" FROM a"#); +} + +#[test] +fn test_parse_nested_quoted_identifier() { redshift().verified_stmt(r#"SELECT 1 AS ["1"] FROM a"#); + redshift().verified_stmt(r#"SELECT 1 AS [ " 1 " ]"#); redshift().verified_stmt(r#"SELECT 1 AS ["[="] FROM a"#); redshift().verified_stmt(r#"SELECT 1 AS ["=]"] FROM a"#); redshift().verified_stmt(r#"SELECT 1 AS ["a[b]"] FROM a"#); -} - -#[test] -fn test_parse_create_numbered_columns() { - redshift_and_generic().verified_stmt( - r#"CREATE TABLE test_table_1 ("1" INT, "d" VARCHAR(155), "2" DOUBLE PRECISION)"#, - ); + // invalid query + assert!(redshift() + .parse_sql_statements(r#"SELECT 1 AS ["1]"#) + .is_err()); } From fe5a3d634ded4b4b65b2de9cc981d8d62023f8af Mon Sep 17 00:00:00 2001 From: "aleksei.p" Date: 
Fri, 6 Dec 2024 17:21:10 +0100 Subject: [PATCH 11/12] update --- src/dialect/mod.rs | 5 +++-- src/dialect/redshift.rs | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 78b013604..eaa3e9b77 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -146,8 +146,9 @@ pub trait Dialect: Debug + Any { /// /// Example (Redshift): /// ```text - /// `["foo"]` => (Some(`[`), Some(`"`)) - /// `[foo]` => (Some(`[`), None) + /// `["foo"]` => Some(`[`, Some(`"`)) + /// `[foo]` => Some(`[`, None) + /// `[0]` => None /// `"foo"` => None /// ``` fn peek_nested_delimited_identifier_quotes( diff --git a/src/dialect/redshift.rs b/src/dialect/redshift.rs index c3eac8a75..55405ba53 100644 --- a/src/dialect/redshift.rs +++ b/src/dialect/redshift.rs @@ -50,8 +50,9 @@ impl Dialect for RedshiftSqlDialect { /// /// Example (Redshift): /// ```text - /// `["foo"]` => (Some(`[`), Some(`"`)) - /// `[foo]` => (Some(`[`), None) + /// `["foo"]` => Some(`[`, Some(`"`)) + /// `[foo]` => Some(`[`, None) + /// `[0]` => None /// `"foo"` => None /// ``` fn peek_nested_delimited_identifier_quotes( From b7a280bf461d7684b8d769bce9dbdf08990e8036 Mon Sep 17 00:00:00 2001 From: "aleksei.p" Date: Sat, 7 Dec 2024 12:10:59 +0100 Subject: [PATCH 12/12] update --- src/tokenizer.rs | 4 ++-- tests/sqlparser_redshift.rs | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index aef81cad3..9269f4fe6 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1110,7 +1110,7 @@ impl<'a> Tokenizer<'a> { let error_loc = chars.location(); chars.next(); // skip the first delimiter - word.push(peeking_take_while(chars, |ch| ch.is_whitespace())); + peeking_take_while(chars, |ch| ch.is_whitespace()); if chars.peek() != Some(&nested_quote_start) { return self.tokenizer_error( error_loc, @@ -1120,7 +1120,7 @@ impl<'a> Tokenizer<'a> { word.push(nested_quote_start.into()); 
word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?); word.push(nested_quote_end.into()); - word.push(peeking_take_while(chars, |ch| ch.is_whitespace())); + peeking_take_while(chars, |ch| ch.is_whitespace()); if chars.peek() != Some(&quote_end) { return self.tokenizer_error( error_loc, diff --git a/tests/sqlparser_redshift.rs b/tests/sqlparser_redshift.rs index dc435368a..2f206d15b 100644 --- a/tests/sqlparser_redshift.rs +++ b/tests/sqlparser_redshift.rs @@ -391,10 +391,11 @@ fn test_parse_select_numbered_columns() { #[test] fn test_parse_nested_quoted_identifier() { redshift().verified_stmt(r#"SELECT 1 AS ["1"] FROM a"#); - redshift().verified_stmt(r#"SELECT 1 AS [ " 1 " ]"#); redshift().verified_stmt(r#"SELECT 1 AS ["[="] FROM a"#); redshift().verified_stmt(r#"SELECT 1 AS ["=]"] FROM a"#); redshift().verified_stmt(r#"SELECT 1 AS ["a[b]"] FROM a"#); + // trim spaces + redshift().one_statement_parses_to(r#"SELECT 1 AS [ " 1 " ]"#, r#"SELECT 1 AS [" 1 "]"#); // invalid query assert!(redshift() .parse_sql_statements(r#"SELECT 1 AS ["1]"#)