Posted to commits@arrow.apache.org by al...@apache.org on 2023/01/01 12:31:42 UTC

[arrow-datafusion] branch master updated: Update sqlparser to `29.0.0` (#4770)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new 9a8190a46 Update sqlparser to `29.0.0` (#4770)
9a8190a46 is described below

commit 9a8190a46e574459e95f456a18a33d2f1f458533
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Sun Jan 1 07:31:36 2023 -0500

    Update sqlparser to `29.0.0` (#4770)
    
    * SQL planner support for rollup/cube/grouping sets AST nodes
    
    * Trigger build
    
    * Update sqlparser to 29.0.0
    
    * Update for API changes
    
    * Update one test
    
    * sqlparser update
    
    * fmt + clippy
    
    * reduce stack overhead
    
    Co-authored-by: Jefffrey <22...@users.noreply.github.com>
---
 datafusion-cli/Cargo.lock                    |  4 +-
 datafusion/common/Cargo.toml                 |  2 +-
 datafusion/core/Cargo.toml                   |  3 +-
 datafusion/core/tests/sql/explain_analyze.rs |  3 +-
 datafusion/expr/Cargo.toml                   |  2 +-
 datafusion/sql/Cargo.toml                    |  2 +-
 datafusion/sql/src/parser.rs                 | 26 +++++----
 datafusion/sql/src/planner.rs                | 80 ++++++++++++++++++++++++----
 8 files changed, 92 insertions(+), 30 deletions(-)
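
For readers following the sqlparser 0.28 -> 0.29 upgrade, the two API changes this commit adapts to are the parser constructor and the tokenizer return type. A minimal sketch of the migration (the helper function and error type choice are illustrative; the sqlparser calls are the ones exercised in the parser.rs hunks below):

    use sqlparser::dialect::GenericDialect;
    use sqlparser::parser::Parser;
    use sqlparser::tokenizer::{Token, Tokenizer, TokenizerError};

    // Illustrative helper showing the 0.29 API shape.
    fn peek_first_word(sql: &str) -> Result<Option<String>, TokenizerError> {
        let dialect = GenericDialect {};

        // 0.28 took tokens in the constructor: Parser::new(tokens, &dialect).
        // 0.29 builds from the dialect, then attaches the tokens:
        let tokens = Tokenizer::new(&dialect, sql).tokenize()?;
        let parser = Parser::new(&dialect).with_tokens(tokens);

        // 0.29: peek_token()/next_token() return a TokenWithLocation,
        // so match on its inner `.token` field, as the hunks below do.
        let token = parser.peek_token();
        match &token.token {
            Token::Word(w) => Ok(Some(w.value.clone())),
            _ => Ok(None),
        }
    }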

diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index a5485a0cf..decdaf257 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -2256,9 +2256,9 @@ checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
 
 [[package]]
 name = "sqlparser"
-version = "0.28.0"
+version = "0.29.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "249ae674b9f636b8ff64d8bfe218774cf05a26de40fd9f358669dccc4c0a9d7d"
+checksum = "9f484338c3e7772b9208d45f6e49aed0ea8439cdbe67c10ebf5505828143e113"
 dependencies = [
  "log",
 ]
diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml
index 7df95532b..dbcfcf094 100644
--- a/datafusion/common/Cargo.toml
+++ b/datafusion/common/Cargo.toml
@@ -46,4 +46,4 @@ cranelift-module = { version = "0.89.0", optional = true }
 object_store = { version = "0.5.0", default-features = false, optional = true }
 parquet = { version = "29.0.0", default-features = false, optional = true }
 pyo3 = { version = "0.17.1", optional = true }
-sqlparser = "0.28"
+sqlparser = "0.29"
diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml
index eaa01ca46..491db7bb6 100644
--- a/datafusion/core/Cargo.toml
+++ b/datafusion/core/Cargo.toml
@@ -90,7 +90,7 @@ pyo3 = { version = "0.17.1", optional = true }
 rand = "0.8"
 rayon = { version = "1.5", optional = true }
 smallvec = { version = "1.6", features = ["union"] }
-sqlparser = "0.28"
+sqlparser = "0.29"
 tempfile = "3"
 tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync", "fs", "parking_lot"] }
 tokio-stream = "0.1"
@@ -111,7 +111,6 @@ env_logger = "0.10"
 parquet-test-utils = { path = "../../parquet-test-utils" }
 rstest = "0.16.0"
 sqllogictest = "0.10.0"
-sqlparser = "0.28"
 test-utils = { path = "../../test-utils" }
 thiserror = "1.0.37"
 
diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs
index e950791c8..82a24a04b 100644
--- a/datafusion/core/tests/sql/explain_analyze.rs
+++ b/datafusion/core/tests/sql/explain_analyze.rs
@@ -898,8 +898,7 @@ async fn explain_nested() {
             .set_bool(OPT_EXPLAIN_PHYSICAL_PLAN_ONLY, explain_phy_plan_flag);
         let ctx = SessionContext::with_config(config);
         let sql = "EXPLAIN explain select 1";
-        let dataframe = ctx.sql(sql).await.unwrap();
-        let err = dataframe.create_physical_plan().await.unwrap_err();
+        let err = ctx.sql(sql).await.unwrap_err();
         assert!(err.to_string().contains("Explain must be root of the plan"));
     }
 
diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml
index 87a72f7c5..02809c726 100644
--- a/datafusion/expr/Cargo.toml
+++ b/datafusion/expr/Cargo.toml
@@ -39,4 +39,4 @@ ahash = { version = "0.8", default-features = false, features = ["runtime-rng"]
 arrow = { version = "29.0.0", default-features = false }
 datafusion-common = { path = "../common", version = "15.0.0" }
 log = "^0.4"
-sqlparser = "0.28"
+sqlparser = "0.29"
diff --git a/datafusion/sql/Cargo.toml b/datafusion/sql/Cargo.toml
index 12f4534e6..fb1679888 100644
--- a/datafusion/sql/Cargo.toml
+++ b/datafusion/sql/Cargo.toml
@@ -41,4 +41,4 @@ arrow-schema = "29.0.0"
 datafusion-common = { path = "../common", version = "15.0.0" }
 datafusion-expr = { path = "../expr", version = "15.0.0" }
 log = "^0.4"
-sqlparser = "0.28"
+sqlparser = "0.29"
diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs
index 68e368e49..2cc92d04c 100644
--- a/datafusion/sql/src/parser.rs
+++ b/datafusion/sql/src/parser.rs
@@ -25,7 +25,7 @@ use sqlparser::{
     },
     dialect::{keywords::Keyword, Dialect, GenericDialect},
     parser::{Parser, ParserError},
-    tokenizer::{Token, Tokenizer},
+    tokenizer::{Token, TokenWithLocation, Tokenizer},
 };
 use std::{collections::HashMap, str::FromStr};
 use std::{collections::VecDeque, fmt};
@@ -124,7 +124,7 @@ impl<'a> DFParser<'a> {
         let tokens = tokenizer.tokenize()?;
 
         Ok(DFParser {
-            parser: Parser::new(tokens, dialect),
+            parser: Parser::new(dialect).with_tokens(tokens),
         })
     }
 
@@ -165,13 +165,17 @@ impl<'a> DFParser<'a> {
     }
 
     /// Report an unexpected token
-    fn expected<T>(&self, expected: &str, found: Token) -> Result<T, ParserError> {
+    fn expected<T>(
+        &self,
+        expected: &str,
+        found: TokenWithLocation,
+    ) -> Result<T, ParserError> {
         parser_err!(format!("Expected {expected}, found: {found}"))
     }
 
     /// Parse a new expression
     pub fn parse_statement(&mut self) -> Result<Statement, ParserError> {
-        match self.parser.peek_token() {
+        match self.parser.peek_token().token {
             Token::Word(w) => {
                 match w.keyword {
                     Keyword::CREATE => {
@@ -227,7 +231,7 @@ impl<'a> DFParser<'a> {
         }
 
         loop {
-            if let Token::Word(_) = self.parser.peek_token() {
+            if let Token::Word(_) = self.parser.peek_token().token {
                 let identifier = self.parser.parse_identifier()?;
                 partitions.push(identifier.to_string());
             } else {
@@ -262,7 +266,7 @@ impl<'a> DFParser<'a> {
         loop {
             if let Some(constraint) = self.parser.parse_optional_table_constraint()? {
                 constraints.push(constraint);
-            } else if let Token::Word(_) = self.parser.peek_token() {
+            } else if let Token::Word(_) = self.parser.peek_token().token {
                 let column_def = self.parse_column_def()?;
                 columns.push(column_def);
             } else {
@@ -379,9 +383,10 @@ impl<'a> DFParser<'a> {
 
     /// Parses the set of valid formats
     fn parse_file_format(&mut self) -> Result<String, ParserError> {
-        match self.parser.next_token() {
+        let token = self.parser.next_token();
+        match &token.token {
             Token::Word(w) => parse_file_type(&w.value),
-            unexpected => self.expected("one of PARQUET, NDJSON, or CSV", unexpected),
+            _ => self.expected("one of PARQUET, NDJSON, or CSV", token),
         }
     }
 
@@ -389,9 +394,10 @@ impl<'a> DFParser<'a> {
     fn parse_file_compression_type(
         &mut self,
     ) -> Result<CompressionTypeVariant, ParserError> {
-        match self.parser.next_token() {
+        let token = self.parser.next_token();
+        match &token.token {
             Token::Word(w) => CompressionTypeVariant::from_str(&w.value),
-            unexpected => self.expected("one of GZIP, BZIP2, XZ", unexpected),
+            _ => self.expected("one of GZIP, BZIP2, XZ", token),
         }
     }
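
The parser.rs changes above are mechanical; the public entry point is unchanged by the upgrade. A minimal usage sketch, assuming datafusion-sql as a dependency (the example statement and assertion are illustrative):

    use datafusion_sql::parser::DFParser;

    fn main() {
        // DFParser::parse_sql tokenizes with the generic dialect and
        // returns the parsed statements.
        let sql = "CREATE EXTERNAL TABLE t STORED AS CSV LOCATION 'data.csv'";
        let statements = DFParser::parse_sql(sql).unwrap();
        assert_eq!(statements.len(), 1);
    }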
 
diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs
index 0ee5913cb..3bd5edbc5 100644
--- a/datafusion/sql/src/planner.rs
+++ b/datafusion/sql/src/planner.rs
@@ -2235,15 +2235,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
                     normalize_ident(function.name.0[0].clone())
                 };
 
-                // first, check SQL reserved words
-                if name == "rollup" {
-                    let args = self.function_args_to_expr(function.args, schema)?;
-                    return Ok(Expr::GroupingSet(GroupingSet::Rollup(args)));
-                } else if name == "cube" {
-                    let args = self.function_args_to_expr(function.args, schema)?;
-                    return Ok(Expr::GroupingSet(GroupingSet::Cube(args)));
-                }
-
                 // next, scalar built-in
                 if let Ok(fun) = BuiltinScalarFunction::from_str(&name) {
                     let args = self.function_args_to_expr(function.args, schema)?;
@@ -2347,6 +2338,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
                 }
             }
 
+            SQLExpr::Rollup(exprs) => self.sql_rollup_to_expr(exprs, schema, planner_context),
+            SQLExpr::Cube(exprs) => self.sql_cube_to_expr(exprs, schema, planner_context),
+            SQLExpr::GroupingSets(exprs) => self.sql_grouping_sets_to_expr(exprs, schema, planner_context),
+
             SQLExpr::Floor { expr, field: _field } => {
                 let fun = BuiltinScalarFunction::Floor;
                 let args = vec![self.sql_expr_to_logical_expr(*expr, schema, planner_context)?];
@@ -2387,6 +2382,67 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
             })
     }
 
+    fn sql_rollup_to_expr(
+        &self,
+        exprs: Vec<Vec<SQLExpr>>,
+        schema: &DFSchema,
+        planner_context: &mut PlannerContext,
+    ) -> Result<Expr> {
+        let args: Result<Vec<_>> = exprs
+            .into_iter()
+            .map(|v| {
+                if v.len() != 1 {
+                    Err(DataFusionError::Internal(
+                        "Tuple expressions are not supported for Rollup expressions"
+                            .to_string(),
+                    ))
+                } else {
+                    self.sql_expr_to_logical_expr(v[0].clone(), schema, planner_context)
+                }
+            })
+            .collect();
+        Ok(Expr::GroupingSet(GroupingSet::Rollup(args?)))
+    }
+
+    fn sql_cube_to_expr(
+        &self,
+        exprs: Vec<Vec<SQLExpr>>,
+        schema: &DFSchema,
+        planner_context: &mut PlannerContext,
+    ) -> Result<Expr> {
+        let args: Result<Vec<_>> = exprs
+            .into_iter()
+            .map(|v| {
+                if v.len() != 1 {
+                    Err(DataFusionError::Internal(
+                        "Tuple expressions not are supported for Cube expressions"
+                            .to_string(),
+                    ))
+                } else {
+                    self.sql_expr_to_logical_expr(v[0].clone(), schema, planner_context)
+                }
+            })
+            .collect();
+        Ok(Expr::GroupingSet(GroupingSet::Cube(args?)))
+    }
+
+    fn sql_grouping_sets_to_expr(
+        &self,
+        exprs: Vec<Vec<SQLExpr>>,
+        schema: &DFSchema,
+        planner_context: &mut PlannerContext,
+    ) -> Result<Expr> {
+        let args: Result<Vec<Vec<_>>> = exprs
+            .into_iter()
+            .map(|v| {
+                v.into_iter()
+                    .map(|e| self.sql_expr_to_logical_expr(e, schema, planner_context))
+                    .collect()
+            })
+            .collect();
+        Ok(Expr::GroupingSet(GroupingSet::GroupingSets(args?)))
+    }
+
     fn parse_exists_subquery(
         &self,
         subquery: Query,
@@ -2634,6 +2690,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
             SQLExpr::Identifier(i) => i.to_string(),
             SQLExpr::Value(v) => match v {
                 Value::SingleQuotedString(s) => s.to_string(),
+                Value::DollarQuotedString(s) => s.to_string(),
                 Value::Number(_, _) | Value::Boolean(_) => v.to_string(),
                 Value::DoubleQuotedString(_)
                 | Value::UnQuotedString(_)
@@ -5664,11 +5721,12 @@ mod tests {
         quick_test(sql, expected);
     }
 
-    #[ignore] // see https://github.com/apache/arrow-datafusion/issues/2469
     #[test]
     fn aggregate_with_grouping_sets() {
         let sql = "SELECT id, state, age, COUNT(*) FROM person GROUP BY id, GROUPING SETS ((state), (state, age), (id, state))";
-        let expected = "TBD";
+        let expected = "Projection: person.id, person.state, person.age, COUNT(UInt8(1))\
+        \n  Aggregate: groupBy=[[person.id, GROUPING SETS ((person.state), (person.state, person.age), (person.id, person.state))]], aggr=[[COUNT(UInt8(1))]]\
+        \n    TableScan: person";
         quick_test(sql, expected);
     }