Posted to github@arrow.apache.org by "aprimadi (via GitHub)" <gi...@apache.org> on 2023/05/06 05:34:28 UTC

[GitHub] [arrow-datafusion] aprimadi opened a new pull request, #6257: Enable parser to parse create external clauses in random order

aprimadi opened a new pull request, #6257:
URL: https://github.com/apache/arrow-datafusion/pull/6257

   # Which issue does this PR close?
   
   Closes #6248
   
   # Rationale for this change
   
   N/A
   
   # What changes are included in this PR?
   
   - Modify the create external table parser (see the sketch below)
   - Add some tests
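
   As an illustration (a sketch only: `t` and the file path are made-up names), the parser change lets the same statement be written with its clauses in any order:

   ```sql
   -- Clauses in the previously required order:
   CREATE EXTERNAL TABLE t (c1 INT)
   STORED AS CSV
   WITH HEADER ROW
   LOCATION 'data/example.csv';

   -- The same clauses reordered; after this PR both statements parse:
   CREATE EXTERNAL TABLE t (c1 INT)
   LOCATION 'data/example.csv'
   WITH HEADER ROW
   STORED AS CSV;
   ```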
   
   # Are these changes tested?
   
   Yes
   
   # Are there any user-facing changes?
   
   No




[GitHub] [arrow-datafusion] alamb commented on a diff in pull request #6257: Enable parser to parse create external clauses in random order

Posted by "alamb (via GitHub)" <gi...@apache.org>.
alamb commented on code in PR #6257:
URL: https://github.com/apache/arrow-datafusion/pull/6257#discussion_r1187716273


##########
datafusion/sql/src/parser.rs:
##########
@@ -379,59 +379,92 @@ impl<'a> DFParser<'a> {
                 .parse_keywords(&[Keyword::IF, Keyword::NOT, Keyword::EXISTS]);
         let table_name = self.parser.parse_object_name()?;
         let (columns, _) = self.parse_columns()?;
-        self.parser
-            .expect_keywords(&[Keyword::STORED, Keyword::AS])?;
 
-        // THIS is the main difference: we parse a different file format.
-        let file_type = self.parse_file_format()?;
-
-        let has_header = self.parse_csv_has_header();
-
-        let has_delimiter = self.parse_has_delimiter();
-        let delimiter = match has_delimiter {
-            true => self.parse_delimiter()?,
-            false => ',',
-        };
-
-        let file_compression_type = if self.parse_has_file_compression_type() {
-            self.parse_file_compression_type()?
-        } else {
-            CompressionTypeVariant::UNCOMPRESSED
-        };
-
-        let table_partition_cols = if self.parse_has_partition() {
-            self.parse_partitions()?
-        } else {
-            vec![]
-        };
+        #[derive(Default)]

Review Comment:
   👍 



##########
datafusion/sql/src/parser.rs:
##########
@@ -379,59 +379,92 @@ impl<'a> DFParser<'a> {
                 .parse_keywords(&[Keyword::IF, Keyword::NOT, Keyword::EXISTS]);
         let table_name = self.parser.parse_object_name()?;
         let (columns, _) = self.parse_columns()?;
-        self.parser
-            .expect_keywords(&[Keyword::STORED, Keyword::AS])?;
 
-        // THIS is the main difference: we parse a different file format.
-        let file_type = self.parse_file_format()?;
-
-        let has_header = self.parse_csv_has_header();
-
-        let has_delimiter = self.parse_has_delimiter();
-        let delimiter = match has_delimiter {
-            true => self.parse_delimiter()?,
-            false => ',',
-        };
-
-        let file_compression_type = if self.parse_has_file_compression_type() {
-            self.parse_file_compression_type()?
-        } else {
-            CompressionTypeVariant::UNCOMPRESSED
-        };
-
-        let table_partition_cols = if self.parse_has_partition() {
-            self.parse_partitions()?
-        } else {
-            vec![]
-        };
+        #[derive(Default)]
+        struct Builder {
+            file_type: Option<String>,
+            location: Option<String>,
+            has_header: Option<bool>,
+            delimiter: Option<char>,
+            file_compression_type: Option<CompressionTypeVariant>,
+            table_partition_cols: Option<Vec<String>>,
+            order_exprs: Option<Vec<OrderByExpr>>,
+            options: Option<HashMap<String, String>>,
+        }
+        let mut builder = Builder::default();
 
-        let order_exprs = if self.parse_has_order() {
-            self.parse_order_by_exprs()?
-        } else {
-            vec![]
-        };
+        fn ensure_not_set<T>(field: &Option<T>, name: &str) -> Result<(), ParserError> {
+            if field.is_some() {
+                return Err(ParserError::ParserError(format!(
+                    "{name} specified more than once",
+                )));
+            }
+            Ok(())
+        }
 
-        let options = if self.parse_has_options() {
-            self.parse_options()?
-        } else {
-            HashMap::new()
-        };
+        loop {
+            if self.parser.parse_keyword(Keyword::STORED) {
+                self.parser.expect_keyword(Keyword::AS)?;
+                ensure_not_set(&builder.file_type, "STORED AS")?;
+                builder.file_type = Some(self.parse_file_format()?);
+            } else if self.parser.parse_keyword(Keyword::LOCATION) {
+                ensure_not_set(&builder.location, "LOCATION")?;
+                builder.location = Some(self.parser.parse_literal_string()?);
+            } else if self
+                .parser
+                .parse_keywords(&[Keyword::WITH, Keyword::HEADER])
+            {
+                self.parser.expect_keyword(Keyword::ROW)?;
+                ensure_not_set(&builder.has_header, "WITH HEADER ROW")?;
+                builder.has_header = Some(true);
+            } else if self.parser.parse_keywords(&[Keyword::WITH, Keyword::ORDER]) {
+                ensure_not_set(&builder.order_exprs, "WITH ORDER")?;
+                builder.order_exprs = Some(self.parse_order_by_exprs()?);
+            } else if self.parser.parse_keyword(Keyword::DELIMITER) {
+                ensure_not_set(&builder.delimiter, "DELIMITER")?;
+                builder.delimiter = Some(self.parse_delimiter()?);
+            } else if self.parser.parse_keyword(Keyword::COMPRESSION) {
+                self.parser.expect_keyword(Keyword::TYPE)?;
+                ensure_not_set(&builder.file_compression_type, "COMPRESSION TYPE")?;
+                builder.file_compression_type = Some(self.parse_file_compression_type()?);
+            } else if self.parser.parse_keyword(Keyword::PARTITIONED) {
+                self.parser.expect_keyword(Keyword::BY)?;
+                ensure_not_set(&builder.table_partition_cols, "PARTITIONED BY")?;
+                builder.table_partition_cols = Some(self.parse_partitions()?)
+            } else if self.parser.parse_keyword(Keyword::OPTIONS) {
+                ensure_not_set(&builder.options, "OPTIONS")?;
+                builder.options = Some(self.parse_options()?);
+            } else {
+                break;

Review Comment:
   I agree this would make for a better user experience, and I think it would be a good follow-on project. I will file the ticket.
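
   As context, the `ensure_not_set` guard in the hunk above means each clause can now appear at most once; a duplicated clause is a parse error. A hypothetical statement it would reject (table name and path are made up):

   ```sql
   CREATE EXTERNAL TABLE t (c1 INT)
   STORED AS CSV
   STORED AS PARQUET   -- rejected: "STORED AS specified more than once"
   LOCATION 'data/example.csv';
   ```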



##########
datafusion/core/tests/sqllogictests/test_files/create_external_table.slt:
##########
@@ -0,0 +1,93 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+###############
+# Error tests #
+###############
+
+# Missing `STORED AS` and `LOCATION` clauses
+statement error DataFusion error: SQL error: ParserError\("Missing STORED AS clause in CREATE EXTERNAL TABLE statement"\)

Review Comment:
   👍  these tests are so nice -- thank you @aprimadi 
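
   For readers new to the sqllogictest format: `statement error <regex>` asserts that the next statement fails with a matching message. The statement under test is truncated in the quote above, but presumably a minimal form like this (table name made up) trips the check, since it supplies neither `STORED AS` nor `LOCATION`:

   ```sql
   CREATE EXTERNAL TABLE t (c1 INT);
   ```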





[GitHub] [arrow-datafusion] alamb commented on pull request #6257: Enable parser to parse create external clauses in arbitrary order

Posted by "alamb (via GitHub)" <gi...@apache.org>.
alamb commented on PR #6257:
URL: https://github.com/apache/arrow-datafusion/pull/6257#issuecomment-1538794084

   Thanks @aprimadi 




[GitHub] [arrow-datafusion] alamb merged pull request #6257: Enable parser to parse create external clauses in arbitrary order

Posted by "alamb (via GitHub)" <gi...@apache.org>.
alamb merged PR #6257:
URL: https://github.com/apache/arrow-datafusion/pull/6257




[GitHub] [arrow-datafusion] alamb commented on a diff in pull request #6257: Enable parser to parse create external clauses in arbitrary order

Posted by "alamb (via GitHub)" <gi...@apache.org>.
alamb commented on code in PR #6257:
URL: https://github.com/apache/arrow-datafusion/pull/6257#discussion_r1187725826


##########
datafusion/sql/src/parser.rs:
##########
@@ -379,59 +379,92 @@ impl<'a> DFParser<'a> {
[...]
+            } else {
+                break;

Review Comment:
   https://github.com/apache/arrow-datafusion/issues/6288





[GitHub] [arrow-datafusion] tz70s commented on a diff in pull request #6257: Enable parser to parse create external clauses in random order

Posted by "tz70s (via GitHub)" <gi...@apache.org>.
tz70s commented on code in PR #6257:
URL: https://github.com/apache/arrow-datafusion/pull/6257#discussion_r1186779655


##########
datafusion/sql/src/parser.rs:
##########
@@ -379,59 +379,92 @@ impl<'a> DFParser<'a> {
[...]
+            } else {
+                break;

Review Comment:
   Not sure whether it would be worth adding more checks like `expect_one_of_keywords` (or some smarter approach) to make the user experience better? (Though I couldn't find it applied universally to all parser paths.)
   
   e.g. a typo like 
   
   ```sql
   CREATE EXTERNAL TABLE t(c1 int)
   STORED AS CSV
   WITH HEAD ROW
   LOCATION 'foo.csv';
   ```
   
   will turn into the error `sql parser error: Missing LOCATION clause in CREATE EXTERNAL TABLE statement`, which is a bit confusing: the clause-parsing loop stops at the unrecognized `HEAD`, so the `LOCATION` clause that follows is never consumed.


