You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/11/22 15:35:39 UTC

[GitHub] [arrow-datafusion] alamb commented on a diff in pull request #4221: Nonstring partitioned cols

alamb commented on code in PR #4221:
URL: https://github.com/apache/arrow-datafusion/pull/4221#discussion_r1029481591


##########
datafusion/core/src/datasource/listing_table_factory.rs:
##########
@@ -88,17 +89,45 @@ impl TableProviderFactory for ListingTableFactory {
             ),
         };
 
-        let provided_schema = if cmd.schema.fields().is_empty() {
-            None
+        let (provided_schema, table_partition_cols) = if cmd.schema.fields().is_empty() {
+            (
+                None,
+                cmd.table_partition_cols
+                    .iter()
+                    .zip((0..cmd.table_partition_cols.len()).map(|_| DataType::Utf8))
+                    .map(|x| (x.0.clone(), x.1))
+                    .collect::<Vec<_>>(),
+            )
         } else {
-            Some(Arc::new(cmd.schema.as_ref().to_owned().into()))
+            let schema: SchemaRef = Arc::new(cmd.schema.as_ref().to_owned().into());
+            let table_partition_cols = cmd
+                .table_partition_cols
+                .iter()
+                .map(|col| {
+                    (
+                        col.clone(),
+                        schema.field_with_name(col).unwrap().data_type().clone(),

Review Comment:
   Are you sure it is ok to `unwrap()` here (which will panic)? Given the function returns a `Result` perhaps we could return an error if the field was not present in the schema.



##########
datafusion/proto/src/logical_plan.rs:
##########
@@ -876,7 +890,7 @@ impl AsLogicalPlan for LogicalPlanNode {
                         FileFormatType::Avro(protobuf::AvroFormat {})
                     } else {
                         return Err(proto_error(format!(
-                            "Error converting file format, {:?} is invalid as a datafusion foramt.",
+                            "Error converting file format, {:?} is invalid as a datafusion format.",

Review Comment:
   👍 



##########
datafusion/core/tests/path_partition.rs:
##########
@@ -56,7 +57,11 @@ async fn parquet_distinct_partition_col() -> Result<()> {
             "year=2021/month=10/day=09/file.parquet",
             "year=2021/month=10/day=28/file.parquet",
         ],
-        &["year", "month", "day"],
+        &[
+            ("year", DataType::Int32),

Review Comment:
   Nice



##########
datafusion/core/src/datasource/listing_table_factory.rs:
##########
@@ -88,17 +89,45 @@ impl TableProviderFactory for ListingTableFactory {
             ),
         };
 
-        let provided_schema = if cmd.schema.fields().is_empty() {
-            None
+        let (provided_schema, table_partition_cols) = if cmd.schema.fields().is_empty() {
+            (
+                None,
+                cmd.table_partition_cols
+                    .iter()
+                    .zip((0..cmd.table_partition_cols.len()).map(|_| DataType::Utf8))
+                    .map(|x| (x.0.clone(), x.1))

Review Comment:
   ```suggestion
                       .map(|x| (x.clone(), DataType::Utf8))
   ```



##########
datafusion/core/src/datasource/listing/table.rs:
##########
@@ -270,16 +268,19 @@ impl ListingOptions {
     ///
     /// ```
     /// use std::sync::Arc;
+    /// use arrow::datatypes::DataType;
     /// use datafusion::datasource::{listing::ListingOptions, file_format::parquet::ParquetFormat};
     ///
     /// let listing_options = ListingOptions::new(Arc::new(ParquetFormat::default()))
-    ///     .with_table_partition_cols(vec!["col_a".to_string(), "col_b".to_string()]);
+    ///     .with_table_partition_cols(vec![("col_a".to_string(), DataType::Utf8),
+    ///         ("col_b".to_string(), DataType::Utf8)]);
     ///
-    /// assert_eq!(listing_options.table_partition_cols, vec!["col_a", "col_b"]);
+    /// assert_eq!(listing_options.table_partition_cols, vec![("col_a".to_string(), DataType::Utf8),
+    ///     ("col_b".to_string(), DataType::Utf8)]);
     /// ```
     pub fn with_table_partition_cols(
         mut self,
-        table_partition_cols: Vec<String>,
+        table_partition_cols: Vec<(String, DataType)>,

Review Comment:
   It would help me read the code if this was a named struct so the code could refer to `.name` and `.datatype` rather than `.0` and `.1`. But I don't think it is necessary to merge this PR
   
   ```rust
   #[derive(Clone)]
   struct PartitionColumn {
     name: String,
     data_type: DataType
   }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org