You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "alamb (via GitHub)" <gi...@apache.org> on 2023/05/03 15:13:21 UTC

[GitHub] [arrow-datafusion] alamb commented on a diff in pull request #6202: Supply consistent format output for FileScanConfig params

alamb commented on code in PR #6202:
URL: https://github.com/apache/arrow-datafusion/pull/6202#discussion_r1183826587


##########
datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs:
##########
@@ -362,7 +362,7 @@ mod tests {
         let expected = &[
             "AggregateExec: mode=Final, gby=[], aggr=[COUNT(2)]",
             "AggregateExec: mode=Partial, gby=[], aggr=[COUNT(1)]",
-            "ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[a, b, c]",
+            "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c], limit=None, output_ordering=[]",

Review Comment:
   Would it be possible to keep the existing behavior and only print out `limit`, `projection` and `output_ordering` if they were non empty?
   
   So like in this case it would be
   
   ```
   file_groups={1 group: [[x]]}, projection=[a, b, c]",
   ```
   
   Because `limit` is `None` and `output_ordering` is empty?
   
   The rationale is to keep the display easier to read by keeping the output concise



##########
datafusion/core/src/physical_plan/file_format/mod.rs:
##########
@@ -234,6 +234,39 @@ impl FileScanConfig {
     }
 }
 
+impl Display for FileScanConfig {
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        let (schema, _, ordering) = self.project();
+
+        let output_ordering_str = ordering
+            .map(|v| make_output_ordering_string(&v))
+            .unwrap_or(String::from("[]"));
+
+        write!(
+            f,
+            "file_groups={}, projection={}, limit={:?}, output_ordering={}",
+            FileGroupsDisplay(&self.file_groups),
+            ProjectSchemaDisplay(&schema),
+            self.limit,
+            output_ordering_str
+        )
+    }
+}
+
+fn make_output_ordering_string(ordering: &[PhysicalSortExpr]) -> String {

Review Comment:
   I think you could avoid the extra copy / materialization by doing something like
   
   ```rust
   struct DisplayableOrdering( &[PhysicalSortExpr]);
   
   impl Display for DisplayableOrdering {
       fn fmt(&self, f: &mut Formatter) -> FmtResult {
          // use self.0 here and call write! to f
       } 
   }
   ```
   
   
   Alternately you could probably use join here instead well: https://doc.rust-lang.org/std/primitive.slice.html#method.join
   https://doc.rust-lang.org/std/primitive.slice.html#method.join



##########
datafusion/core/src/physical_plan/file_format/avro.rs:
##########
@@ -136,12 +136,7 @@ impl ExecutionPlan for AvroExec {
     ) -> std::fmt::Result {
         match t {
             DisplayFormatType::Default => {
-                write!(
-                    f,
-                    "AvroExec: files={}, limit={:?}",
-                    super::FileGroupsDisplay(&self.base_config.file_groups),
-                    self.base_config.limit,
-                )
+                write!(f, "AvroExec: {}", self.base_config())

Review Comment:
   ❤️  that really looks much nicer



##########
datafusion/core/tests/sqllogictests/test_files/explain.slt:
##########
@@ -47,7 +47,41 @@ ProjectionExec: expr=[c1@0 as c1]
   CoalesceBatchesExec: target_batch_size=8192
     FilterExec: c2@1 > 10
       RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-        CsvExec: files={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, has_header=true, limit=None, projection=[c1, c2]
+        CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2], limit=None, output_ordering=[], has_header=true
+
+# explain_csv_exec_scan_config
+
+statement ok
+CREATE EXTERNAL TABLE aggregate_test_100_with_order (
+        c1  VARCHAR NOT NULL,
+        c2  TINYINT NOT NULL,
+        c3  SMALLINT NOT NULL,
+        c4  SMALLINT NOT NULL,
+        c5  INTEGER NOT NULL,
+        c6  BIGINT NOT NULL,
+        c7  SMALLINT NOT NULL,
+        c8  INT NOT NULL,
+        c9  INT UNSIGNED NOT NULL,
+        c10 BIGINT UNSIGNED NOT NULL,
+        c11 FLOAT NOT NULL,
+        c12 DOUBLE NOT NULL,
+        c13 VARCHAR NOT NULL
+    )
+STORED AS CSV
+WITH HEADER ROW
+WITH ORDER (c1 ASC)
+LOCATION '../../testing/data/csv/aggregate_test_100.csv';
+
+query TT
+explain SELECT c1 FROM aggregate_test_100_with_order order by c1 ASC limit 10
+----
+logical_plan
+Limit: skip=0, fetch=10
+  Sort: aggregate_test_100_with_order.c1 ASC NULLS LAST, fetch=10
+    TableScan: aggregate_test_100_with_order projection=[c1]
+physical_plan
+GlobalLimitExec: skip=0, fetch=10
+  CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1], limit=None, output_ordering=[c1@0 ASC NULLS LAST], has_header=true

Review Comment:
   ❤️ 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org