You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by cg...@apache.org on 2023/02/14 19:43:26 UTC
[drill] branch master updated: DRILL-8401: Skip nested MAP column without children when creating parquet tables (#2757)

This is an automated email from the ASF dual-hosted git repository.

cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git


The following commit(s) were added to refs/heads/master by this push:
     new e28355d06f DRILL-8401: Skip nested MAP column without children when creating parquet tables (#2757)
e28355d06f is described below

commit e28355d06f51ee31ade538241b3b1d4c285234d9
Author: James Turton <91...@users.noreply.github.com>
AuthorDate: Tue Feb 14 21:43:18 2023 +0200

    DRILL-8401: Skip nested MAP column without children when creating parquet tables (#2757)
---
 .../drill/exec/store/parquet/ParquetRecordWriter.java   | 16 ++++++++++++++++
 .../exec/physical/impl/writer/TestParquetWriter.java    | 17 ++++++++++++-----
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRecordWriter.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRecordWriter.java
index 5858b08f4b..d58747c711 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRecordWriter.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRecordWriter.java
@@ -253,6 +253,7 @@ public class ParquetRecordWriter extends ParquetOutputRecordWriter {
   private void newSchema() {
     List<Type> types = new ArrayList<>();
     for (MaterializedField field : batchSchema) {
+      pruneUnsupported(field);
       if (!supportsField(field)) {
         continue;
       }
@@ -298,6 +299,21 @@ public class ParquetRecordWriter extends ParquetOutputRecordWriter {
     setUp(schema, consumer);
   }
 
+  /**
+   * Recursively prunes unsupported fields, such as childless MAPs, from the
+   * field tree. Proceeds depth first so that a field left childless by the
+   * removal of its descendants is, in turn, removed from its own parent.
+   * @param field the field whose descendants are pruned; never removed itself.
+   */
+  private void pruneUnsupported(MaterializedField field) {
+    for (MaterializedField child: field.getChildren()) {
+      pruneUnsupported(child);
+      if (!supportsField(child)) {
+        field.removeChild(child);
+      }
+    }
+  }
+
   @Override
   public boolean supportsField(MaterializedField field) {
     return super.supportsField(field)
diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java
index 707a552819..002edf2aed 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java
@@ -1515,20 +1515,27 @@ public class TestParquetWriter extends ClusterTest {
     }
   }
 
-  @Test
+  @Test // DRILL-8272
   public void testResultWithEmptyMap() throws Exception {
     String fileName = "emptyMap.json";
 
-    FileUtils.writeStringToFile(new File(dirTestWatcher.getRootDir(), fileName),
-      "{\"sample\": {}, \"a\": \"a\"}", Charset.defaultCharset());
+    // Create a test JSON object that includes two nested empty objects. The
+    // first has a string property as a sibling, so its parent, non_empty_child,
+    // should not be eliminated from the output schema, while the second has no
+    // siblings, so its parent, empty_child, should be eliminated.
+    FileUtils.writeStringToFile(
+        new File(dirTestWatcher.getRootDir(), fileName),
+        "{\"non_empty_child\": { \"empty\": {}, \"b\": \"b\"}, \"empty_child\": { \"empty\": {} }, \"a\": \"a\"}",
+        Charset.defaultCharset()
+    );
 
     run("create table dfs.tmp.t1 as SELECT * from dfs.`%s` t", fileName);
 
     testBuilder()
       .sqlQuery("select * from dfs.tmp.t1")
       .unOrdered()
-      .baselineColumns("a")
-      .baselineValues("a")
+      .baselineColumns("a", "non_empty_child")
+      .baselineValues("a", mapOf("b", "b"))
       .go();
   }