You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by cg...@apache.org on 2023/02/14 19:43:26 UTC
[drill] branch master updated: DRILL-8401: Skip nested MAP column without children when creating parquet tables (#2757)
This is an automated email from the ASF dual-hosted git repository.
cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git
The following commit(s) were added to refs/heads/master by this push:
new e28355d06f DRILL-8401: Skip nested MAP column without children when creating parquet tables (#2757)
e28355d06f is described below
commit e28355d06f51ee31ade538241b3b1d4c285234d9
Author: James Turton <91...@users.noreply.github.com>
AuthorDate: Tue Feb 14 21:43:18 2023 +0200
DRILL-8401: Skip nested MAP column without children when creating parquet tables (#2757)
---
.../drill/exec/store/parquet/ParquetRecordWriter.java | 16 ++++++++++++++++
.../exec/physical/impl/writer/TestParquetWriter.java | 17 ++++++++++++-----
2 files changed, 28 insertions(+), 5 deletions(-)
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRecordWriter.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRecordWriter.java
index 5858b08f4b..d58747c711 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRecordWriter.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetRecordWriter.java
@@ -253,6 +253,7 @@ public class ParquetRecordWriter extends ParquetOutputRecordWriter {
private void newSchema() {
List<Type> types = new ArrayList<>();
for (MaterializedField field : batchSchema) {
+ pruneUnsupported(field);
if (!supportsField(field)) {
continue;
}
@@ -298,6 +299,21 @@ public class ParquetRecordWriter extends ParquetOutputRecordWriter {
setUp(schema, consumer);
}
+ /**
+ * Recursively prunes unsupported descendants (such as childless MAPs) from
+ * the field tree, proceeding depth first so that fields rendered childless by
+ * the removal of their descendants are subsequently removed themselves.
+ * @param field the field whose descendants are pruned; a top level field on the initial call.
+ */
+ private void pruneUnsupported(MaterializedField field) {
+ for (MaterializedField child: field.getChildren()) {
+ pruneUnsupported(child);
+ if (!supportsField(child)) {
+ field.removeChild(child);
+ }
+ }
+ }
+
@Override
public boolean supportsField(MaterializedField field) {
return super.supportsField(field)
diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java
index 707a552819..002edf2aed 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java
@@ -1515,20 +1515,27 @@ public class TestParquetWriter extends ClusterTest {
}
}
- @Test
+ @Test // DRILL-8272
public void testResultWithEmptyMap() throws Exception {
String fileName = "emptyMap.json";
- FileUtils.writeStringToFile(new File(dirTestWatcher.getRootDir(), fileName),
- "{\"sample\": {}, \"a\": \"a\"}", Charset.defaultCharset());
+ // Create a test JSON object that includes two nested empty objects. The
+ // first has a string property as a sibling, so its parent, non_empty_child,
+ // should not be eliminated from the output schema, while the second has no
+ // siblings, so its parent, empty_child, should be eliminated.
+ FileUtils.writeStringToFile(
+ new File(dirTestWatcher.getRootDir(), fileName),
+ "{\"non_empty_child\": { \"empty\": {}, \"b\": \"b\"}, \"empty_child\": { \"empty\": {} }, \"a\": \"a\"}",
+ Charset.defaultCharset()
+ );
run("create table dfs.tmp.t1 as SELECT * from dfs.`%s` t", fileName);
testBuilder()
.sqlQuery("select * from dfs.tmp.t1")
.unOrdered()
- .baselineColumns("a")
- .baselineValues("a")
+ .baselineColumns("a", "non_empty_child")
+ .baselineValues("a", mapOf("b", "b"))
.go();
}