You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by ja...@apache.org on 2015/04/29 09:30:53 UTC

[5/6] drill git commit: DRILL-2842: Fix reading of parquet files with large file level metadata.

DRILL-2842: Fix reading of parquet files with large file level metadata.

Add result verification of the read operation. The actual bug happened when we were trying to read the metadata to make read assignments, but it's good to have the test case around to make sure that we can read a parquet file with a lot of columns.


Project: http://git-wip-us.apache.org/repos/asf/drill/repo
Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/a7a60a24
Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/a7a60a24
Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/a7a60a24

Branch: refs/heads/master
Commit: a7a60a2455f3a0ec6ae9cfac12a132ca1a84cb66
Parents: 4a23037
Author: Jason Altekruse <al...@gmail.com>
Authored: Tue Apr 21 15:16:06 2015 -0700
Committer: Jacques Nadeau <ja...@apache.org>
Committed: Wed Apr 29 00:28:44 2015 -0700

----------------------------------------------------------------------
 .../exec/store/parquet/FooterGatherer.java      |  2 +-
 .../exec/fn/interp/TestConstantFolding.java     | 52 ++++++++++++++++++--
 .../physical/impl/writer/TestParquetWriter.java | 38 ++++++++++++++
 3 files changed, 86 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/drill/blob/a7a60a24/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/FooterGatherer.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/FooterGatherer.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/FooterGatherer.java
index 0bb86e1..d8495c9 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/FooterGatherer.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/FooterGatherer.java
@@ -152,7 +152,7 @@ public class FooterGatherer {
         footerBytes = new byte[size];
 
         readFully(file, fileLength - size - FOOTER_METADATA_SIZE, footerBytes, 0, size - origFooterRead);
-        System.arraycopy(origFooterBytes, 0, footerBytes, size - footerBytes.length, origFooterRead);
+        System.arraycopy(origFooterBytes, 0, footerBytes, size - origFooterRead, origFooterRead);
       }else{
         int start = footerBytes.length - (size + FOOTER_METADATA_SIZE);
         footerBytes = ArrayUtils.subarray(footerBytes, start, start + size);

http://git-wip-us.apache.org/repos/asf/drill/blob/a7a60a24/exec/java-exec/src/test/java/org/apache/drill/exec/fn/interp/TestConstantFolding.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/fn/interp/TestConstantFolding.java b/exec/java-exec/src/test/java/org/apache/drill/exec/fn/interp/TestConstantFolding.java
index b17935a..2c23df4 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/exec/fn/interp/TestConstantFolding.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/exec/fn/interp/TestConstantFolding.java
@@ -17,6 +17,9 @@
  ******************************************************************************/
 package org.apache.drill.exec.fn.interp;
 
+import com.google.common.base.Joiner;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Lists;
 import org.apache.drill.PlanTestBase;
 import org.apache.drill.exec.util.JsonStringArrayList;
 import org.apache.hadoop.io.Text;
@@ -27,6 +30,7 @@ import org.junit.rules.TemporaryFolder;
 
 import java.io.File;
 import java.io.PrintWriter;
+import java.util.List;
 
 public class TestConstantFolding extends PlanTestBase {
 
@@ -41,34 +45,72 @@ public class TestConstantFolding extends PlanTestBase {
   public static class SmallFileCreator {
 
     private final TemporaryFolder folder;
+    private static final List<String> values = Lists.newArrayList("1","2","3");
+    private static final String jsonRecord =  "{\"col1\" : 1,\"col2\" : 2, \"col3\" : 3}";
+    private String record;
 
     public SmallFileCreator(TemporaryFolder folder) {
       this.folder = folder;
+      this.record = null;
     }
 
-    public void createFiles(int smallFileLines, int bigFileLines) throws Exception{
+    public SmallFileCreator setRecord(String record) {
+      this.record = record;
+      return this;
+    }
+
+    public void createFiles(int smallFileLines, int bigFileLines, String extension, String delimiter) throws Exception{
+      if (record == null) {
+        if (extension.equals("csv") || extension.equals("tsv")) {
+          record = Joiner.on(delimiter).join(values);
+        } else if (extension.equals("json") ){
+          record = jsonRecord;
+        } else {
+          throw new UnsupportedOperationException(
+              String.format("Extension %s not supported by %s",
+                  extension, SmallFileCreator.class.getSimpleName()));
+        }
+      }
       PrintWriter out;
       for (String fileAndFolderName : new String[]{"bigfile", "BIGFILE_2"}) {
         File bigFolder = folder.newFolder(fileAndFolderName);
-        File bigFile = new File (bigFolder, fileAndFolderName + ".csv");
+        File bigFile = new File (bigFolder, fileAndFolderName + "." + extension);
         out = new PrintWriter(bigFile);
         for (int i = 0; i < bigFileLines; i++ ) {
-          out.println("1,2,3");
+          out.println(record);
         }
         out.close();
       }
 
       for (String fileAndFolderName : new String[]{"smallfile", "SMALLFILE_2"}) {
         File smallFolder = folder.newFolder(fileAndFolderName);
-        File smallFile = new File (smallFolder, fileAndFolderName + ".csv");
+        File smallFile = new File (smallFolder, fileAndFolderName + "." + extension);
         out = new PrintWriter(smallFile);
         for (int i = 0; i < smallFileLines; i++ ) {
-          out.println("1,2,3");
+          out.println(record);
         }
         out.close();
       }
     }
 
+    public void createFiles(int smallFileLines, int bigFileLines, String extension) throws Exception{
+      String delimiter;
+      if (extension.equals("json")) {
+        delimiter = null;
+      } else if (extension.equals("csv")) {
+        delimiter = ",";
+      } else if (extension.equals("tsv")) {
+        delimiter = "\t";
+      } else {
+        throw new UnsupportedOperationException("Extension not recognized, please explicitly provide a delimiter.");
+      }
+      createFiles(smallFileLines, bigFileLines, extension, delimiter);
+    }
+
+    public void createFiles(int smallFileLines, int bigFileLines) throws Exception{
+      createFiles(smallFileLines, bigFileLines, "csv", ",");
+    }
+
   }
 
   @Test

http://git-wip-us.apache.org/repos/asf/drill/blob/a7a60a24/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java
index 89837e7..5670e1e 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java
@@ -23,6 +23,7 @@ import java.sql.Date;
 
 import org.apache.drill.BaseTestQuery;
 import org.apache.drill.exec.ExecConstants;
+import org.apache.drill.exec.fn.interp.TestConstantFolding;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -30,11 +31,15 @@ import org.joda.time.DateTime;
 import org.junit.Assert;
 import org.junit.BeforeClass;
 import org.junit.Ignore;
+import org.junit.Rule;
 import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
 
 public class TestParquetWriter extends BaseTestQuery {
 //  private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(TestParquetWriter.class);
 
+  @Rule
+  public TemporaryFolder folder = new TemporaryFolder();
   static FileSystem fs;
 
   @BeforeClass
@@ -53,6 +58,39 @@ public class TestParquetWriter extends BaseTestQuery {
   }
 
   @Test
+  public void testLargeFooter() throws Exception {
+    StringBuffer sb = new StringBuffer();
+    // create a JSON document with a lot of columns
+    sb.append("{");
+    final int numCols = 1000;
+    String[] colNames = new String[numCols];
+    Object[] values = new Object[numCols];
+    for (int i = 0 ; i < numCols - 1; i++) {
+      sb.append(String.format("\"col_%d\" : 100,", i));
+      colNames[i] = "col_" + i;
+      values[i] = 100l;
+    }
+    // add one column without a comma after it
+    sb.append(String.format("\"col_%d\" : 100", numCols - 1));
+    sb.append("}");
+    colNames[numCols - 1] = "col_" + (numCols - 1);
+    values[numCols - 1] = 100l;
+
+    // write it to a file in the temp directory for the test
+    new TestConstantFolding.SmallFileCreator(folder).setRecord(sb.toString()).createFiles(1, 1, "json");
+
+    String path = folder.getRoot().toPath().toString();
+    test("use dfs_test.tmp");
+    test("create table WIDE_PARQUET_TABLE_TestParquetWriter_testLargeFooter as select * from dfs.`" + path + "/smallfile/smallfile.json`");
+    testBuilder()
+        .sqlQuery("select * from dfs_test.tmp.WIDE_PARQUET_TABLE_TestParquetWriter_testLargeFooter")
+        .unOrdered()
+        .baselineColumns(colNames)
+        .baselineValues(values)
+        .build().run();
+  }
+
+  @Test
   public void testComplex() throws Exception {
     String selection = "*";
     String inputTable = "cp.`donuts.json`";