You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by kl...@apache.org on 2020/08/25 08:04:14 UTC
[hive] branch master updated: HIVE-24023: Hive parquet reader can't read files with length=0 (Karen Coppage, reviewed by Marta Kuczora)

This is an automated email from the ASF dual-hosted git repository.

klcopp pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 387f0da  HIVE-24023: Hive parquet reader can't read files with length=0 (Karen Coppage, reviewed by Marta Kuczora)
387f0da is described below

commit 387f0da9155a0e7b47ec39aeb9002c2b4cd75656
Author: Karen Coppage <ka...@gmail.com>
AuthorDate: Tue Aug 25 10:03:57 2020 +0200

    HIVE-24023: Hive parquet reader can't read files with length=0 (Karen Coppage, reviewed by Marta Kuczora)
    
    Closes #1388
---
 .../ql/io/parquet/ParquetRecordReaderBase.java     |  3 +
 .../hadoop/hive/ql/stats/BasicStatsNoJobTask.java  |  2 +-
 .../hadoop/hive/ql/TestTxnCommandsForMmTable.java  | 81 ++++++++++++++++++++--
 3 files changed, 80 insertions(+), 6 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
index 577051d..c52bc9d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
@@ -75,6 +75,9 @@ public class ParquetRecordReaderBase {
     final org.apache.hadoop.mapred.InputSplit oldSplit,
     final JobConf conf
   ) throws IOException {
+    if (oldSplit.getLength() == 0) {
+      return null;
+    }
     ParquetInputSplit split;
     if (oldSplit instanceof FileSplit) {
       final Path finalPath = ((FileSplit) oldSplit).getPath();
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsNoJobTask.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsNoJobTask.java
index 53b3065..c6533cf 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsNoJobTask.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsNoJobTask.java
@@ -187,7 +187,7 @@ public class BasicStatsNoJobTask implements IStatsProcessor {
           Utilities.FILE_OP_LOGGER.debug("Computing stats for {}", file);
           if (!file.isDirectory()) {
             InputFormat<?, ?> inputFormat = ReflectionUtil.newInstance(partish.getInputFormatClass(), jc);
-            InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0, new String[] { partish.getLocation() });
+            InputSplit dummySplit = new FileSplit(file.getPath(), 0, -1, new String[] { partish.getLocation() });
             if (file.getLen() == 0) {
               numFiles += 1;
             } else {
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommandsForMmTable.java b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommandsForMmTable.java
index 535bf11..4d25f88 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommandsForMmTable.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommandsForMmTable.java
@@ -19,6 +19,7 @@
 package org.apache.hadoop.hive.ql;
 
 import java.io.File;
+import java.util.Collections;
 import java.util.List;
 
 import org.apache.hadoop.fs.FileStatus;
@@ -481,10 +482,85 @@ public class TestTxnCommandsForMmTable extends TxnCommandsBaseForTests {
     verifyDirAndResult(0, true);
   }
 
+  @Test
+  public void testImpalaTruncatedMmTableVectorized() throws Exception {
+    testImpalaTruncatedMmTable(true);
+  }
+
+  @Test
+  public void testImpalaTruncatedMmTableNonVectorized() throws Exception {
+    testImpalaTruncatedMmTable(false);
+  }
+
+  /**
+   * Impala truncates insert-only tables by writing a base directory (like insert overwrite) containing a completely
+   * empty file. Make sure that Hive reads these bases correctly, both before and after a major compaction.
+   * @param vectorized if false, vectorization is switched off for the test and switched back on when it finishes
+   * @throws Exception if a statement, a file system call or the compaction worker/cleaner fails
+   */
+  private void testImpalaTruncatedMmTable(boolean vectorized) throws Exception {
+    if (!vectorized) {
+      d.getConf().setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, false);
+    }
+    try {
+      FileSystem fs = FileSystem.get(hiveConf);
+      Path tblLocation = new Path(TEST_WAREHOUSE_DIR + "/" + (TableExtended.MMTBL).toString().toLowerCase());
+      // 1. Insert two rows to an MM table
+      runStatementOnDriver("drop table " + TableExtended.MMTBL);
+      runStatementOnDriver("create table " + TableExtended.MMTBL + "(a int,b int) stored as parquet "
+          + "TBLPROPERTIES ('transactional'='true', 'transactional_properties'='insert_only')");
+      runStatementOnDriver("insert into " + TableExtended.MMTBL + "(a,b) values(1,2)");
+      runStatementOnDriver("insert into " + TableExtended.MMTBL + "(a,b) values(3,4)");
+      FileStatus[] status = fs.listStatus(tblLocation, FileUtils.STAGING_DIR_PATH_FILTER);
+      // There should be 2 delta dirs in the location
+      Assert.assertEquals(2, status.length);
+      for (FileStatus deltaDir : status) {
+        Assert.assertTrue(deltaDir.getPath().getName().matches("delta_.*"));
+      }
+
+      // 2. Simulate Impala truncating the table: write a base dir (base_0000003) holding a file with no data.
+      // That file is not completely empty (it contains metadata), so delete it and create a 0-byte file instead.
+      runStatementOnDriver("insert overwrite  table " + TableExtended.MMTBL + " select * from "
+          + TableExtended.MMTBL + " where 1=2");
+      status = fs.listStatus(tblLocation, FileUtils.STAGING_DIR_PATH_FILTER);
+      // There should be 2 delta dirs, plus 1 base dir in the location
+      Assert.assertEquals(3, status.length);
+      verifyDir(2, true);
+      Path basePath = new Path(tblLocation, "base_0000003");
+      Assert.assertTrue("Deleting file under base failed", fs.delete(new Path(basePath, "000000_0")));
+      fs.create(new Path(basePath, "empty")).close(); // close the returned stream so it is not leaked
+
+      // 3. Verify query result. Selecting from a truncated table should return nothing.
+      List<String> rs = runStatementOnDriver("select a,b from " + TableExtended.MMTBL + " order by a,b");
+      Assert.assertEquals(Collections.emptyList(), rs);
+
+      // 4. Perform a major compaction. Cleaner should remove the 2 delta dirs.
+      runStatementOnDriver("alter table "+ TableExtended.MMTBL + " compact 'MAJOR'");
+      runWorker(hiveConf);
+      runCleaner(hiveConf);
+      verifyDir(0, true);
+      rs = runStatementOnDriver("select a,b from " + TableExtended.MMTBL + " order by a,b");
+      Assert.assertEquals(Collections.emptyList(), rs);
+    } finally { // restore the setting even when a statement or assertion above fails
+      if (!vectorized) {
+        d.getConf().setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true);
+      }
+    }
+  }
+
   private void verifyDirAndResult(int expectedDeltas) throws Exception {
     verifyDirAndResult(expectedDeltas, false);
   }
   private void verifyDirAndResult(int expectedDeltas, boolean expectBaseDir) throws Exception {
+    verifyDir(expectedDeltas, expectBaseDir);
+
+    // Verify query result
+    int [][] resultData = new int[][] {{1,2}, {3,4}};
+    List<String> rs = runStatementOnDriver("select a,b from " + TableExtended.MMTBL + " order by a,b");
+    Assert.assertEquals(stringifyValues(resultData), rs);
+  }
+
+  private void verifyDir(int expectedDeltas, boolean expectBaseDir) throws Exception {
     FileSystem fs = FileSystem.get(hiveConf);
     // Verify the content of subdirs
     FileStatus[] status = fs.listStatus(new Path(TEST_WAREHOUSE_DIR + "/" +
@@ -508,10 +584,5 @@ public class TestTxnCommandsForMmTable extends TxnCommandsBaseForTests {
     } else {
       Assert.assertEquals("0 base directories expected", 0, sawBaseTimes);
     }
-
-    // Verify query result
-    int [][] resultData = new int[][] {{1,2}, {3,4}};
-    List<String> rs = runStatementOnDriver("select a,b from " + TableExtended.MMTBL + " order by a,b");
-    Assert.assertEquals(stringifyValues(resultData), rs);
   }
 }