Posted to commits@hive.apache.org by kl...@apache.org on 2020/08/25 08:04:14 UTC
[hive] branch master updated: HIVE-24023: Hive parquet reader can't read files with length=0 (Karen Coppage, reviewed by Marta Kuczora)
This is an automated email from the ASF dual-hosted git repository.
klcopp pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 387f0da HIVE-24023: Hive parquet reader can't read files with length=0 (Karen Coppage, reviewed by Marta Kuczora)
387f0da is described below
commit 387f0da9155a0e7b47ec39aeb9002c2b4cd75656
Author: Karen Coppage <ka...@gmail.com>
AuthorDate: Tue Aug 25 10:03:57 2020 +0200
HIVE-24023: Hive parquet reader can't read files with length=0 (Karen Coppage, reviewed by Marta Kuczora)
Closes #1388
---
.../ql/io/parquet/ParquetRecordReaderBase.java | 3 +
.../hadoop/hive/ql/stats/BasicStatsNoJobTask.java | 2 +-
.../hadoop/hive/ql/TestTxnCommandsForMmTable.java | 81 ++++++++++++++++++++--
3 files changed, 80 insertions(+), 6 deletions(-)
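In short: a Parquet file can never be truly empty, since the format requires the 4-byte "PAR1" magic at both ends plus a 4-byte footer length, so a zero-length file has no footer to parse and the reader previously failed on it instead of returning zero rows. The sketch below is illustrative only and not part of this commit; the class name, helper name, and minimum-size constant are assumptions, but it shows the structural reason a zero-length split can be skipped outright.

    // Illustrative sketch, not Hive code: a structurally valid Parquet file must
    // hold at least the leading magic, a footer length, and the trailing magic,
    // so a zero-length file cannot contain rows and can safely be skipped.
    import java.io.IOException;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public final class EmptyParquetCheck {
      // 4-byte "PAR1" magic + 4-byte footer length + 4-byte "PAR1" magic
      private static final long PARQUET_STRUCTURAL_MINIMUM = 12L;

      private EmptyParquetCheck() {}

      /** Returns true only if the file is large enough to possibly be valid Parquet. */
      public static boolean mightContainData(FileSystem fs, Path file) throws IOException {
        return fs.getFileStatus(file).getLen() >= PARQUET_STRUCTURAL_MINIMUM;
      }
    }
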
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
index 577051d..c52bc9d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
@@ -75,6 +75,9 @@ public class ParquetRecordReaderBase {
final org.apache.hadoop.mapred.InputSplit oldSplit,
final JobConf conf
) throws IOException {
+ if (oldSplit.getLength() == 0) {
+ return null;
+ }
ParquetInputSplit split;
if (oldSplit instanceof FileSplit) {
final Path finalPath = ((FileSplit) oldSplit).getPath();
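The guard added above makes getSplit() return null for a zero-length split before any Parquet footer parsing is attempted. A minimal sketch of the assumed calling pattern follows; the class and member names are illustrative, not the actual Hive reader, and the point is only that a null split is treated downstream as "no records" rather than as an error.

    // Illustrative only: how a caller is assumed to treat the null split.
    import java.io.IOException;
    import org.apache.hadoop.mapred.InputSplit;
    import org.apache.hadoop.mapred.JobConf;

    abstract class EmptySplitAwareReader {
      private boolean eof = false;

      // Stands in for ParquetRecordReaderBase.getSplit(); returns null for empty splits.
      protected abstract Object getSplit(InputSplit oldSplit, JobConf conf) throws IOException;

      protected void initialize(InputSplit oldSplit, JobConf conf) throws IOException {
        if (getSplit(oldSplit, conf) == null) {
          eof = true;   // zero-length file: act as if it holds no rows
          return;
        }
        // normal Parquet reader setup would continue here
      }

      protected boolean hasMore() {
        return !eof;    // an empty file yields no records
      }
    }
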
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsNoJobTask.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsNoJobTask.java
index 53b3065..c6533cf 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsNoJobTask.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStatsNoJobTask.java
@@ -187,7 +187,7 @@ public class BasicStatsNoJobTask implements IStatsProcessor {
Utilities.FILE_OP_LOGGER.debug("Computing stats for {}", file);
if (!file.isDirectory()) {
InputFormat<?, ?> inputFormat = ReflectionUtil.newInstance(partish.getInputFormatClass(), jc);
- InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0, new String[] { partish.getLocation() });
+ InputSplit dummySplit = new FileSplit(file.getPath(), 0, -1, new String[] { partish.getLocation() });
if (file.getLen() == 0) {
numFiles += 1;
} else {
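The one-character change above exists because the stats task builds a dummy FileSplit only to obtain a reader for footer-based statistics; with the new zero-length guard, a dummy length of 0 would make that reader treat every file as empty. Passing -1 keeps the dummy split from tripping the guard (that reading is inferred from the diff, not stated in the commit message). A self-contained sketch of the construction, using the real Hadoop FileSplit constructor:

    // Sketch only: build a dummy split for a file whose exact length the caller
    // does not care about, using -1 so it is not mistaken for a zero-length file.
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapred.FileSplit;
    import org.apache.hadoop.mapred.InputSplit;

    public final class DummySplits {
      private DummySplits() {}

      public static InputSplit forStats(Path file, String location) {
        return new FileSplit(file, 0, -1, new String[] { location });
      }
    }
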
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommandsForMmTable.java b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommandsForMmTable.java
index 535bf11..4d25f88 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommandsForMmTable.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommandsForMmTable.java
@@ -19,6 +19,7 @@
package org.apache.hadoop.hive.ql;
import java.io.File;
+import java.util.Collections;
import java.util.List;
import org.apache.hadoop.fs.FileStatus;
@@ -481,10 +482,85 @@ public class TestTxnCommandsForMmTable extends TxnCommandsBaseForTests {
verifyDirAndResult(0, true);
}
+ @Test
+ public void testImpalaTruncatedMmTableVectorized() throws Exception {
+ testImpalaTruncatedMmTable(true);
+ }
+
+ @Test
+ public void testImpalaTruncatedMmTableNonVectorized() throws Exception {
+ testImpalaTruncatedMmTable(false);
+ }
+
+ /**
+ * Impala truncates insert-only tables by writing a base directory (like insert overwrite) containing a completely
+ * empty file. Make sure that Hive reads these bases correctly.
+ *
+ * @throws Exception
+ */
+ private void testImpalaTruncatedMmTable(boolean vectorized) throws Exception {
+ if (!vectorized) {
+ d.getConf().setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, false);
+ }
+ FileSystem fs = FileSystem.get(hiveConf);
+ FileStatus[] status;
+ Path tblLocation = new Path(TEST_WAREHOUSE_DIR + "/" +
+ (TableExtended.MMTBL).toString().toLowerCase());
+
+ // 1. Insert two rows to an MM table
+ runStatementOnDriver("drop table " + TableExtended.MMTBL);
+ runStatementOnDriver("create table " + TableExtended.MMTBL + "(a int,b int) stored as parquet "
+ + "TBLPROPERTIES ('transactional'='true', 'transactional_properties'='insert_only')");
+ runStatementOnDriver("insert into " + TableExtended.MMTBL + "(a,b) values(1,2)");
+ runStatementOnDriver("insert into " + TableExtended.MMTBL + "(a,b) values(3,4)");
+ status = fs.listStatus(tblLocation, FileUtils.STAGING_DIR_PATH_FILTER);
+ // There should be 2 delta dirs in the location
+ Assert.assertEquals(2, status.length);
+ for (int i = 0; i < status.length; i++) {
+ Assert.assertTrue(status[i].getPath().getName().matches("delta_.*"));
+ }
+
+ // 2. Simulate Impala truncating the table: write a base dir (base_0000003) containing a file with no data. We
+ // have to delete this file (it's not completely empty, it contains metadata) and create a completely empty file
+ runStatementOnDriver("insert overwrite table " + TableExtended.MMTBL + " select * from "
+ + TableExtended.MMTBL + " where 1=2");
+ status = fs.listStatus(tblLocation, FileUtils.STAGING_DIR_PATH_FILTER);
+ // There should be 2 delta dirs, plus 1 base dir in the location
+ Assert.assertEquals(3, status.length);
+ verifyDir(2, true);
+ Path basePath = new Path(tblLocation, "base_0000003");
+ Assert.assertTrue("Deleting file under base failed", fs.delete(new Path(basePath, "000000_0")));
+ fs.create(new Path(basePath, "empty"));
+
+ // 3. Verify query result. Selecting from a truncated table should return nothing.
+ List<String> rs = runStatementOnDriver("select a,b from " + TableExtended.MMTBL + " order by a,b");
+ Assert.assertEquals(Collections.emptyList(), rs);
+
+ // 4. Perform a major compaction. Cleaner should remove the 2 delta dirs.
+ runStatementOnDriver("alter table "+ TableExtended.MMTBL + " compact 'MAJOR'");
+ runWorker(hiveConf);
+ runCleaner(hiveConf);
+ verifyDir(0, true);
+ rs = runStatementOnDriver("select a,b from " + TableExtended.MMTBL + " order by a,b");
+ Assert.assertEquals(Collections.emptyList(), rs);
+ if (!vectorized) {
+ d.getConf().setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true);
+ }
+ }
+
private void verifyDirAndResult(int expectedDeltas) throws Exception {
verifyDirAndResult(expectedDeltas, false);
}
private void verifyDirAndResult(int expectedDeltas, boolean expectBaseDir) throws Exception {
+ verifyDir(expectedDeltas, expectBaseDir);
+
+ // Verify query result
+ int [][] resultData = new int[][] {{1,2}, {3,4}};
+ List<String> rs = runStatementOnDriver("select a,b from " + TableExtended.MMTBL + " order by a,b");
+ Assert.assertEquals(stringifyValues(resultData), rs);
+ }
+
+ private void verifyDir(int expectedDeltas, boolean expectBaseDir) throws Exception {
FileSystem fs = FileSystem.get(hiveConf);
// Verify the content of subdirs
FileStatus[] status = fs.listStatus(new Path(TEST_WAREHOUSE_DIR + "/" +
@@ -508,10 +584,5 @@ public class TestTxnCommandsForMmTable extends TxnCommandsBaseForTests {
} else {
Assert.assertEquals("0 base directories expected", 0, sawBaseTimes);
}
-
- // Verify query result
- int [][] resultData = new int[][] {{1,2}, {3,4}};
- List<String> rs = runStatementOnDriver("select a,b from " + TableExtended.MMTBL + " order by a,b");
- Assert.assertEquals(stringifyValues(resultData), rs);
}
}
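
For reference, the scenario the new test reproduces can also be set up by hand: an Impala-style truncation leaves a new base directory under the table location containing a single zero-byte file, which Hive must read back as an empty table. The standalone sketch below recreates that layout with the Hadoop FileSystem API; the table location is passed in as an argument, the directory and file names mirror the test, and everything else is illustrative.

    // Illustrative sketch: recreate the "truncated by Impala" layout used in the
    // test, i.e. a base directory holding one completely empty file.
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class SimulateImpalaTruncation {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path base = new Path(args[0], "base_0000003");  // args[0]: table location
        fs.mkdirs(base);
        fs.create(new Path(base, "empty")).close();     // zero-byte file, no Parquet magic
      }
    }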