You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ay...@apache.org on 2022/12/23 03:49:17 UTC
[hive] branch master updated: HIVE-26884: Iceberg: V2 Vectorization returns wrong results with deletes. (#3890). (Ayush Saxena, reviewed by Denys Kuzmenko)
This is an automated email from the ASF dual-hosted git repository.
ayushsaxena pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 84e65538e10 HIVE-26884: Iceberg: V2 Vectorization returns wrong results with deletes. (#3890). (Ayush Saxena, reviewed by Denys Kuzmenko)
84e65538e10 is described below
commit 84e65538e1009e163d8e81f8645aef9344e24499
Author: Ayush Saxena <ay...@apache.org>
AuthorDate: Fri Dec 23 09:19:04 2022 +0530
HIVE-26884: Iceberg: V2 Vectorization returns wrong results with deletes. (#3890). (Ayush Saxena, reviewed by Denys Kuzmenko)
---
.../hive/vector/TestHiveIcebergVectorization.java | 52 ++++++++++++++++++++++
.../vector/VectorizedParquetRecordReader.java | 4 +-
2 files changed, 54 insertions(+), 2 deletions(-)
diff --git a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/vector/TestHiveIcebergVectorization.java b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/vector/TestHiveIcebergVectorization.java
index 34fc0673153..c35fa22568c 100644
--- a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/vector/TestHiveIcebergVectorization.java
+++ b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/vector/TestHiveIcebergVectorization.java
@@ -21,10 +21,12 @@ package org.apache.iceberg.mr.hive.vector;
import java.util.Iterator;
import java.util.List;
+import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.type.Date;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
@@ -53,6 +55,7 @@ import org.apache.iceberg.mr.hive.HiveIcebergStorageHandlerWithEngineBase;
import org.apache.iceberg.mr.hive.TestTables;
import org.apache.iceberg.mr.hive.serde.objectinspector.IcebergObjectInspector;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.types.Types;
import org.junit.Assert;
import org.junit.Assume;
@@ -188,6 +191,55 @@ public class TestHiveIcebergVectorization extends HiveIcebergStorageHandlerWithE
validation.apply(1501);
}
+ @Test
+ public void testHiveDeleteFilterWithFilteredParquetBlock() {
+ Assume.assumeTrue(
+ isVectorized && testTableType == TestTables.TestTableType.HIVE_CATALOG && fileFormat == FileFormat.PARQUET);
+
+ Schema schema = new Schema(
+ optional(1, "customer_id", Types.LongType.get()),
+ optional(2, "customer_age", Types.IntegerType.get()),
+ optional(3, "date_col", Types.DateType.get())
+ );
+
+ // Generate 10600 records so that we end up with multiple batches to work with during the read.
+ List<Record> records = TestHelper.generateRandomRecords(schema, 10600, 0L);
+
+ // Fill id and date column with deterministic values
+ for (int i = 0; i < records.size(); ++i) {
+ records.get(i).setField("customer_id", (long) i);
+ if (i % 3 == 0) {
+ records.get(i).setField("date_col", Date.valueOf("2022-04-28"));
+ } else if (i % 3 == 1) {
+ records.get(i).setField("date_col", Date.valueOf("2022-04-29"));
+ } else {
+ records.get(i).setField("date_col", Date.valueOf("2022-04-30"));
+ }
+ }
+ Map<String, String> props = Maps.newHashMap();
+ props.put("parquet.block.size", "8192");
+ testTables.createTable(shell, "vectordelete", schema, PartitionSpec.unpartitioned(), fileFormat, records, 2, props);
+
+ // Check that there are some rows before we do an update
+ List<Object[]> results = shell.executeStatement("select * from vectordelete where date_col=date'2022-04-29'");
+
+ Assert.assertNotEquals(0, results.size());
+
+ // Capture the number of entries with either date value, to validate the count after the update
+ List<Object[]> postUpdateResult = shell.executeStatement(
+ "select * from vectordelete where date_col=date'2022-04-29' OR date_col=date'2022-04-30'");
+
+ Assert.assertNotEquals(0, postUpdateResult.size());
+
+ // Update the column, then check that the count for the old value is 0, since we changed the value for that column
+ shell.executeStatement("update vectordelete set date_col=date'2022-04-30' where date_col=date'2022-04-29'");
+ results = shell.executeStatement("select * from vectordelete where date_col=date'2022-04-29'");
+ Assert.assertEquals(0, results.size());
+
+ results = shell.executeStatement("select * from vectordelete where date_col=date'2022-04-30'");
+ Assert.assertEquals(postUpdateResult.size(), results.size());
+ }
+
/**
* Creates a mock vectorized ORC read job for a particular data file and a read schema (projecting on all columns)
* @param schema readSchema
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
index c1c0a120686..33e828ff18b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
@@ -233,11 +233,11 @@ public class VectorizedParquetRecordReader extends ParquetRecordReaderBase
long allRowsInFile = 0;
int blockIndex = 0;
for (BlockMetaData block : parquetMetadata.getBlocks()) {
- rowGroupNumToRowPos.put(blockIndex++, allRowsInFile);
- allRowsInFile += block.getRowCount();
if (offsets.contains(block.getStartingPos())) {
+ rowGroupNumToRowPos.put(blockIndex++, allRowsInFile);
blocks.add(block);
}
+ allRowsInFile += block.getRowCount();
}
// verify we found them all
if (blocks.size() != rowGroupOffsets.length) {