Posted to commits@hive.apache.org by ay...@apache.org on 2022/12/23 03:49:17 UTC

[hive] branch master updated: HIVE-26884: Iceberg: V2 Vectorization returns wrong results with deletes. (#3890). (Ayush Saxena, reviewed by Denys Kuzmenko)

This is an automated email from the ASF dual-hosted git repository.

ayushsaxena pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 84e65538e10 HIVE-26884: Iceberg: V2 Vectorization returns wrong results with deletes. (#3890). (Ayush Saxena, reviewed by Denys Kuzmenko)
84e65538e10 is described below

commit 84e65538e1009e163d8e81f8645aef9344e24499
Author: Ayush Saxena <ay...@apache.org>
AuthorDate: Fri Dec 23 09:19:04 2022 +0530

    HIVE-26884: Iceberg: V2 Vectorization returns wrong results with deletes. (#3890). (Ayush Saxena, reviewed by Denys Kuzmenko)
---
 .../hive/vector/TestHiveIcebergVectorization.java  | 52 ++++++++++++++++++++++
 .../vector/VectorizedParquetRecordReader.java      |  4 +-
 2 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/vector/TestHiveIcebergVectorization.java b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/vector/TestHiveIcebergVectorization.java
index 34fc0673153..c35fa22568c 100644
--- a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/vector/TestHiveIcebergVectorization.java
+++ b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/vector/TestHiveIcebergVectorization.java
@@ -21,10 +21,12 @@ package org.apache.iceberg.mr.hive.vector;
 
 import java.util.Iterator;
 import java.util.List;
+import java.util.Map;
 import java.util.function.Function;
 import java.util.stream.Collectors;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.type.Date;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.ql.exec.Utilities;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
@@ -53,6 +55,7 @@ import org.apache.iceberg.mr.hive.HiveIcebergStorageHandlerWithEngineBase;
 import org.apache.iceberg.mr.hive.TestTables;
 import org.apache.iceberg.mr.hive.serde.objectinspector.IcebergObjectInspector;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.relocated.com.google.common.collect.Maps;
 import org.apache.iceberg.types.Types;
 import org.junit.Assert;
 import org.junit.Assume;
@@ -188,6 +191,55 @@ public class TestHiveIcebergVectorization extends HiveIcebergStorageHandlerWithE
     validation.apply(1501);
   }
 
+  @Test
+  public void testHiveDeleteFilterWithFilteredParquetBlock() {
+    Assume.assumeTrue(
+        isVectorized && testTableType == TestTables.TestTableType.HIVE_CATALOG && fileFormat == FileFormat.PARQUET);
+
+    Schema schema = new Schema(
+        optional(1, "customer_id", Types.LongType.get()),
+        optional(2, "customer_age", Types.IntegerType.get()),
+        optional(3, "date_col", Types.DateType.get())
+    );
+
+    // Generate 10600 records so that we end up with multiple batches to work with during the read.
+    List<Record> records = TestHelper.generateRandomRecords(schema, 10600, 0L);
+
+    // Fill the id and date columns with deterministic values
+    for (int i = 0; i < records.size(); ++i) {
+      records.get(i).setField("customer_id", (long) i);
+      if (i % 3 == 0) {
+        records.get(i).setField("date_col", Date.valueOf("2022-04-28"));
+      } else if (i % 3 == 1) {
+        records.get(i).setField("date_col", Date.valueOf("2022-04-29"));
+      } else {
+        records.get(i).setField("date_col", Date.valueOf("2022-04-30"));
+      }
+    }
+    Map<String, String> props = Maps.newHashMap();
+    props.put("parquet.block.size", "8192");
+    testTables.createTable(shell, "vectordelete", schema, PartitionSpec.unpartitioned(), fileFormat, records, 2, props);
+
+    // Check there are some rows before we do an update
+    List<Object[]> results = shell.executeStatement("select * from vectordelete where date_col=date'2022-04-29'");
+
+    Assert.assertNotEquals(0, results.size());
+
+    // Capture the number of entries matching either column value, to validate against after the update
+    List<Object[]> postUpdateResult = shell.executeStatement(
+        "select * from vectordelete where date_col=date'2022-04-29' OR date_col=date'2022-04-30'");
+
+    Assert.assertNotEquals(0, postUpdateResult.size());
+
+    // Update the column and check that the count for the old value is 0, since we changed it
+    shell.executeStatement("update vectordelete set date_col=date'2022-04-30' where date_col=date'2022-04-29'");
+    results = shell.executeStatement("select * from vectordelete where date_col=date'2022-04-29'");
+    Assert.assertEquals(0, results.size());
+
+    results = shell.executeStatement("select * from vectordelete where date_col=date'2022-04-30'");
+    Assert.assertEquals(postUpdateResult.size(), results.size());
+  }
+
   /**
    * Creates a mock vectorized ORC read job for a particular data file and a read schema (projecting on all columns)
    * @param schema readSchema
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
index c1c0a120686..33e828ff18b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java
@@ -233,11 +233,11 @@ public class VectorizedParquetRecordReader extends ParquetRecordReaderBase
     long allRowsInFile = 0;
     int blockIndex = 0;
     for (BlockMetaData block : parquetMetadata.getBlocks()) {
-      rowGroupNumToRowPos.put(blockIndex++, allRowsInFile);
-      allRowsInFile += block.getRowCount();
       if (offsets.contains(block.getStartingPos())) {
+        rowGroupNumToRowPos.put(blockIndex++, allRowsInFile);
         blocks.add(block);
       }
+      allRowsInFile += block.getRowCount();
     }
     // verify we found them all
     if (blocks.size() != rowGroupOffsets.length) {
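
For context, the reader-side change above is small but subtle. Previously the loop assigned a slot in rowGroupNumToRowPos to every row group in the file, while the blocks list only kept the groups covered by the split's offsets; when some Parquet blocks were filtered out, the selected groups were presumably looked up under shifted indices and wrong starting row positions, so the positional delete filter could drop the wrong rows. The patch indexes only the selected groups, while still accumulating allRowsInFile over every group, so each selected group keeps its absolute starting row within the file. The snippet below is a minimal, self-contained sketch of that corrected mapping; RowGroup, mapSelectedRowGroups and the sample numbers are hypothetical stand-ins for Parquet's BlockMetaData and the reader's internals, not Hive code.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

// Minimal sketch of the corrected row-group position mapping.
// "RowGroup" is a hypothetical stand-in for Parquet's BlockMetaData.
public class RowGroupPositionSketch {

  static final class RowGroup {
    final long startingPos;  // byte offset of the row group in the file
    final long rowCount;     // number of rows in the row group

    RowGroup(long startingPos, long rowCount) {
      this.startingPos = startingPos;
      this.rowCount = rowCount;
    }
  }

  // Maps each *selected* row group (one whose starting position is in
  // offsets) to the absolute row position of its first row in the file.
  // Row counts of skipped groups still advance the running total, so
  // positional delete filters line up with the right rows.
  static Map<Integer, Long> mapSelectedRowGroups(List<RowGroup> allGroups, Set<Long> offsets) {
    Map<Integer, Long> rowGroupNumToRowPos = new HashMap<>();
    long allRowsInFile = 0;
    int blockIndex = 0;
    for (RowGroup group : allGroups) {
      if (offsets.contains(group.startingPos)) {
        // Only selected groups get an index, but the stored position
        // reflects every row that precedes the group in the file.
        rowGroupNumToRowPos.put(blockIndex++, allRowsInFile);
      }
      allRowsInFile += group.rowCount;
    }
    return rowGroupNumToRowPos;
  }

  public static void main(String[] args) {
    // Three row groups of 100 rows each; the split only covers the last two.
    List<RowGroup> groups = new ArrayList<>();
    groups.add(new RowGroup(4L, 100));
    groups.add(new RowGroup(5_000L, 100));
    groups.add(new RowGroup(10_000L, 100));
    Set<Long> offsets = Set.of(5_000L, 10_000L);

    // The map ends up with 0 -> 100 and 1 -> 200: the first selected group
    // starts at absolute row 100, not 0, which is what the pre-fix code
    // effectively got wrong once groups were filtered out.
    System.out.println(mapSelectedRowGroups(groups, offsets));
  }
}

With this mapping in place, the new testHiveDeleteFilterWithFilteredParquetBlock test forces multiple small Parquet blocks (parquet.block.size of 8192) and verifies that an UPDATE, which relies on positional deletes, removes exactly the intended rows.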