You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by xu...@apache.org on 2018/01/17 08:02:52 UTC

hive git commit: HIVE-18411: Fix ArrayIndexOutOfBoundsException for VectorizedListColumnReader (Colin Ma, reviewed by Ferdinand Xu)

Repository: hive
Updated Branches:
  refs/heads/master 7942bc6c9 -> 17abdb211


HIVE-18411: Fix ArrayIndexOutOfBoundsException for VectorizedListColumnReader (Colin Ma, reviewed by Ferdinand Xu)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/17abdb21
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/17abdb21
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/17abdb21

Branch: refs/heads/master
Commit: 17abdb211c1b2b749fc7d8265d31e6c5987cea4b
Parents: 7942bc6
Author: Ferdinand Xu <ch...@intel.com>
Authored: Wed Jan 17 15:39:54 2018 +0800
Committer: Ferdinand Xu <ch...@intel.com>
Committed: Wed Jan 17 15:39:54 2018 +0800

----------------------------------------------------------------------
 .../vector/VectorizedListColumnReader.java      |  5 +++
 .../parquet/TestVectorizedListColumnReader.java | 33 ++++++++++++++++++++
 2 files changed, 38 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/17abdb21/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java
index 12af77c..04fa129 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java
@@ -19,6 +19,7 @@ import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
 import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
@@ -52,6 +53,10 @@ public class VectorizedListColumnReader extends BaseVectorizedColumnReader {
   @Override
   public void readBatch(int total, ColumnVector column, TypeInfo columnType) throws IOException {
     ListColumnVector lcv = (ListColumnVector) column;
+    // Before readBatch, initialize the sizes of offsets & lengths to the default value;
+    // the actual size will be assigned in setChildrenInfo() after reading completes.
+    lcv.offsets = new long[VectorizedRowBatch.DEFAULT_SIZE];
+    lcv.lengths = new long[VectorizedRowBatch.DEFAULT_SIZE];
     // Because the length of ListColumnVector.child can't be known now,
     // the valueList will temporarily hold all data for the ListColumnVector.
     List<Object> valueList = new ArrayList<>();

http://git-wip-us.apache.org/repos/asf/hive/blob/17abdb21/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java
index 8ea5d25..d241fc8 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java
@@ -166,6 +166,14 @@ public class TestVectorizedListColumnReader extends VectorizedColumnReaderTestBa
     removeFile();
   }
 
+  @Test
+  public void testVectorizedRowBatchSizeChange() throws Exception {
+    removeFile();
+    writeListData(initWriterFromFile(), false, 1200);
+    testVectorizedRowBatchSizeChangeListRead();
+    removeFile();
+  }
+
   private void testListReadAllType(boolean isDictionaryEncoding, int elementNum) throws Exception {
     testListRead(isDictionaryEncoding, "int", elementNum);
     testListRead(isDictionaryEncoding, "long", elementNum);
@@ -337,4 +345,29 @@ public class TestVectorizedListColumnReader extends VectorizedColumnReaderTestBa
       reader.close();
     }
   }
+
+  private void testVectorizedRowBatchSizeChangeListRead() throws Exception {
+    Configuration conf = new Configuration();
+    conf.set(IOConstants.COLUMNS, "list_binary_field_for_repeat_test");
+    conf.set(IOConstants.COLUMNS_TYPES, "array<string>");
+    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
+    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
+    VectorizedParquetRecordReader reader = createTestParquetReader(
+        "message hive_schema {repeated binary list_binary_field_for_repeat_test;}", conf);
+    VectorizedRowBatch previous = reader.createValue();
+    try {
+      while (reader.next(NullWritable.get(), previous)) {
+        ListColumnVector vector = (ListColumnVector) previous.cols[0];
+        // When dealing with big data, the VectorizedRowBatch will be reused across different
+        // file splits to cache the data. Here is the situation: the first split only has 100
+        // rows, and the VectorizedRowBatch caches them; meanwhile, the size of the
+        // VectorizedRowBatch will be updated to 100. The following code simulates that size
+        // change; there should be no ArrayIndexOutOfBoundsException when processing the next
+        // split, which has more than 100 rows.
+        vector.lengths = new long[100];
+        vector.offsets = new long[100];
+      }
+    } finally {
+      reader.close();
+    }
+  }
 }