You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by xu...@apache.org on 2018/01/17 08:02:52 UTC
hive git commit: HIVE-18411: Fix ArrayIndexOutOfBoundsException for
VectorizedListColumnReader (Colin Ma, reviewed by Ferdinand Xu)
Repository: hive
Updated Branches:
refs/heads/master 7942bc6c9 -> 17abdb211
HIVE-18411: Fix ArrayIndexOutOfBoundsException for VectorizedListColumnReader (Colin Ma, reviewed by Ferdinand Xu)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/17abdb21
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/17abdb21
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/17abdb21
Branch: refs/heads/master
Commit: 17abdb211c1b2b749fc7d8265d31e6c5987cea4b
Parents: 7942bc6
Author: Ferdinand Xu <ch...@intel.com>
Authored: Wed Jan 17 15:39:54 2018 +0800
Committer: Ferdinand Xu <ch...@intel.com>
Committed: Wed Jan 17 15:39:54 2018 +0800
----------------------------------------------------------------------
.../vector/VectorizedListColumnReader.java | 5 +++
.../parquet/TestVectorizedListColumnReader.java | 33 ++++++++++++++++++++
2 files changed, 38 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/17abdb21/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java
index 12af77c..04fa129 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java
@@ -19,6 +19,7 @@ import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
@@ -52,6 +53,10 @@ public class VectorizedListColumnReader extends BaseVectorizedColumnReader {
@Override
public void readBatch(int total, ColumnVector column, TypeInfo columnType) throws IOException {
ListColumnVector lcv = (ListColumnVector) column;
+ // Before readBatch, initialize the sizes of offsets & lengths to the default value;
+ // the actual size will be assigned in setChildrenInfo() after reading completes.
+ lcv.offsets = new long[VectorizedRowBatch.DEFAULT_SIZE];
+ lcv.lengths = new long[VectorizedRowBatch.DEFAULT_SIZE];
// Because the length of ListColumnVector.child can't be known now,
// the valueList will save all data for ListColumnVector temporary.
List<Object> valueList = new ArrayList<>();
http://git-wip-us.apache.org/repos/asf/hive/blob/17abdb21/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java
index 8ea5d25..d241fc8 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestVectorizedListColumnReader.java
@@ -166,6 +166,14 @@ public class TestVectorizedListColumnReader extends VectorizedColumnReaderTestBa
removeFile();
}
+ @Test
+ public void testVectorizedRowBatchSizeChange() throws Exception {
+ removeFile();
+ writeListData(initWriterFromFile(), false, 1200);
+ testVectorizedRowBatchSizeChangeListRead();
+ removeFile();
+ }
+
private void testListReadAllType(boolean isDictionaryEncoding, int elementNum) throws Exception {
testListRead(isDictionaryEncoding, "int", elementNum);
testListRead(isDictionaryEncoding, "long", elementNum);
@@ -337,4 +345,29 @@ public class TestVectorizedListColumnReader extends VectorizedColumnReaderTestBa
reader.close();
}
}
+
+ private void testVectorizedRowBatchSizeChangeListRead() throws Exception {
+ Configuration conf = new Configuration();
+ conf.set(IOConstants.COLUMNS, "list_binary_field_for_repeat_test");
+ conf.set(IOConstants.COLUMNS_TYPES, "array<string>");
+ conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
+ conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
+ VectorizedParquetRecordReader reader = createTestParquetReader(
+ "message hive_schema {repeated binary list_binary_field_for_repeat_test;}", conf);
+ VectorizedRowBatch previous = reader.createValue();
+ try {
+ while (reader.next(NullWritable.get(), previous)) {
+ ListColumnVector vector = (ListColumnVector) previous.cols[0];
+ // When dealing with big data, the VectorizedRowBatch is reused across different file splits
+ // to cache the data. Here is the situation: the first split has only 100 rows,
+ // and the VectorizedRowBatch caches them; meanwhile, the size of the VectorizedRowBatch is
+ // updated to 100. The following code simulates that size change, and there should be no
+ // ArrayIndexOutOfBoundsException when processing the next split, which has more than 100 rows.
+ vector.lengths = new long[100];
+ vector.offsets = new long[100];
+ }
+ } finally {
+ reader.close();
+ }
+ }
}