You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by kg...@apache.org on 2021/11/18 07:29:58 UTC
[hive] branch master updated: HIVE-25443 : Arrow SerDe Cannot serialize/deserialize complex data types When there are more than 1024 values (#2581) (Syed Shameerur Rahman reviewed by Zoltan Haindrich)
This is an automated email from the ASF dual-hosted git repository.
kgyrtkirk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 8e693d1 HIVE-25443 : Arrow SerDe Cannot serialize/deserialize complex data types When there are more than 1024 values (#2581) (Syed Shameerur Rahman reviewed by Zoltan Haindrich)
8e693d1 is described below
commit 8e693d1b36e1ff0aacd802d16e1a3d0ec72ef04b
Author: Syed Shameerur Rahman <rh...@amazon.com>
AuthorDate: Thu Nov 18 12:59:50 2021 +0530
HIVE-25443 : Arrow SerDe Cannot serialize/deserialize complex data types When there are more than 1024 values (#2581) (Syed Shameerur Rahman reviewed by Zoltan Haindrich)
---
.../hive/ql/io/arrow/ArrowColumnarBatchSerDe.java | 4 +-
.../hadoop/hive/ql/io/arrow/Deserializer.java | 3 ++
.../ql/io/arrow/TestArrowColumnarBatchSerDe.java | 43 ++++++++++++++++++++++
3 files changed, 48 insertions(+), 2 deletions(-)
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/ArrowColumnarBatchSerDe.java b/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/ArrowColumnarBatchSerDe.java
index fdef3b8..ceb794f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/ArrowColumnarBatchSerDe.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/ArrowColumnarBatchSerDe.java
@@ -210,9 +210,9 @@ public class ArrowColumnarBatchSerDe extends AbstractSerDe {
static ListColumnVector toStructListVector(MapColumnVector mapVector) {
final StructColumnVector structVector;
final ListColumnVector structListVector;
- structVector = new StructColumnVector();
+ structVector = new StructColumnVector(mapVector.childCount);
structVector.fields = new ColumnVector[] {mapVector.keys, mapVector.values};
- structListVector = new ListColumnVector();
+ structListVector = new ListColumnVector(mapVector.childCount, null);
structListVector.child = structVector;
structListVector.childCount = mapVector.childCount;
structListVector.isRepeating = mapVector.isRepeating;
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Deserializer.java b/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Deserializer.java
index ac4d237..ce8488f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Deserializer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Deserializer.java
@@ -391,6 +391,7 @@ class Deserializer {
private void readList(FieldVector arrowVector, ListColumnVector hiveVector, ListTypeInfo typeInfo) {
final int size = arrowVector.getValueCount();
+ hiveVector.ensureSize(size, false);
final ArrowBuf offsets = arrowVector.getOffsetBuffer();
final int OFFSET_WIDTH = 4;
@@ -412,6 +413,7 @@ class Deserializer {
private void readMap(FieldVector arrowVector, MapColumnVector hiveVector, MapTypeInfo typeInfo) {
final int size = arrowVector.getValueCount();
+ hiveVector.ensureSize(size, false);
final ListTypeInfo mapStructListTypeInfo = toStructListTypeInfo(typeInfo);
final ListColumnVector mapStructListVector = toStructListVector(hiveVector);
final StructColumnVector mapStructVector = (StructColumnVector) mapStructListVector.child;
@@ -430,6 +432,7 @@ class Deserializer {
private void readStruct(FieldVector arrowVector, StructColumnVector hiveVector, StructTypeInfo typeInfo) {
final int size = arrowVector.getValueCount();
+ hiveVector.ensureSize(size, false);
final List<TypeInfo> fieldTypeInfos = typeInfo.getAllStructFieldTypeInfos();
final int fieldSize = arrowVector.getChildrenFromFields().size();
for (int i = 0; i < fieldSize; i++) {
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/arrow/TestArrowColumnarBatchSerDe.java b/ql/src/test/org/apache/hadoop/hive/ql/io/arrow/TestArrowColumnarBatchSerDe.java
index d803063..a4b296b 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/arrow/TestArrowColumnarBatchSerDe.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/arrow/TestArrowColumnarBatchSerDe.java
@@ -17,6 +17,7 @@
*/
package org.apache.hadoop.hive.ql.io.arrow;
+import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ARROW_BATCH_SIZE;
import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
@@ -157,6 +158,7 @@ public class TestArrowColumnarBatchSerDe {
@Before
public void setUp() {
conf = new Configuration();
+ conf.setInt(HIVE_ARROW_BATCH_SIZE.varname, 1025);
}
private static ByteWritable byteW(int value) {
@@ -1024,4 +1026,45 @@ public class TestArrowColumnarBatchSerDe {
initAndSerializeAndDeserialize(schema, toList(DECIMAL_ROWS));
}
+ @Test
+ public void testListBooleanWithMoreThan1024Values() throws SerDeException {
+ String[][] schema = {
+ {"boolean_list", "array<boolean>"},
+ };
+
+ Object[][] rows = new Object[1025][1];
+ for (int i = 0; i < 1025; i++) {
+ rows[i][0] = new BooleanWritable(true);
+ }
+
+ initAndSerializeAndDeserialize(schema, toList(rows));
+ }
+
+ @Test
+ public void testStructBooleanWithMoreThan1024Values() throws SerDeException {
+ String[][] schema = {
+ {"boolean_struct", "struct<boolean1:boolean>"},
+ };
+
+ Object[][] rows = new Object[1025][1];
+ for (int i = 0; i < 1025; i++) {
+ rows[i][0] = new BooleanWritable(true);
+ }
+
+ initAndSerializeAndDeserialize(schema, toStruct(rows));
+ }
+
+ @Test
+ public void testMapIntergerWithMoreThan1024Values() throws SerDeException {
+ String[][] schema = {
+ {"int_map", "map<string,int>"},
+ };
+
+ Object[][] rows = new Object[1025][1];
+ for (int i = 0; i < 1025; i++) {
+ rows[i][0] = intW(i);
+ }
+
+ initAndSerializeAndDeserialize(schema, toMap(rows));
+ }
}