Posted to commits@hive.apache.org by kg...@apache.org on 2021/11/18 07:29:58 UTC

[hive] branch master updated: HIVE-25443 : Arrow SerDe Cannot serialize/deserialize complex data types When there are more than 1024 values (#2581) (Syed Shameerur Rahman reviewed by Zoltan Haindrich)

This is an automated email from the ASF dual-hosted git repository.

kgyrtkirk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 8e693d1  HIVE-25443 : Arrow SerDe Cannot serialize/deserialize complex data types When there are more than 1024 values (#2581) (Syed Shameerur Rahman reviewed by Zoltan Haindrich)
8e693d1 is described below

commit 8e693d1b36e1ff0aacd802d16e1a3d0ec72ef04b
Author: Syed Shameerur Rahman <rh...@amazon.com>
AuthorDate: Thu Nov 18 12:59:50 2021 +0530

    HIVE-25443 : Arrow SerDe Cannot serialize/deserialize complex data types When there are more than 1024 values (#2581) (Syed Shameerur Rahman reviewed by Zoltan Haindrich)
---
 .../hive/ql/io/arrow/ArrowColumnarBatchSerDe.java  |  4 +-
 .../hadoop/hive/ql/io/arrow/Deserializer.java      |  3 ++
 .../ql/io/arrow/TestArrowColumnarBatchSerDe.java   | 43 ++++++++++++++++++++++
 3 files changed, 48 insertions(+), 2 deletions(-)
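
The root cause: Hive's ColumnVector classes allocate room for
VectorizedRowBatch.DEFAULT_SIZE (1024) entries when built through their
no-arg constructors, and the Arrow read path never grew them, so any batch
carrying more than 1024 values in a complex column overflowed the backing
arrays. A minimal sketch of the failure and the fix, against
hive-storage-api alone (illustrative; not part of the commit):

    import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;

    public class EnsureSizeSketch {
      public static void main(String[] args) {
        // The no-arg constructor sizes offsets/lengths for
        // VectorizedRowBatch.DEFAULT_SIZE (1024) entries.
        ListColumnVector vector = new ListColumnVector();
        System.out.println(vector.offsets.length);   // prints 1024

        int valueCount = 1025;  // stand-in for arrowVector.getValueCount()
        // Without this call, the write below throws
        // ArrayIndexOutOfBoundsException -- the failure this commit fixes.
        // 'false' means prior contents need not survive the resize.
        vector.ensureSize(valueCount, false);
        vector.offsets[valueCount - 1] = 0;          // now in bounds
      }
    }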

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/ArrowColumnarBatchSerDe.java b/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/ArrowColumnarBatchSerDe.java
index fdef3b8..ceb794f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/ArrowColumnarBatchSerDe.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/ArrowColumnarBatchSerDe.java
@@ -210,9 +210,9 @@ public class ArrowColumnarBatchSerDe extends AbstractSerDe {
   static ListColumnVector toStructListVector(MapColumnVector mapVector) {
     final StructColumnVector structVector;
     final ListColumnVector structListVector;
-    structVector = new StructColumnVector();
+    structVector = new StructColumnVector(mapVector.childCount);
     structVector.fields = new ColumnVector[] {mapVector.keys, mapVector.values};
-    structListVector = new ListColumnVector();
+    structListVector = new ListColumnVector(mapVector.childCount, null);
     structListVector.child = structVector;
     structListVector.childCount = mapVector.childCount;
     structListVector.isRepeating = mapVector.isRepeating;
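
The two replaced constructors are where maps went wrong: in
hive-storage-api the no-arg forms delegate to the sized ones with a fixed
1024-row capacity, so the scratch struct-list view built for a
MapColumnVector could never hold more than 1024 key/value entries. Sizing
both vectors by mapVector.childCount matches the view to the data it wraps
(constructor chain paraphrased below; see hive-storage-api for the exact
code):

    // Paraphrased from hive-storage-api:
    // public StructColumnVector() { this(VectorizedRowBatch.DEFAULT_SIZE); }
    // public ListColumnVector()   { this(VectorizedRowBatch.DEFAULT_SIZE, null); }
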
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Deserializer.java b/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Deserializer.java
index ac4d237..ce8488f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Deserializer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Deserializer.java
@@ -391,6 +391,7 @@ class Deserializer {
 
   private void readList(FieldVector arrowVector, ListColumnVector hiveVector, ListTypeInfo typeInfo) {
     final int size = arrowVector.getValueCount();
+    hiveVector.ensureSize(size, false);
     final ArrowBuf offsets = arrowVector.getOffsetBuffer();
     final int OFFSET_WIDTH = 4;
 
@@ -412,6 +413,7 @@ class Deserializer {
 
   private void readMap(FieldVector arrowVector, MapColumnVector hiveVector, MapTypeInfo typeInfo) {
     final int size = arrowVector.getValueCount();
+    hiveVector.ensureSize(size, false);
     final ListTypeInfo mapStructListTypeInfo = toStructListTypeInfo(typeInfo);
     final ListColumnVector mapStructListVector = toStructListVector(hiveVector);
     final StructColumnVector mapStructVector = (StructColumnVector) mapStructListVector.child;
@@ -430,6 +432,7 @@ class Deserializer {
 
   private void readStruct(FieldVector arrowVector, StructColumnVector hiveVector, StructTypeInfo typeInfo) {
     final int size = arrowVector.getValueCount();
+    hiveVector.ensureSize(size, false);
     final List<TypeInfo> fieldTypeInfos = typeInfo.getAllStructFieldTypeInfos();
     final int fieldSize = arrowVector.getChildrenFromFields().size();
     for (int i = 0; i < fieldSize; i++) {
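
Each of the three read paths (list, map, struct) now grows the destination
vector to the incoming value count before writing into it. The same guard
in isolation (illustrative sketch assuming hive-storage-api; the constant
4096 stands in for arrowVector.getValueCount()):

    import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;

    public class EnsureSizeGuard {
      public static void main(String[] args) {
        StructColumnVector hiveVector =
            new StructColumnVector(1024, new LongColumnVector());
        int size = 4096;  // stand-in for arrowVector.getValueCount()
        // Grows isNull and every child field to at least 'size';
        // 'false' says existing contents need not be preserved.
        hiveVector.ensureSize(size, false);
        System.out.println(hiveVector.isNull.length);  // >= 4096
      }
    }
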
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/arrow/TestArrowColumnarBatchSerDe.java b/ql/src/test/org/apache/hadoop/hive/ql/io/arrow/TestArrowColumnarBatchSerDe.java
index d803063..a4b296b 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/arrow/TestArrowColumnarBatchSerDe.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/arrow/TestArrowColumnarBatchSerDe.java
@@ -17,6 +17,7 @@
  */
 package org.apache.hadoop.hive.ql.io.arrow;
 
+import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ARROW_BATCH_SIZE;
 import com.google.common.base.Joiner;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
@@ -157,6 +158,7 @@ public class TestArrowColumnarBatchSerDe {
   @Before
   public void setUp() {
     conf = new Configuration();
+    conf.setInt(HIVE_ARROW_BATCH_SIZE.varname, 1025);
   }
 
   private static ByteWritable byteW(int value) {
@@ -1024,4 +1026,45 @@ public class TestArrowColumnarBatchSerDe {
     initAndSerializeAndDeserialize(schema, toList(DECIMAL_ROWS));
   }
 
+  @Test
+  public void testListBooleanWithMoreThan1024Values() throws SerDeException {
+    String[][] schema = {
+            {"boolean_list", "array<boolean>"},
+    };
+
+    Object[][] rows = new Object[1025][1];
+    for (int i = 0; i < 1025; i++) {
+      rows[i][0] = new BooleanWritable(true);
+    }
+
+    initAndSerializeAndDeserialize(schema, toList(rows));
+  }
+
+  @Test
+  public void testStructBooleanWithMoreThan1024Values() throws SerDeException {
+    String[][] schema = {
+            {"boolean_struct", "struct<boolean1:boolean>"},
+    };
+
+    Object[][] rows = new Object[1025][1];
+    for (int i = 0; i < 1025; i++) {
+      rows[i][0] = new BooleanWritable(true);
+    }
+
+    initAndSerializeAndDeserialize(schema, toStruct(rows));
+  }
+
+  @Test
+  public void testMapIntegerWithMoreThan1024Values() throws SerDeException {
+    String[][] schema = {
+            {"int_map", "map<string,int>"},
+    };
+
+    Object[][] rows = new Object[1025][1];
+    for (int i = 0; i < 1025; i++) {
+      rows[i][0] = intW(i);
+    }
+
+    initAndSerializeAndDeserialize(schema, toMap(rows));
+  }
 }
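
The three new tests pin the regression down by raising
hive.arrow.batch.size to 1025, one past the 1024-slot default capacity of
Hive's column vectors, so a single serialized batch must cross the old
limit; without the changes above, each serialize/deserialize round trip
fails on the complex column. They can be run in isolation with the usual
surefire filter (standard Maven usage; not part of the commit):

    mvn test -pl ql -Dtest=TestArrowColumnarBatchSerDe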