You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by et...@apache.org on 2023/04/26 07:13:37 UTC

[iceberg] 02/03: Add vectorization support for UTF8String

This is an automated email from the ASF dual-hosted git repository.

etudenhoefner pushed a commit to branch spark-uuid-read-write-support-3.4
in repository https://gitbox.apache.org/repos/asf/iceberg.git

commit ced5a0737fa608645f5ae386b387944912cb697d
Author: Eduard Tudenhoefner <et...@gmail.com>
AuthorDate: Mon Apr 24 17:17:08 2023 +0200

    Add vectorization support for UTF8String
---
 .../java/org/apache/iceberg/util/RandomUtil.java   |  4 ++++
 .../GenericArrowVectorAccessorFactory.java         | 27 +++++++++++++++++++++-
 .../vectorized/ArrowVectorAccessorFactory.java     |  7 ++++++
 3 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/api/src/test/java/org/apache/iceberg/util/RandomUtil.java b/api/src/test/java/org/apache/iceberg/util/RandomUtil.java
index a84dc4d8f8..9131e61661 100644
--- a/api/src/test/java/org/apache/iceberg/util/RandomUtil.java
+++ b/api/src/test/java/org/apache/iceberg/util/RandomUtil.java
@@ -182,6 +182,10 @@ public class RandomUtil {
         BigInteger unscaled = new BigInteger(String.valueOf(value + 1));
         BigDecimal bd = new BigDecimal(unscaled, type.scale());
         return negate(value) ? bd.negate() : bd;
+      case UUID:
+        byte[] uuidBytes = new byte[16];
+        random.nextBytes(uuidBytes);
+        return uuidBytes;
       default:
         throw new IllegalArgumentException(
             "Cannot generate random value for unknown type: " + primitive);
diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java
index e9305e399c..a988516bc6 100644
--- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java
@@ -218,7 +218,8 @@ public class GenericArrowVectorAccessorFactory<
         return new FixedSizeBinaryBackedDecimalAccessor<>(
             (FixedSizeBinaryVector) vector, decimalFactorySupplier.get());
       }
-      return new FixedSizeBinaryAccessor<>((FixedSizeBinaryVector) vector);
+      return new FixedSizeBinaryAccessor<>(
+          (FixedSizeBinaryVector) vector, stringFactorySupplier.get());
     }
     throw new UnsupportedOperationException("Unsupported vector: " + vector.getClass());
   }
@@ -558,16 +559,32 @@ public class GenericArrowVectorAccessorFactory<
       extends ArrowVectorAccessor<DecimalT, Utf8StringT, ArrayT, ChildVectorT> {
 
     private final FixedSizeBinaryVector vector;
+    private final StringFactory<Utf8StringT> stringFactory;
 
     FixedSizeBinaryAccessor(FixedSizeBinaryVector vector) {
       super(vector);
       this.vector = vector;
+      this.stringFactory = null;
+    }
+
+    FixedSizeBinaryAccessor(
+        FixedSizeBinaryVector vector, StringFactory<Utf8StringT> stringFactory) {
+      super(vector);
+      this.vector = vector;
+      this.stringFactory = stringFactory;
     }
 
     @Override
     public byte[] getBinary(int rowId) {
       return vector.get(rowId);
     }
+
+    @Override
+    public Utf8StringT getUTF8String(int rowId) {
+      return null == stringFactory
+          ? super.getUTF8String(rowId)
+          : stringFactory.ofRow(vector, rowId);
+    }
   }
 
   private static class ArrayAccessor<
@@ -794,6 +811,14 @@ public class GenericArrowVectorAccessorFactory<
     /** Create a UTF8 String from the row value in the arrow vector. */
     Utf8StringT ofRow(VarCharVector vector, int rowId);
 
+    /** Create a UTF8 String from the row value in the FixedSizeBinaryVector vector. */
+    default Utf8StringT ofRow(FixedSizeBinaryVector vector, int rowId) {
+      throw new UnsupportedOperationException(
+          String.format(
+              "Creating %s from a FixedSizeBinaryVector is not supported",
+              getGenericClass().getSimpleName()));
+    }
+
     /** Create a UTF8 String from the byte array. */
     Utf8StringT ofBytes(byte[] bytes);
 
diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java
index e32ebcb02b..29e938bb09 100644
--- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java
+++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java
@@ -21,10 +21,12 @@ package org.apache.iceberg.spark.data.vectorized;
 import java.math.BigDecimal;
 import java.nio.ByteBuffer;
 import org.apache.arrow.memory.ArrowBuf;
+import org.apache.arrow.vector.FixedSizeBinaryVector;
 import org.apache.arrow.vector.ValueVector;
 import org.apache.arrow.vector.VarCharVector;
 import org.apache.arrow.vector.complex.ListVector;
 import org.apache.iceberg.arrow.vectorized.GenericArrowVectorAccessorFactory;
+import org.apache.iceberg.util.UUIDUtil;
 import org.apache.spark.sql.types.Decimal;
 import org.apache.spark.sql.vectorized.ArrowColumnVector;
 import org.apache.spark.sql.vectorized.ColumnarArray;
@@ -74,6 +76,11 @@ final class ArrowVectorAccessorFactory
           null, vector.getDataBuffer().memoryAddress() + start, end - start);
     }
 
+    @Override
+    public UTF8String ofRow(FixedSizeBinaryVector vector, int rowId) {
+      return UTF8String.fromString(UUIDUtil.convert(vector.get(rowId)).toString());
+    }
+
     @Override
     public UTF8String ofBytes(byte[] bytes) {
       return UTF8String.fromBytes(bytes);