You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@iceberg.apache.org by GitBox <gi...@apache.org> on 2022/06/27 17:21:21 UTC
[GitHub] [iceberg] kbendick commented on a diff in pull request #5137: Arrow: Avoid extra dictionary buffer copy

kbendick commented on code in PR #5137:
URL: https://github.com/apache/iceberg/pull/5137#discussion_r907617653


##########
spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java:
##########
@@ -78,6 +79,17 @@ public UTF8String ofRow(VarCharVector vector, int rowId) {
     public UTF8String ofBytes(byte[] bytes) {
       return UTF8String.fromBytes(bytes);
     }
+
+    @Override
+    public UTF8String ofByteBuffer(ByteBuffer byteBuffer) {
+      if (byteBuffer.hasArray()) {
+        return UTF8String.fromBytes(
+                byteBuffer.array(), byteBuffer.arrayOffset() + byteBuffer.position(), byteBuffer.remaining());

Review Comment:
   Nit: over-indented (should be 4 spaces from the start of `return` on the line above).



##########
spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java:
##########
@@ -78,6 +79,17 @@ public UTF8String ofRow(VarCharVector vector, int rowId) {
     public UTF8String ofBytes(byte[] bytes) {
       return UTF8String.fromBytes(bytes);
     }
+
+    @Override
+    public UTF8String ofByteBuffer(ByteBuffer byteBuffer) {
+      if (byteBuffer.hasArray()) {
+        return UTF8String.fromBytes(
+                byteBuffer.array(), byteBuffer.arrayOffset() + byteBuffer.position(), byteBuffer.remaining());

Review Comment:
   Nit: indentation



##########
arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java:
##########
@@ -372,22 +365,26 @@ public final Utf8StringT getUTF8String(int rowId) {
 
   private static class DictionaryStringAccessor<DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable>
           extends ArrowVectorAccessor<DecimalT, Utf8StringT, ArrayT, ChildVectorT> {
-    private final Utf8StringT[] decodedDictionary;
+    private final Dictionary dictionary;
+    private final StringFactory<Utf8StringT> stringFactory;
     private final IntVector offsetVector;
+    private final Utf8StringT[] cache;
 
     DictionaryStringAccessor(IntVector vector, Dictionary dictionary, StringFactory<Utf8StringT> stringFactory) {
       super(vector);
       this.offsetVector = vector;
-      this.decodedDictionary = IntStream.rangeClosed(0, dictionary.getMaxId())
-          .mapToObj(dictionary::decodeToBinary)
-          .map(binary -> stringFactory.ofBytes(binary.getBytes()))
-          .toArray(genericArray(stringFactory.getGenericClass()));
+      this.dictionary = dictionary;
+      this.stringFactory = stringFactory;
+      this.cache = genericArray(stringFactory.getGenericClass(), dictionary.getMaxId() + 1);

Review Comment:
   Question: why are you adding 1 here?



##########
arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowVectorAccessors.java:
##########
@@ -66,5 +67,14 @@ public String ofRow(VarCharVector vector, int rowId) {
     public String ofBytes(byte[] bytes) {
       return new String(bytes, StandardCharsets.UTF_8);
     }
+
+    @Override
+    public String ofByteBuffer(ByteBuffer byteBuffer) {
+      if (byteBuffer.hasArray()) {
+        return new String(byteBuffer.array(), byteBuffer.arrayOffset() + byteBuffer.position(),
+                byteBuffer.remaining(), StandardCharsets.UTF_8);

Review Comment:
   Nit: over-indented (should be 4 spaces from the start of `return` on the line above).



##########
spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java:
##########
@@ -78,6 +79,17 @@ public UTF8String ofRow(VarCharVector vector, int rowId) {
     public UTF8String ofBytes(byte[] bytes) {
       return UTF8String.fromBytes(bytes);
     }
+
+    @Override
+    public UTF8String ofByteBuffer(ByteBuffer byteBuffer) {
+      if (byteBuffer.hasArray()) {
+        return UTF8String.fromBytes(
+                byteBuffer.array(), byteBuffer.arrayOffset() + byteBuffer.position(), byteBuffer.remaining());

Review Comment:
   Nit: indentation



##########
spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java:
##########
@@ -78,6 +79,17 @@ public UTF8String ofRow(VarCharVector vector, int rowId) {
     public UTF8String ofBytes(byte[] bytes) {
       return UTF8String.fromBytes(bytes);
     }
+
+    @Override
+    public UTF8String ofByteBuffer(ByteBuffer byteBuffer) {
+      if (byteBuffer.hasArray()) {
+        return UTF8String.fromBytes(
+                byteBuffer.array(), byteBuffer.arrayOffset() + byteBuffer.position(), byteBuffer.remaining());

Review Comment:
   Nit: indentation



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org
For additional commands, e-mail: issues-help@iceberg.apache.org