You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2021/08/29 02:44:42 UTC
[orc] branch main updated: ORC-852: Allow DynamicByteArray to return a ByteBuffer (#754)

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 9836870  ORC-852: Allow DynamicByteArray to return a ByteBuffer (#754)
9836870 is described below

commit 98368707a3562e82fcb486f7b864f737b9a2787e
Author: belugabehr <12...@users.noreply.github.com>
AuthorDate: Sat Aug 28 22:44:37 2021 -0400

    ORC-852: Allow DynamicByteArray to return a ByteBuffer (#754)
    
    ### What changes were proposed in this pull request?
    
    Allow `DynamicByteArray` and `Dictionary` to return a ByteBuffer of their content.
    
    ### Why are the changes needed?
    
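    Performance. A key that lies within a single chunk is returned as a ByteBuffer wrapping the backing array, so it no longer has to be copied into a temporary `Text` (e.g. when re-hashing on resize).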
    
    ### How was this patch tested?
    
    No change to functionality. Existing unit tests used.
---
 .../src/java/org/apache/orc/impl/Dictionary.java   |  3 +++
 .../java/org/apache/orc/impl/DictionaryUtils.java  | 22 ++++++++++++++++++++++
 .../java/org/apache/orc/impl/DynamicByteArray.java | 12 ++++++++++++
 .../apache/orc/impl/StringHashTableDictionary.java | 15 +++++++++++----
 .../org/apache/orc/impl/StringRedBlackTree.java    |  6 ++++++
 5 files changed, 54 insertions(+), 4 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/impl/Dictionary.java b/java/core/src/java/org/apache/orc/impl/Dictionary.java
index 776e455..430f343 100644
--- a/java/core/src/java/org/apache/orc/impl/Dictionary.java
+++ b/java/core/src/java/org/apache/orc/impl/Dictionary.java
@@ -22,6 +22,7 @@ import org.apache.hadoop.io.Text;
 
 import java.io.IOException;
 import java.io.OutputStream;
+import java.nio.ByteBuffer;
 
 
 /**
@@ -52,6 +53,8 @@ public interface Dictionary {
    */
   void getText(Text result, int position);
 
+  ByteBuffer getText(int position);
+
   /**
    * Given the position index, write the original string, before being encoded,
    * to the OutputStream.
diff --git a/java/core/src/java/org/apache/orc/impl/DictionaryUtils.java b/java/core/src/java/org/apache/orc/impl/DictionaryUtils.java
index 1de37df..144de02 100644
--- a/java/core/src/java/org/apache/orc/impl/DictionaryUtils.java
+++ b/java/core/src/java/org/apache/orc/impl/DictionaryUtils.java
@@ -21,6 +21,7 @@ import org.apache.hadoop.io.Text;
 
 import java.io.IOException;
 import java.io.OutputStream;
+import java.nio.ByteBuffer;
 
 public class DictionaryUtils {
   private DictionaryUtils() {
@@ -47,6 +48,27 @@ public class DictionaryUtils {
   }
 
   /**
+   * Return a {@code ByteBuffer} containing the data at a certain offset within a
+   * {@code DynamicByteArray}.
+   *
+   * @param position position in the keyOffsets
+   * @param keyOffsets starting offset of each key (in bytes) in the byte array
+   * @param byteArray storing raw bytes of all keys seen in dictionary
+   * @return a {@code ByteBuffer} containing the bytes of the key at {@code position}
+   */
+  public static ByteBuffer getTextInternal(int position, DynamicIntArray keyOffsets,
+      DynamicByteArray byteArray) {
+    final int offset = keyOffsets.get(position);
+    final int length;
+    if (position + 1 == keyOffsets.size()) {
+      length = byteArray.size() - offset;
+    } else {
+      length = keyOffsets.get(position + 1) - offset;
+    }
+    return byteArray.get(offset, length);
+  }
+
+  /**
    * Write a UTF8 string from the byteArray, using the offset in index-array,
    * into an OutputStream
    *
diff --git a/java/core/src/java/org/apache/orc/impl/DynamicByteArray.java b/java/core/src/java/org/apache/orc/impl/DynamicByteArray.java
index 40bef60..fa8e0a0 100644
--- a/java/core/src/java/org/apache/orc/impl/DynamicByteArray.java
+++ b/java/core/src/java/org/apache/orc/impl/DynamicByteArray.java
@@ -294,6 +294,18 @@ public final class DynamicByteArray {
     return result;
   }
 
+  public ByteBuffer get(int offset, int length) {
+    final int currentChunk = offset / chunkSize;
+    final int currentOffset = offset % chunkSize;
+    final int currentLength = Math.min(length, chunkSize - currentOffset);
+    if (currentLength == length) {
+      return ByteBuffer.wrap(data[currentChunk], currentOffset, length);
+    }
+    ByteBuffer bb = ByteBuffer.allocate(length);
+    setByteBuffer(bb, offset, length);
+    return (ByteBuffer) bb.flip();
+  }
+
   /**
    * Get the size of the buffers.
    */
diff --git a/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java b/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java
index fb0a460..6cbf147 100644
--- a/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java
+++ b/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java
@@ -22,7 +22,7 @@ import org.apache.hadoop.io.Text;
 
 import java.io.IOException;
 import java.io.OutputStream;
-
+import java.nio.ByteBuffer;
 
 /**
  * Using HashTable to represent a dictionary. The strings are stored as UTF-8 bytes
@@ -124,6 +124,11 @@ public class StringHashTableDictionary implements Dictionary {
   }
 
   @Override
+  public ByteBuffer getText(int positionInKeyOffset) {
+    return DictionaryUtils.getTextInternal(positionInKeyOffset, this.keyOffsets, this.byteArray);
+  }
+
+  @Override
   public int writeTo(OutputStream out, int position) throws IOException {
     return DictionaryUtils.writeToTextInternal(out, position, this.keyOffsets, this.byteArray);
   }
@@ -197,12 +202,13 @@ public class StringHashTableDictionary implements Dictionary {
       resizedHashBuckets[i] = createBucket();
     }
 
-    Text tmpText = new Text();
     for (int i = 0; i < oldCapacity; i++) {
       DynamicIntArray oldBucket = hashBuckets[i];
       for (int j = 0; j < oldBucket.size(); j++) {
-        getText(tmpText, oldBucket.get(j));
-        resizedHashBuckets[getIndex(tmpText)].add(oldBucket.get(j));
+        final int offset = oldBucket.get(j);
+        ByteBuffer text = getText(offset);
+        resizedHashBuckets[getIndex(text.array(),
+                text.position(), text.remaining())].add(oldBucket.get(j));
       }
     }
 
@@ -218,4 +224,5 @@ public class StringHashTableDictionary implements Dictionary {
 
     return byteArray.getSizeInBytes() + keyOffsets.getSizeInBytes() + bucketTotalSize ;
   }
+
 }
diff --git a/java/core/src/java/org/apache/orc/impl/StringRedBlackTree.java b/java/core/src/java/org/apache/orc/impl/StringRedBlackTree.java
index e256249..c78e20b 100644
--- a/java/core/src/java/org/apache/orc/impl/StringRedBlackTree.java
+++ b/java/core/src/java/org/apache/orc/impl/StringRedBlackTree.java
@@ -21,6 +21,7 @@ import org.apache.hadoop.io.Text;
 
 import java.io.IOException;
 import java.io.OutputStream;
+import java.nio.ByteBuffer;
 
 /**
  * A red-black tree that stores strings. The strings are stored as UTF-8 bytes
@@ -113,6 +114,11 @@ public class StringRedBlackTree extends RedBlackTree implements Dictionary {
   }
 
   @Override
+  public ByteBuffer getText(int positionInKeyOffset) {
+    return DictionaryUtils.getTextInternal(positionInKeyOffset, this.keyOffsets, this.byteArray);
+  }
+
+  @Override
   public int writeTo(OutputStream out, int position) throws IOException {
     return DictionaryUtils.writeToTextInternal(out, position, this.keyOffsets,
         this.byteArray);