You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2021/08/29 02:44:42 UTC
[orc] branch main updated: ORC-852: Allow DynamicByteArray to return a ByteBuffer (#754)
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new 9836870 ORC-852: Allow DynamicByteArray to return a ByteBuffer (#754)
9836870 is described below
commit 98368707a3562e82fcb486f7b864f737b9a2787e
Author: belugabehr <12...@users.noreply.github.com>
AuthorDate: Sat Aug 28 22:44:37 2021 -0400
ORC-852: Allow DynamicByteArray to return a ByteBuffer (#754)
### What changes were proposed in this pull request?
Allow `DynamicByteArray` and `Dictionary` to return a ByteBuffer of their content.
### Why are the changes needed?
Performance.
### How was this patch tested?
No change to functionality. Existing unit tests used.
---
.../src/java/org/apache/orc/impl/Dictionary.java | 3 +++
.../java/org/apache/orc/impl/DictionaryUtils.java | 22 ++++++++++++++++++++++
.../java/org/apache/orc/impl/DynamicByteArray.java | 12 ++++++++++++
.../apache/orc/impl/StringHashTableDictionary.java | 15 +++++++++++----
.../org/apache/orc/impl/StringRedBlackTree.java | 6 ++++++
5 files changed, 54 insertions(+), 4 deletions(-)
diff --git a/java/core/src/java/org/apache/orc/impl/Dictionary.java b/java/core/src/java/org/apache/orc/impl/Dictionary.java
index 776e455..430f343 100644
--- a/java/core/src/java/org/apache/orc/impl/Dictionary.java
+++ b/java/core/src/java/org/apache/orc/impl/Dictionary.java
@@ -22,6 +22,7 @@ import org.apache.hadoop.io.Text;
import java.io.IOException;
import java.io.OutputStream;
+import java.nio.ByteBuffer;
/**
@@ -52,6 +53,8 @@ public interface Dictionary {
*/
void getText(Text result, int position);
+ ByteBuffer getText(int position);
+
/**
* Given the position index, write the original string, before being encoded,
* to the OutputStream.
diff --git a/java/core/src/java/org/apache/orc/impl/DictionaryUtils.java b/java/core/src/java/org/apache/orc/impl/DictionaryUtils.java
index 1de37df..144de02 100644
--- a/java/core/src/java/org/apache/orc/impl/DictionaryUtils.java
+++ b/java/core/src/java/org/apache/orc/impl/DictionaryUtils.java
@@ -21,6 +21,7 @@ import org.apache.hadoop.io.Text;
import java.io.IOException;
import java.io.OutputStream;
+import java.nio.ByteBuffer;
public class DictionaryUtils {
private DictionaryUtils() {
@@ -47,6 +48,27 @@ public class DictionaryUtils {
}
/**
+ * Return a {@code ByteBuffer} containing the data at a certain offset within a
+ * {@code DynamicByteArray}.
+ *
+ * @param position position in the keyOffsets
+ * @param keyOffsets starting offset of the key (in bytes) in the byte array
+ * @param byteArray storing raw bytes of all keys seen in dictionary
+ * @return a {@code ByteBuffer} containing the bytes of the key at the given position
+ */
+ public static ByteBuffer getTextInternal(int position, DynamicIntArray keyOffsets,
+ DynamicByteArray byteArray) {
+ final int offset = keyOffsets.get(position);
+ final int length;
+ if (position + 1 == keyOffsets.size()) {
+ length = byteArray.size() - offset;
+ } else {
+ length = keyOffsets.get(position + 1) - offset;
+ }
+ return byteArray.get(offset, length);
+ }
+
+ /**
* Write a UTF8 string from the byteArray, using the offset in index-array,
* into an OutputStream
*
diff --git a/java/core/src/java/org/apache/orc/impl/DynamicByteArray.java b/java/core/src/java/org/apache/orc/impl/DynamicByteArray.java
index 40bef60..fa8e0a0 100644
--- a/java/core/src/java/org/apache/orc/impl/DynamicByteArray.java
+++ b/java/core/src/java/org/apache/orc/impl/DynamicByteArray.java
@@ -294,6 +294,18 @@ public final class DynamicByteArray {
return result;
}
+ public ByteBuffer get(int offset, int length) {
+ final int currentChunk = offset / chunkSize;
+ final int currentOffset = offset % chunkSize;
+ final int currentLength = Math.min(length, chunkSize - currentOffset);
+ if (currentLength == length) {
+ return ByteBuffer.wrap(data[currentChunk], currentOffset, length);
+ }
+ ByteBuffer bb = ByteBuffer.allocate(length);
+ setByteBuffer(bb, offset, length);
+ return (ByteBuffer) bb.flip();
+ }
+
/**
* Get the size of the buffers.
*/
diff --git a/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java b/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java
index fb0a460..6cbf147 100644
--- a/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java
+++ b/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java
@@ -22,7 +22,7 @@ import org.apache.hadoop.io.Text;
import java.io.IOException;
import java.io.OutputStream;
-
+import java.nio.ByteBuffer;
/**
* Using HashTable to represent a dictionary. The strings are stored as UTF-8 bytes
@@ -124,6 +124,11 @@ public class StringHashTableDictionary implements Dictionary {
}
@Override
+ public ByteBuffer getText(int positionInKeyOffset) {
+ return DictionaryUtils.getTextInternal(positionInKeyOffset, this.keyOffsets, this.byteArray);
+ }
+
+ @Override
public int writeTo(OutputStream out, int position) throws IOException {
return DictionaryUtils.writeToTextInternal(out, position, this.keyOffsets, this.byteArray);
}
@@ -197,12 +202,13 @@ public class StringHashTableDictionary implements Dictionary {
resizedHashBuckets[i] = createBucket();
}
- Text tmpText = new Text();
for (int i = 0; i < oldCapacity; i++) {
DynamicIntArray oldBucket = hashBuckets[i];
for (int j = 0; j < oldBucket.size(); j++) {
- getText(tmpText, oldBucket.get(j));
- resizedHashBuckets[getIndex(tmpText)].add(oldBucket.get(j));
+ final int offset = oldBucket.get(j);
+ ByteBuffer text = getText(offset);
+ resizedHashBuckets[getIndex(text.array(),
+ text.position(), text.remaining())].add(oldBucket.get(j));
}
}
@@ -218,4 +224,5 @@ public class StringHashTableDictionary implements Dictionary {
return byteArray.getSizeInBytes() + keyOffsets.getSizeInBytes() + bucketTotalSize ;
}
+
}
diff --git a/java/core/src/java/org/apache/orc/impl/StringRedBlackTree.java b/java/core/src/java/org/apache/orc/impl/StringRedBlackTree.java
index e256249..c78e20b 100644
--- a/java/core/src/java/org/apache/orc/impl/StringRedBlackTree.java
+++ b/java/core/src/java/org/apache/orc/impl/StringRedBlackTree.java
@@ -21,6 +21,7 @@ import org.apache.hadoop.io.Text;
import java.io.IOException;
import java.io.OutputStream;
+import java.nio.ByteBuffer;
/**
* A red-black tree that stores strings. The strings are stored as UTF-8 bytes
@@ -113,6 +114,11 @@ public class StringRedBlackTree extends RedBlackTree implements Dictionary {
}
@Override
+ public ByteBuffer getText(int positionInKeyOffset) {
+ return DictionaryUtils.getTextInternal(positionInKeyOffset, this.keyOffsets, this.byteArray);
+ }
+
+ @Override
public int writeTo(OutputStream out, int position) throws IOException {
return DictionaryUtils.writeToTextInternal(out, position, this.keyOffsets,
this.byteArray);