You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2021/08/04 05:26:34 UTC
[orc] branch main updated: ORC-830: Do Not Copy String When Adding
to StringHashTableDictionary (#735)
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new 43c6a55 ORC-830: Do Not Copy String When Adding to StringHashTableDictionary (#735)
43c6a55 is described below
commit 43c6a554518f48b2280e048974574e648c28f6ad
Author: belugabehr <12...@users.noreply.github.com>
AuthorDate: Wed Aug 4 01:26:30 2021 -0400
ORC-830: Do Not Copy String When Adding to StringHashTableDictionary (#735)
### What changes were proposed in this pull request?
When there is a collision adding a value into a StringHashTableDictionary, a temp Text object is created and then each value in the byte array is copied into the temp Text until a match is found (or worst-case scenario, a match is not found and every value is loaded).
Instead of loading (copying) the values, just compare directly against the byte array without copying the data into an intermediate (temp) buffer.
### Why are the changes needed?
Performance. StringTreeWriter#writeBatch() consumes a number of cycles, much of which is spent in getText(). By changing this getText() implementation (removing the copy), the number of cycles consumed is decreased measurably.
### How was this patch tested?
No functionality change, utilized existing unit tests.
---
.../java/org/apache/orc/impl/DictionaryUtils.java | 25 ++++++++++++++++++++++
.../apache/orc/impl/StringHashTableDictionary.java | 5 ++---
2 files changed, 27 insertions(+), 3 deletions(-)
diff --git a/java/core/src/java/org/apache/orc/impl/DictionaryUtils.java b/java/core/src/java/org/apache/orc/impl/DictionaryUtils.java
index 24208bb..a8e5322 100644
--- a/java/core/src/java/org/apache/orc/impl/DictionaryUtils.java
+++ b/java/core/src/java/org/apache/orc/impl/DictionaryUtils.java
@@ -70,4 +70,29 @@ public class DictionaryUtils {
byteArray.write(out, offset, length);
return length;
}
+
+ /**
+ * Compare the given UTF8 bytes against the key stored at a position in the dictionary.
+ *
+ * @param bytes an array containing the bytes to compare
+ * @param offset the offset of the first byte to compare within bytes
+ * @param length the number of bytes to compare
+ * @param position index of the stored key in keyOffsets
+ * @param keyOffsets starting offset (in bytes) of each key in the byte array
+ * @param byteArray storing raw bytes of all keys seen in the dictionary
+ * @return true if the given bytes are equal to the stored key; false
+ * otherwise
+ */
+ public static boolean equalsInternal(byte[] bytes, int offset, int length, int position,
+ DynamicIntArray keyOffsets, DynamicByteArray byteArray) {
+ final int byteArrayOffset = keyOffsets.get(position);
+ final int keyLength;
+ if (position + 1 == keyOffsets.size()) { // last key: it runs to the end of byteArray
+ keyLength = byteArray.size() - byteArrayOffset;
+ } else { // otherwise the key ends where the next key starts
+ keyLength = keyOffsets.get(position + 1) - byteArrayOffset;
+ }
+ return 0 == byteArray.compare(bytes, offset, length, byteArrayOffset,
+ keyLength);
+ }
}
diff --git a/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java b/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java
index 80ad379..efb9a05 100644
--- a/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java
+++ b/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java
@@ -133,11 +133,10 @@ public class StringHashTableDictionary implements Dictionary {
int index = getIndex(bytes, offset, length);
DynamicIntArray candidateArray = hashBuckets[index];
- Text tmpText = new Text();
for (int i = 0; i < candidateArray.size(); i++) {
final int candidateIndex = candidateArray.get(i);
- getText(tmpText, candidateIndex);
- if (tmpText.compareTo(bytes, offset, length) == 0) {
+ if (DictionaryUtils.equalsInternal(bytes, offset, length, candidateIndex,
+ this.keyOffsets, this.byteArray)) {
return candidateIndex;
}
}