You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2021/08/04 05:26:34 UTC

[orc] branch main updated: ORC-830: Do Not Copy String When Adding to StringHashTableDictionary (#735)

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 43c6a55  ORC-830: Do Not Copy String When Adding to StringHashTableDictionary (#735)
43c6a55 is described below

commit 43c6a554518f48b2280e048974574e648c28f6ad
Author: belugabehr <12...@users.noreply.github.com>
AuthorDate: Wed Aug 4 01:26:30 2021 -0400

    ORC-830: Do Not Copy String When Adding to StringHashTableDictionary (#735)
    
    ### What changes were proposed in this pull request?
    When there is a collision adding a value into a StringHashTableDictionary, a temp Text object is created and then each value in the byte array is copied into the temp Text until a match is found (or worst-case scenario, a match is not found and every value is loaded).
    
    Instead of loading (copying) the values, just compare directly against the byte array without copying the data into an intermediate (temp) buffer.
    
    ### Why are the changes needed?
    Performance.  StringTreeWriter#writeBatch() consumes a number of cycles, many of which are spent in getText().  By changing this getText() implementation (removing the copy), the number of cycles consumed is decreased measurably.
    
    ### How was this patch tested?
    No functionality change, utilized existing unit tests.
---
 .../java/org/apache/orc/impl/DictionaryUtils.java  | 25 ++++++++++++++++++++++
 .../apache/orc/impl/StringHashTableDictionary.java |  5 ++---
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/impl/DictionaryUtils.java b/java/core/src/java/org/apache/orc/impl/DictionaryUtils.java
index 24208bb..a8e5322 100644
--- a/java/core/src/java/org/apache/orc/impl/DictionaryUtils.java
+++ b/java/core/src/java/org/apache/orc/impl/DictionaryUtils.java
@@ -70,4 +70,29 @@ public class DictionaryUtils {
     byteArray.write(out, offset, length);
     return length;
   }
+
+  /**
+   * Compare a UTF8 string from the byteArray using the offset in index-array.
+   *
+   * @param bytes an array containing bytes to search for
+   * @param offset the offset in the array
+   * @param length the number of bytes to search for
+   * @param position position in the keyOffsets
+   * @param keyOffsets starting offset of the key (in byte) in the byte array
+   * @param byteArray storing raw bytes of all key seen in dictionary
+   * @return true if the text is equal to the value within the byteArray; false
+   *         otherwise
+   */
+  public static boolean equalsInternal(byte[] bytes, int offset, int length, int position,
+      DynamicIntArray keyOffsets, DynamicByteArray byteArray) {
+    final int byteArrayOffset = keyOffsets.get(position);
+    final int keyLength;
+    if (position + 1 == keyOffsets.size()) {
+      keyLength = byteArray.size() - byteArrayOffset;
+    } else {
+      keyLength = keyOffsets.get(position + 1) - byteArrayOffset;
+    }
+    return 0 == byteArray.compare(bytes, offset, length, byteArrayOffset,
+      keyLength);
+  }
 }
diff --git a/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java b/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java
index 80ad379..efb9a05 100644
--- a/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java
+++ b/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java
@@ -133,11 +133,10 @@ public class StringHashTableDictionary implements Dictionary {
     int index = getIndex(bytes, offset, length);
     DynamicIntArray candidateArray = hashBuckets[index];
 
-    Text tmpText = new Text();
     for (int i = 0; i < candidateArray.size(); i++) {
       final int candidateIndex = candidateArray.get(i);
-      getText(tmpText, candidateIndex);
-      if (tmpText.compareTo(bytes, offset, length) == 0) {
+      if (DictionaryUtils.equalsInternal(bytes, offset, length, candidateIndex,
+          this.keyOffsets, this.byteArray)) {
         return candidateIndex;
       }
     }