You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by pg...@apache.org on 2021/07/29 10:01:56 UTC

[orc] branch main updated: ORC-842: Remove newKey from StringHashTableDictionary (#748)

This is an automated email from the ASF dual-hosted git repository.

pgaref pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 8458715  ORC-842: Remove newKey from StringHashTableDictionary (#748)
8458715 is described below

commit 8458715c12bcddb35b0bc265c3b240081fbeb270
Author: David Mollitor <dm...@apache.org>
AuthorDate: Mon Jul 12 12:28:07 2021 -0400

    ORC-842: Remove newKey from StringHashTableDictionary (#748)
    
    When adding keys to `StringHashTableDictionary`, do not copy data into intermediate container.
    
    Performance.
    
    No changes to functionality. Use existing unit tests. Existing unit test was updated to use new internal mechanism for testing.
    
    Change-Id: Ie04c940f13570d2d77df2bdc5621c944b7879823
    Signed-off-by: Panagiotis Garefalakis <pg...@apache.org>
---
 .../apache/orc/impl/StringHashTableDictionary.java | 38 +++++++++++++---------
 .../orc/impl/TestStringHashTableDictionary.java    |  6 ++--
 2 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java b/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java
index d940ef8..80ad379 100644
--- a/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java
+++ b/java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java
@@ -37,8 +37,6 @@ public class StringHashTableDictionary implements Dictionary {
   // containing starting offset of the key (in byte) in the byte array.
   private final DynamicIntArray keyOffsets;
 
-  private final Text newKey = new Text();
-
   private DynamicIntArray[] hashBuckets;
 
   private int capacity;
@@ -124,31 +122,29 @@ public class StringHashTableDictionary implements Dictionary {
     return DictionaryUtils.writeToTextInternal(out, position, this.keyOffsets, this.byteArray);
   }
 
-  @Override
-  public int add(byte[] bytes, int offset, int length) {
-    newKey.set(bytes, offset, length);
-    return add(newKey);
+  public int add(Text text) {
+    return add(text.getBytes(), 0, text.getLength());
   }
 
-  public int add(Text text) {
+  @Override
+  public int add(final byte[] bytes, final int offset, final int length) {
     resizeIfNeeded();
-    newKey.set(text);
 
-    int index = getIndex(text);
+    int index = getIndex(bytes, offset, length);
     DynamicIntArray candidateArray = hashBuckets[index];
 
     Text tmpText = new Text();
     for (int i = 0; i < candidateArray.size(); i++) {
-      getText(tmpText, candidateArray.get(i));
-      if (tmpText.equals(newKey)) {
-        return candidateArray.get(i);
+      final int candidateIndex = candidateArray.get(i);
+      getText(tmpText, candidateIndex);
+      if (tmpText.compareTo(bytes, offset, length) == 0) {
+        return candidateIndex;
       }
     }
 
     // if making it here, it means no match.
-    int len = newKey.getLength();
     int currIdx = keyOffsets.size();
-    keyOffsets.add(byteArray.add(newKey.getBytes(), 0, len));
+    keyOffsets.add(byteArray.add(bytes, offset, length));
     candidateArray.add(currIdx);
     return currIdx;
   }
@@ -172,10 +168,20 @@ public class StringHashTableDictionary implements Dictionary {
 
   /**
    * Compute the hash value and find the corresponding index.
-   *
    */
   int getIndex(Text text) {
-    return Math.floorMod(text.hashCode(), capacity);
+    return getIndex(text.getBytes(), 0, text.getLength());
+  }
+
+  /**
+   * Compute the hash value and find the corresponding index.
+   */
+  int getIndex(final byte[] bytes, final int offset, final int length) {
+    int hash = 1;
+    for (int i = offset; i < offset + length; i++) {
+      hash = (31 * hash) + bytes[i];
+    }
+    return Math.floorMod(hash, capacity);
   }
 
   // Resize the hash table, re-hash all the existing keys.
diff --git a/java/core/src/test/org/apache/orc/impl/TestStringHashTableDictionary.java b/java/core/src/test/org/apache/orc/impl/TestStringHashTableDictionary.java
index b3a5470..388a065 100644
--- a/java/core/src/test/org/apache/orc/impl/TestStringHashTableDictionary.java
+++ b/java/core/src/test/org/apache/orc/impl/TestStringHashTableDictionary.java
@@ -99,10 +99,8 @@ public class TestStringHashTableDictionary {
      * this way we know the order of the traverse() method.
      */
     @Override
-    int getIndex(Text text) {
-      String s = text.toString();
-      int underscore = s.indexOf("_");
-      return Integer.parseInt(text.toString().substring(0, underscore));
+    int getIndex(byte[] bytes, int offset, int length) {
+      return (char) bytes[0] - '0';
     }
   }