You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ra...@apache.org on 2019/08/05 06:31:13 UTC

[arrow] branch master updated: ARROW-6030: [Java] Efficiently compute hash code for ArrowBufPointer

This is an automated email from the ASF dual-hosted git repository.

ravindra pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new a4480f7  ARROW-6030: [Java] Efficiently compute hash code for ArrowBufPointer
a4480f7 is described below

commit a4480f7ebce639c950695f80bdbea1d94bd3f8f6
Author: liyafan82 <li...@gmail.com>
AuthorDate: Mon Aug 5 12:00:46 2019 +0530

    ARROW-6030: [Java] Efficiently compute hash code for ArrowBufPointer
    
    As ArrowBufHasher is introduced, we can compute the hash code of a continuous region within an ArrowBuf.
    
    We optimize the process to make it efficient to avoid recomputation.
    
    Closes #4939 from liyafan82/fly_0725_pthash1 and squashes the following commits:
    
    a73432545 <liyafan82>  Efficiently compute hash code for ArrowBufPointer
    
    Authored-by: liyafan82 <li...@gmail.com>
    Signed-off-by: Pindikura Ravindra <ra...@dremio.com>
---
 .../apache/arrow/memory/util/ArrowBufPointer.java  |  64 +++++++++-
 .../arrow/memory/util/hash/DirectHasher.java       |   5 +
 .../arrow/memory/util/TestArrowBufPointer.java     | 131 ++++++++++++++++++++-
 3 files changed, 197 insertions(+), 3 deletions(-)

diff --git a/java/memory/src/main/java/org/apache/arrow/memory/util/ArrowBufPointer.java b/java/memory/src/main/java/org/apache/arrow/memory/util/ArrowBufPointer.java
index cafe4b0..3d0bf7b 100644
--- a/java/memory/src/main/java/org/apache/arrow/memory/util/ArrowBufPointer.java
+++ b/java/memory/src/main/java/org/apache/arrow/memory/util/ArrowBufPointer.java
@@ -17,6 +17,10 @@
 
 package org.apache.arrow.memory.util;
 
+import org.apache.arrow.memory.util.hash.ArrowBufHasher;
+import org.apache.arrow.memory.util.hash.DirectHasher;
+import org.apache.arrow.util.Preconditions;
+
 import io.netty.buffer.ArrowBuf;
 
 /**
@@ -25,17 +29,40 @@ import io.netty.buffer.ArrowBuf;
  */
 public final class ArrowBufPointer {
 
+  /**
+   * The hash code when the arrow buffer is null.
+   */
+  public static final int NULL_HASH_CODE = 0;
+
   private ArrowBuf buf;
 
   private int offset;
 
   private int length;
 
+  private int hashCode = NULL_HASH_CODE;
+
+  private final ArrowBufHasher hasher;
+
+  /**
+   * A flag indicating if the underlying memory region has changed.
+   */
+  private boolean hashCodeChanged = false;
+
   /**
    * The default constructor.
    */
   public ArrowBufPointer() {
+    this(DirectHasher.INSTANCE);
+  }
 
+  /**
+   * Constructs an arrow buffer pointer with the specified hasher.
+   * @param hasher the hasher to use.
+   */
+  public ArrowBufPointer(ArrowBufHasher hasher) {
+    Preconditions.checkNotNull(hasher);
+    this.hasher = hasher;
   }
 
   /**
@@ -45,6 +72,19 @@ public final class ArrowBufPointer {
    * @param length the length off set of the memory region pointed to.
    */
   public ArrowBufPointer(ArrowBuf buf, int offset, int length) {
+    this(buf, offset, length, DirectHasher.INSTANCE);
+  }
+
+  /**
+   * Constructs an Arrow buffer pointer.
+   * @param buf the underlying {@link ArrowBuf}, which can be null.
+   * @param offset the start off set of the memory region pointed to.
+   * @param length the length off set of the memory region pointed to.
+   * @param hasher the hasher used to calculate the hash code.
+   */
+  public ArrowBufPointer(ArrowBuf buf, int offset, int length, ArrowBufHasher hasher) {
+    Preconditions.checkNotNull(hasher);
+    this.hasher = hasher;
     set(buf, offset, length);
   }
 
@@ -58,6 +98,8 @@ public final class ArrowBufPointer {
     this.buf = buf;
     this.offset = offset;
     this.length = length;
+
+    hashCodeChanged = true;
   }
 
   /**
@@ -85,6 +127,13 @@ public final class ArrowBufPointer {
       return false;
     }
 
+    if (!hasher.equals(((ArrowBufPointer) o).hasher)) {
+      // note that the hasher is incorporated in equality determination
+      // this is to avoid problems in cases where two Arrow buffer pointers are not equal
+      // while having different hashers and equal hash codes.
+      return false;
+    }
+
     ArrowBufPointer other = (ArrowBufPointer) o;
     if (buf == null || other.buf == null) {
       if (buf == null && other.buf == null) {
@@ -100,7 +149,18 @@ public final class ArrowBufPointer {
 
   @Override
   public int hashCode() {
-    // implement after ARROW-5898
-    throw new UnsupportedOperationException();
+    if (!hashCodeChanged) {
+      return hashCode;
+    }
+
+    // re-compute the hash code
+    if (buf == null) {
+      hashCode = NULL_HASH_CODE;
+    } else {
+      hashCode = hasher.hashCode(buf, offset, length);
+    }
+
+    hashCodeChanged = false;
+    return hashCode;
   }
 }
diff --git a/java/memory/src/main/java/org/apache/arrow/memory/util/hash/DirectHasher.java b/java/memory/src/main/java/org/apache/arrow/memory/util/hash/DirectHasher.java
index 18f39c3..aa889e6 100644
--- a/java/memory/src/main/java/org/apache/arrow/memory/util/hash/DirectHasher.java
+++ b/java/memory/src/main/java/org/apache/arrow/memory/util/hash/DirectHasher.java
@@ -79,4 +79,9 @@ public class DirectHasher extends ArrowBufHasher {
 
     return hash;
   }
+
+  @Override
+  public boolean equals(Object obj) {
+    return obj != null && this.getClass() == obj.getClass();
+  }
 }
diff --git a/java/memory/src/test/java/org/apache/arrow/memory/util/TestArrowBufPointer.java b/java/memory/src/test/java/org/apache/arrow/memory/util/TestArrowBufPointer.java
index f9d738e..36b78af 100644
--- a/java/memory/src/test/java/org/apache/arrow/memory/util/TestArrowBufPointer.java
+++ b/java/memory/src/test/java/org/apache/arrow/memory/util/TestArrowBufPointer.java
@@ -17,12 +17,15 @@
 
 package org.apache.arrow.memory.util;
 
+import static junit.framework.TestCase.assertEquals;
 import static junit.framework.TestCase.assertTrue;
-
+import static org.junit.jupiter.api.Assertions.assertFalse;
 
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.memory.RootAllocator;
 
+import org.apache.arrow.memory.util.hash.ArrowBufHasher;
+import org.apache.arrow.memory.util.hash.DirectHasher;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -67,4 +70,130 @@ public class TestArrowBufPointer {
       }
     }
   }
+
+  @Test
+  public void testArrowBufPointersHashCode() {
+    final int vectorLength = 100;
+    try (ArrowBuf buf1 = allocator.buffer(vectorLength * 4);
+         ArrowBuf buf2 = allocator.buffer(vectorLength * 4)) {
+      for (int i = 0; i < vectorLength; i++) {
+        buf1.setInt(i * 4, i);
+        buf2.setInt(i * 4, i);
+      }
+
+      CounterHasher hasher1 = new CounterHasher();
+      CounterHasher hasher2 = new CounterHasher();
+
+      ArrowBufPointer pointer1 = new ArrowBufPointer(hasher1);
+      assertEquals(ArrowBufPointer.NULL_HASH_CODE, pointer1.hashCode());
+
+      ArrowBufPointer pointer2 = new ArrowBufPointer(hasher2);
+      assertEquals(ArrowBufPointer.NULL_HASH_CODE, pointer2.hashCode());
+
+      for (int i = 0; i < vectorLength; i++) {
+        pointer1.set(buf1, i * 4, 4);
+        pointer2.set(buf2, i * 4, 4);
+
+        assertEquals(pointer1.hashCode(), pointer2.hashCode());
+
+        // verify that the hash codes have been re-computed
+        assertEquals(hasher1.counter, i + 1);
+        assertEquals(hasher2.counter, i + 1);
+      }
+    }
+  }
+
+  @Test
+  public void testNullPointersHashCode() {
+    ArrowBufPointer pointer = new ArrowBufPointer();
+    assertEquals(ArrowBufPointer.NULL_HASH_CODE, pointer.hashCode());
+
+    pointer.set(null, 0, 0);
+    assertEquals(ArrowBufPointer.NULL_HASH_CODE, pointer.hashCode());
+  }
+
+  @Test
+  public void testReuseHashCode() {
+    try (ArrowBuf buf = allocator.buffer(10)) {
+      buf.setInt(0, 10);
+      buf.setInt(4, 20);
+
+      CounterHasher hasher = new CounterHasher();
+      ArrowBufPointer pointer = new ArrowBufPointer(hasher);
+
+      pointer.set(buf, 0, 4);
+      pointer.hashCode();
+
+      // hash code computed
+      assertEquals(1, hasher.counter);
+
+      // no hash code re-compute
+      pointer.hashCode();
+      assertEquals(1, hasher.counter);
+
+      // hash code re-computed
+      pointer.set(buf, 4, 4);
+      pointer.hashCode();
+      assertEquals(2, hasher.counter);
+    }
+  }
+
+  @Test
+  public void testHashersForEquality() {
+    try (ArrowBuf buf = allocator.buffer(10)) {
+      // pointer 1 uses the default hasher
+      ArrowBufPointer pointer1 = new ArrowBufPointer(buf, 0, 10);
+
+      // pointer 2 uses the counter hasher
+      ArrowBufPointer pointer2 = new ArrowBufPointer(buf, 0, 10, new CounterHasher());
+
+      // the two pointers cannot be equal, since they have different hashers
+      assertFalse(pointer1.equals(pointer2));
+    }
+  }
+
+  /**
+   * Hasher with a counter that increments each time a hash code is calculated.
+   * This is to validate that the hash code in {@link ArrowBufPointer} is reused.
+   */
+  class CounterHasher extends ArrowBufHasher {
+
+    protected int counter = 0;
+
+    @Override
+    protected int combineHashCode(int currentHashCode, int newHashCode) {
+      return 0;
+    }
+
+    @Override
+    protected int getByteHashCode(byte byteValue) {
+      return 0;
+    }
+
+    @Override
+    protected int getIntHashCode(int intValue) {
+      return 0;
+    }
+
+    @Override
+    protected int getLongHashCode(long longValue) {
+      return 0;
+    }
+
+    @Override
+    protected int finalizeHashCode(int hashCode) {
+      return 0;
+    }
+
+    @Override
+    public int hashCode(ArrowBuf buf, int offset, int length) {
+      counter += 1;
+      return DirectHasher.INSTANCE.hashCode(buf, offset, length);
+    }
+
+    @Override
+    public boolean equals(Object o) {
+      return o != null && this.getClass() == o.getClass();
+    }
+  }
 }