You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by li...@apache.org on 2018/04/10 04:07:33 UTC

spark git commit: [SPARK-23947][SQL] Add hashUTF8String convenience method to hasher classes

Repository: spark
Updated Branches:
  refs/heads/master 61b724724 -> f94f3624e


[SPARK-23947][SQL] Add hashUTF8String convenience method to hasher classes

## What changes were proposed in this pull request?

Add `hashUTF8String()` to the hasher classes to allow Spark SQL codegen to generate cleaner code for hashing `UTF8String`s. No change in behavior otherwise.

Although SPARK-10399 has already reduced the size of the generated code for hashing `UTF8String`, it is still worth extracting a separate function in the hasher classes so that the generated code stays clean.

## How was this patch tested?

Existing tests.

Author: Kris Mok <kr...@databricks.com>

Closes #21016 from rednaxelafx/hashutf8.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f94f3624
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f94f3624
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f94f3624

Branch: refs/heads/master
Commit: f94f3624ea81053653a06560808cb71f510c6828
Parents: 61b7247
Author: Kris Mok <kr...@databricks.com>
Authored: Mon Apr 9 21:07:28 2018 -0700
Committer: gatorsmile <ga...@gmail.com>
Committed: Mon Apr 9 21:07:28 2018 -0700

----------------------------------------------------------------------
 .../org/apache/spark/sql/catalyst/expressions/HiveHasher.java | 5 +++++
 .../java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java     | 7 ++++++-
 .../java/org/apache/spark/sql/catalyst/expressions/XXH64.java | 5 +++++
 .../org/apache/spark/sql/catalyst/expressions/hash.scala      | 6 ++----
 4 files changed, 18 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/f94f3624/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/expressions/HiveHasher.java
----------------------------------------------------------------------
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/expressions/HiveHasher.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/expressions/HiveHasher.java
index c34e369..62b75ae 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/expressions/HiveHasher.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/expressions/HiveHasher.java
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions;
 
 import org.apache.spark.unsafe.memory.MemoryBlock;
+import org.apache.spark.unsafe.types.UTF8String;
 
 /**
  * Simulates Hive's hashing function from Hive v1.2.1
@@ -51,4 +52,8 @@ public class HiveHasher {
   public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes) {
     return hashUnsafeBytesBlock(MemoryBlock.allocateFromObject(base, offset, lengthInBytes));
   }
+
+  public static int hashUTF8String(UTF8String str) {
+    return hashUnsafeBytesBlock(str.getMemoryBlock());
+  }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/f94f3624/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
----------------------------------------------------------------------
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
index f372b19..aff6e93 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
@@ -20,6 +20,7 @@ package org.apache.spark.unsafe.hash;
 import com.google.common.primitives.Ints;
 
 import org.apache.spark.unsafe.memory.MemoryBlock;
+import org.apache.spark.unsafe.types.UTF8String;
 
 /**
  * 32-bit Murmur3 hasher.  This is based on Guava's Murmur3_32HashFunction.
@@ -82,6 +83,10 @@ public final class Murmur3_x86_32 {
     return fmix(h1, lengthInBytes);
   }
 
+  public static int hashUTF8String(UTF8String str, int seed) {
+    return hashUnsafeBytesBlock(str.getMemoryBlock(), seed);
+  }
+
   public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed) {
     return hashUnsafeBytesBlock(MemoryBlock.allocateFromObject(base, offset, lengthInBytes), seed);
   }
@@ -91,7 +96,7 @@ public final class Murmur3_x86_32 {
   }
 
   public static int hashUnsafeBytes2Block(MemoryBlock base, int seed) {
-    // This is compatible with original and another implementations.
+    // This is compatible with original and other implementations.
     // Use this method for new components after Spark 2.3.
     int lengthInBytes = Ints.checkedCast(base.size());
     assert (lengthInBytes >= 0) : "lengthInBytes cannot be negative";

http://git-wip-us.apache.org/repos/asf/spark/blob/f94f3624/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/XXH64.java
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/XXH64.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/XXH64.java
index fe727f6..8e9c0a2 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/XXH64.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/XXH64.java
@@ -17,6 +17,7 @@
 package org.apache.spark.sql.catalyst.expressions;
 
 import org.apache.spark.unsafe.memory.MemoryBlock;
+import org.apache.spark.unsafe.types.UTF8String;
 
 // scalastyle: off
 /**
@@ -107,6 +108,10 @@ public final class XXH64 {
     return fmix(hash);
   }
 
+  public static long hashUTF8String(UTF8String str, long seed) {
+    return hashUnsafeBytesBlock(str.getMemoryBlock(), seed);
+  }
+
   public static long hashUnsafeBytes(Object base, long offset, int length, long seed) {
     return hashUnsafeBytesBlock(MemoryBlock.allocateFromObject(base, offset, length), seed);
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/f94f3624/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
index df29c38..ef79033 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
@@ -361,8 +361,7 @@ abstract class HashExpression[E] extends Expression {
   }
 
   protected def genHashString(input: String, result: String): String = {
-    val mb = s"$input.getMemoryBlock()"
-    s"$result = $hasherClassName.hashUnsafeBytesBlock($mb, $result);"
+    s"$result = $hasherClassName.hashUTF8String($input, $result);"
   }
 
   protected def genHashForMap(
@@ -725,8 +724,7 @@ case class HiveHash(children: Seq[Expression]) extends HashExpression[Int] {
      """
 
   override protected def genHashString(input: String, result: String): String = {
-    val mb = s"$input.getMemoryBlock()"
-    s"$result = $hasherClassName.hashUnsafeBytesBlock($mb);"
+    s"$result = $hasherClassName.hashUTF8String($input);"
   }
 
   override protected def genHashForArray(


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org