Posted to commits@spark.apache.org by do...@apache.org on 2020/01/27 18:34:55 UTC

[spark] branch branch-2.4 updated: [SPARK-30633][SQL] Append L to seed when type is LongType

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-2.4 by this push:
     new ad9f578  [SPARK-30633][SQL] Append L to seed when type is LongType
ad9f578 is described below

commit ad9f578ef5726c2b48362b147c2809cb36ead245
Author: Patrick Cording <pa...@datarobot.com>
AuthorDate: Mon Jan 27 10:32:15 2020 -0800

    [SPARK-30633][SQL] Append L to seed when type is LongType
    
    ### What changes were proposed in this pull request?
    
    Allow using longs as the seed for xxHash.
    
    ### Why are the changes needed?
    
    Codegen fails when the seed passed to xxHash does not fit in an Int (for example, a seed greater than 2^31 - 1), because the seed is spliced into the generated Java code as a plain integer literal.
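
    For illustration, here is a minimal sketch (not part of the original PR
    description) of the kind of expression that used to break codegen; it uses
    the same Catalyst classes (`XxHash64`, `Literal`, `LongType`) exercised by
    the new test below:

    ```scala
    import org.apache.spark.sql.catalyst.expressions.{Literal, XxHash64}
    import org.apache.spark.sql.types.LongType

    // A seed that does not fit in an Int. Before this fix, the generated Java
    // contained an initializer along the lines of `long value = 2147483648;`,
    // which does not compile because the literal lacks an `L` suffix.
    val bigSeed = Integer.MAX_VALUE.toLong + 1L
    val expr = XxHash64(Seq(Literal.create(42L, LongType)), bigSeed)
    // With the fix, evaluating expr through the codegen path compiles and
    // matches the interpreted result.
    ```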
    
    ### Does this PR introduce any user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Existing tests pass, and new tests covering both Int and Long seeds were added to `HashExpressionsSuite`.
    
    Closes #27354 from patrickcording/fix_xxhash_seed_bug.
    
    Authored-by: Patrick Cording <pa...@datarobot.com>
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
    (cherry picked from commit c5c580ba0d253a04a3df5bbfd5acf6b5d23cdc1c)
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
 .../spark/sql/catalyst/expressions/hash.scala      |  3 +-
 .../expressions/HashExpressionsSuite.scala         | 39 ++++++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
index 742a4f8..163cf61 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
@@ -280,6 +280,7 @@ abstract class HashExpression[E] extends Expression {
     }
 
     val hashResultType = CodeGenerator.javaType(dataType)
+    val typedSeed = if (dataType.sameType(LongType)) s"${seed}L" else s"$seed"
     val codes = ctx.splitExpressionsWithCurrentInputs(
       expressions = childrenHash,
       funcName = "computeHash",
@@ -294,7 +295,7 @@ abstract class HashExpression[E] extends Expression {
 
     ev.copy(code =
       code"""
-         |$hashResultType ${ev.value} = $seed;
+         |$hashResultType ${ev.value} = $typedSeed;
          |$codes
        """.stripMargin)
   }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala
index 4281c89..b136681 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala
@@ -676,6 +676,33 @@ class HashExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     assert(murmur3HashPlan(wideRow).getInt(0) == murmursHashEval)
   }
 
+  test("SPARK-30633: xxHash with different type seeds") {
+    val literal = Literal.create(42L, LongType)
+
+    val longSeeds = Seq(
+      Long.MinValue,
+      Integer.MIN_VALUE.toLong - 1L,
+      0L,
+      Integer.MAX_VALUE.toLong + 1L,
+      Long.MaxValue
+    )
+    for (seed <- longSeeds) {
+      checkEvaluation(XxHash64(Seq(literal), seed), XxHash64(Seq(literal), seed).eval())
+    }
+
+    val intSeeds = Seq(
+      Integer.MIN_VALUE,
+      0,
+      Integer.MAX_VALUE
+    )
+    for (seed <- intSeeds) {
+      checkEvaluation(XxHash64(Seq(literal), seed), XxHash64(Seq(literal), seed).eval())
+    }
+
+    checkEvaluation(XxHash64(Seq(literal), 100), XxHash64(Seq(literal), 100L).eval())
+    checkEvaluation(XxHash64(Seq(literal), 100L), XxHash64(Seq(literal), 100).eval())
+  }
+
   private def testHash(inputSchema: StructType): Unit = {
     val inputGenerator = RandomDataGenerator.forType(inputSchema, nullable = false).get
     val encoder = RowEncoder(inputSchema)
@@ -692,5 +719,17 @@ class HashExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
         checkEvaluation(HiveHash(literals), HiveHash(literals).eval())
       }
     }
+
+    val longSeed = Math.abs(seed).toLong + Integer.MAX_VALUE.toLong
+    test(s"SPARK-30633: xxHash64 with long seed: ${inputSchema.simpleString}") {
+      for (_ <- 1 to 10) {
+        val input = encoder.toRow(inputGenerator.apply().asInstanceOf[Row]).asInstanceOf[UnsafeRow]
+        val literals = input.toSeq(inputSchema).zip(inputSchema.map(_.dataType)).map {
+          case (value, dt) => Literal.create(value, dt)
+        }
+        // Only test the interpreted version has same result with codegen version.
+        checkEvaluation(XxHash64(literals, longSeed), XxHash64(literals, longSeed).eval())
+      }
+    }
   }
 }

