You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2019/02/22 04:27:38 UTC

[spark] branch branch-2.4 updated: [SPARK-26950][SQL][TEST] Make RandomDataGenerator use Float.NaN or Double.NaN for all NaN values

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-2.4 by this push:
     new ef67be3  [SPARK-26950][SQL][TEST] Make RandomDataGenerator use Float.NaN or Double.NaN for all NaN values
ef67be3 is described below

commit ef67be363be6d6b6954b55ef1c243a0672b84abb
Author: Dongjoon Hyun <do...@apache.org>
AuthorDate: Fri Feb 22 12:25:26 2019 +0800

    [SPARK-26950][SQL][TEST] Make RandomDataGenerator use Float.NaN or Double.NaN for all NaN values
    
    ## What changes were proposed in this pull request?
    
    Apache Spark uses the predefined `Float.NaN` and `Double.NaN` for NaN values, but there exist more NaN values with different binary representations.
    
    ```scala
    scala> java.nio.ByteBuffer.allocate(4).putFloat(Float.NaN).array
    res1: Array[Byte] = Array(127, -64, 0, 0)
    
    scala> val x = java.lang.Float.intBitsToFloat(-6966608)
    x: Float = NaN
    
    scala> java.nio.ByteBuffer.allocate(4).putFloat(x).array
    res2: Array[Byte] = Array(-1, -107, -78, -80)
    ```
    
    Since users can have these values, `RandomDataGenerator` generates these NaN values. However, this causes `checkEvaluationWithUnsafeProjection` failures due to differences in the `UnsafeRow` binary representation. The following is an instance of the UT failure. This PR aims to fix this UT flakiness.
    
    - https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/102528/testReport/
    
    ## How was this patch tested?
    
    Pass Jenkins with the newly added test cases.
    
    Closes #23851 from dongjoon-hyun/SPARK-26950.
    
    Authored-by: Dongjoon Hyun <do...@apache.org>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
    (cherry picked from commit ffef3d40741b0be321421aa52a6e17a26d89f541)
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../org/apache/spark/sql/RandomDataGenerator.scala | 24 +++++++++++++++--
 .../spark/sql/RandomDataGeneratorSuite.scala       | 31 ++++++++++++++++++++++
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
index 8ae3ff5..d361e62 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.sql
 
-import java.lang.Double.longBitsToDouble
-import java.lang.Float.intBitsToFloat
 import java.math.MathContext
 
 import scala.collection.mutable
@@ -70,6 +68,28 @@ object RandomDataGenerator {
   }
 
   /**
+   * Wraps `java.lang.Float.intBitsToFloat` so that every NaN bit pattern maps to `Float.NaN`.
+   * This prevents `checkEvaluationWithUnsafeProjection` from failing due to
+   * differences in the `UnsafeRow` binary representation of NaN values.
+   * Non-NaN results are returned unchanged. This is visible for testing.
+   */
+  def intBitsToFloat(bits: Int): Float = {
+    val value = java.lang.Float.intBitsToFloat(bits)
+    if (value.isNaN) Float.NaN else value  // replace any NaN with the predefined Float.NaN
+  }
+
+  /**
+   * Wraps `java.lang.Double.longBitsToDouble` so that every NaN bit pattern maps to `Double.NaN`.
+   * This prevents `checkEvaluationWithUnsafeProjection` from failing due to
+   * differences in the `UnsafeRow` binary representation of NaN values.
+   * Non-NaN results are returned unchanged. This is visible for testing.
+   */
+  def longBitsToDouble(bits: Long): Double = {
+    val value = java.lang.Double.longBitsToDouble(bits)
+    if (value.isNaN) Double.NaN else value  // replace any NaN with the predefined Double.NaN
+  }
+
+  /**
    * Returns a randomly generated schema, based on the given accepted types.
    *
    * @param numFields the number of fields in this schema
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala
index 3c2f8a2..3e62ca0 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala
@@ -17,6 +17,9 @@
 
 package org.apache.spark.sql
 
+import java.nio.ByteBuffer
+import java.util.Arrays
+
 import scala.util.Random
 
 import org.apache.spark.SparkFunSuite
@@ -106,4 +109,32 @@ class RandomDataGeneratorSuite extends SparkFunSuite {
       assert(deviation.toDouble / expectedTotalElements < 2e-1)
     }
   }
+
+  test("Use Float.NaN for all NaN values") {
+    val bits = -6966608  // a NaN bit pattern whose bytes differ from Float.NaN's
+    val nan1 = java.lang.Float.intBitsToFloat(bits)  // raw JDK conversion
+    val nan2 = RandomDataGenerator.intBitsToFloat(bits)  // wrapper under test
+    assert(nan1.isNaN)
+    assert(nan2.isNaN)
+
+    val arrayExpected = ByteBuffer.allocate(4).putFloat(Float.NaN).array
+    val array1 = ByteBuffer.allocate(4).putFloat(nan1).array
+    val array2 = ByteBuffer.allocate(4).putFloat(nan2).array
+    assert(!Arrays.equals(array1, arrayExpected))  // raw NaN keeps its own byte layout
+    assert(Arrays.equals(array2, arrayExpected))  // wrapper yields Float.NaN's exact bytes
+  }
+
+  test("Use Double.NaN for all NaN values") {
+    val bits = -6966608  // Int literal, widened to Long at the calls below; decodes to a NaN
+    val nan1 = java.lang.Double.longBitsToDouble(bits)  // raw JDK conversion
+    val nan2 = RandomDataGenerator.longBitsToDouble(bits)  // wrapper under test
+    assert(nan1.isNaN)
+    assert(nan2.isNaN)
+
+    val arrayExpected = ByteBuffer.allocate(8).putDouble(Double.NaN).array
+    val array1 = ByteBuffer.allocate(8).putDouble(nan1).array
+    val array2 = ByteBuffer.allocate(8).putDouble(nan2).array
+    assert(!Arrays.equals(array1, arrayExpected))  // raw NaN keeps its own byte layout
+    assert(Arrays.equals(array2, arrayExpected))  // wrapper yields Double.NaN's exact bytes
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org