You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by yh...@apache.org on 2016/02/03 01:26:52 UTC
spark git commit: [SPARK-13020][SQL][TEST] fix random generator for map type

Repository: spark
Updated Branches:
  refs/heads/master 6de6a9772 -> 672032d0a


[SPARK-13020][SQL][TEST] fix random generator for map type

When we generate a map, we first randomly pick a length, then create a seq of key-value pairs with the expected length, and finally call `toMap`. However, `toMap` removes all duplicated keys, which makes the actual map size much smaller than we expected.

This PR fixes this problem by putting the keys in a set first, to guarantee we have enough keys to build a map with the expected length.

Author: Wenchen Fan <we...@databricks.com>

Closes #10930 from cloud-fan/random-generator.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/672032d0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/672032d0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/672032d0

Branch: refs/heads/master
Commit: 672032d0ab1e43bc5a25cecdb1b96dfd35c39778
Parents: 6de6a97
Author: Wenchen Fan <we...@databricks.com>
Authored: Wed Feb 3 08:26:35 2016 +0800
Committer: Yin Huai <yh...@databricks.com>
Committed: Wed Feb 3 08:26:35 2016 +0800

----------------------------------------------------------------------
 .../apache/spark/sql/RandomDataGenerator.scala    | 18 ++++++++++++++----
 .../spark/sql/RandomDataGeneratorSuite.scala      | 11 +++++++++++
 2 files changed, 25 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/672032d0/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
index 55efea8..7c173cb 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
@@ -47,9 +47,9 @@ object RandomDataGenerator {
    */
   private val PROBABILITY_OF_NULL: Float = 0.1f
 
-  private val MAX_STR_LEN: Int = 1024
-  private val MAX_ARR_SIZE: Int = 128
-  private val MAX_MAP_SIZE: Int = 128
+  final val MAX_STR_LEN: Int = 1024
+  final val MAX_ARR_SIZE: Int = 128
+  final val MAX_MAP_SIZE: Int = 128
 
   /**
    * Helper function for constructing a biased random number generator which returns "interesting"
@@ -208,7 +208,17 @@ object RandomDataGenerator {
             forType(valueType, nullable = valueContainsNull, rand)
         ) yield {
           () => {
-            Seq.fill(rand.nextInt(MAX_MAP_SIZE))((keyGenerator(), valueGenerator())).toMap
+            val length = rand.nextInt(MAX_MAP_SIZE)
+            val keys = scala.collection.mutable.HashSet(Seq.fill(length)(keyGenerator()): _*)
+            // In case the number of different keys is not enough, set a max iteration to avoid
+            // infinite loop.
+            var count = 0
+            while (keys.size < length && count < MAX_MAP_SIZE) {
+              keys += keyGenerator()
+              count += 1
+            }
+            val values = Seq.fill(keys.size)(valueGenerator())
+            keys.zip(values).toMap
           }
         }
       }

http://git-wip-us.apache.org/repos/asf/spark/blob/672032d0/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala
index b8ccdf7..9fba792 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala
@@ -95,4 +95,15 @@ class RandomDataGeneratorSuite extends SparkFunSuite {
     }
   }
 
+  test("check size of generated map") {
+    val mapType = MapType(IntegerType, IntegerType)
+    for (seed <- 1 to 1000) {
+      val generator = RandomDataGenerator.forType(
+        mapType, nullable = false, rand = new Random(seed)).get
+      val maps = Seq.fill(100)(generator().asInstanceOf[Map[Int, Int]])
+      val expectedTotalElements = 100 / 2 * RandomDataGenerator.MAX_MAP_SIZE
+      val deviation = math.abs(maps.map(_.size).sum - expectedTotalElements)
+      assert(deviation.toDouble / expectedTotalElements < 2e-1)
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org