You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ma...@apache.org on 2013/11/25 00:52:40 UTC
[1/5] git commit: XORShift RNG with unit tests and benchmark To run
unit test, start SBT console and type: compile test-only
org.apache.spark.util.XORShiftRandomSuite To run benchmark,
type: project core console Once the Scala console starts, type: org.a
Updated Branches:
refs/heads/master 972171b9d -> 65de73c7f
XORShift RNG with unit tests and benchmark
To run unit test, start SBT console and type:
compile
test-only org.apache.spark.util.XORShiftRandomSuite
To run benchmark, type:
project core
console
Once the Scala console starts, type:
org.apache.spark.util.XORShiftRandom.benchmark(100000000)
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/09bdfe3b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/09bdfe3b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/09bdfe3b
Branch: refs/heads/master
Commit: 09bdfe3b163559fdcf8771b52ffbe2542883c912
Parents: e2ebc3a
Author: Marek Kolodziej <mk...@gmail.com>
Authored: Mon Nov 18 15:21:43 2013 -0500
Committer: Marek Kolodziej <mk...@gmail.com>
Committed: Mon Nov 18 15:21:43 2013 -0500
----------------------------------------------------------------------
.../main/scala/org/apache/spark/rdd/RDD.scala | 2 +-
.../scala/org/apache/spark/util/Utils.scala | 35 ++++++++-
.../org/apache/spark/util/XORShiftRandom.scala | 63 ++++++++++++++++
.../apache/spark/util/XORShiftRandomSuite.scala | 76 ++++++++++++++++++++
.../apache/spark/mllib/clustering/KMeans.scala | 2 +-
5 files changed, 175 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/09bdfe3b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index 6e88be6..dd9c32f 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -17,7 +17,7 @@
package org.apache.spark.rdd
-import java.util.Random
+import org.apache.spark.util.{XORShiftRandom => Random}
import scala.collection.Map
import scala.collection.JavaConversions.mapAsScalaMap
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/09bdfe3b/core/src/main/scala/org/apache/spark/util/Utils.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index fe932d8..2df7108 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -818,9 +818,42 @@ private[spark] object Utils extends Logging {
hashAbs
}
- /** Returns a copy of the system properties that is thread-safe to iterator over. */
+ /* Returns a copy of the system properties that is thread-safe to iterator over. */
def getSystemProperties(): Map[String, String] = {
return System.getProperties().clone()
.asInstanceOf[java.util.Properties].toMap[String, String]
}
+
+ /* Used for performance tersting along with the intToTimesInt() and timeIt methods
+ * It uses a while loop instead of a for comprehension since the JIT will
+ * optimize the while loop better than the "for" closure
+ * e.g.
+ * import org.apache.spark.util.Utils.{TimesInt, intToTimesInt, timeIt}
+ * import java.util.Random
+ * val rand = new Random()
+ * timeIt(rand.nextDouble, 10000000)
+ */
+ class TimesInt(i: Int) {
+ def times(f: => Unit) = {
+ var x = 1
+ while (x <= i) {
+ f
+ x += 1
+ }
+ }
+ }
+
+ /* Used in conjunction with TimesInt since it's Scala 2.9.3
+ * instead of 2.10 and we don't have implicit classes */
+ implicit def intToTimesInt(i: Int) = new TimesInt(i)
+
+ /* See TimesInt for use example */
+ def timeIt(f: => Unit, iters: Int): Long = {
+
+ val start = System.currentTimeMillis
+ iters.times(f)
+ System.currentTimeMillis - start
+
+ }
+
}
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/09bdfe3b/core/src/main/scala/org/apache/spark/util/XORShiftRandom.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/util/XORShiftRandom.scala b/core/src/main/scala/org/apache/spark/util/XORShiftRandom.scala
new file mode 100644
index 0000000..3c189c1
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/util/XORShiftRandom.scala
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util
+
+import java.util.{Random => JavaRandom}
+import Utils.{TimesInt, intToTimesInt, timeIt}
+
+class XORShiftRandom(init: Long) extends JavaRandom(init) {
+
+ def this() = this(System.nanoTime)
+
+ var seed = init
+
+ // we need to just override next - this will be called by nextInt, nextDouble,
+ // nextGaussian, nextLong, etc.
+ override protected def next(bits: Int): Int = {
+
+ var nextSeed = seed ^ (seed << 21)
+ nextSeed ^= (nextSeed >>> 35)
+ nextSeed ^= (nextSeed << 4)
+ seed = nextSeed
+ (nextSeed & ((1L << bits) -1)).asInstanceOf[Int]
+ }
+}
+
+object XORShiftRandom {
+
+ def benchmark(numIters: Int) = {
+
+ val seed = 1L
+ val million = 1e6.toInt
+ val javaRand = new JavaRandom(seed)
+ val xorRand = new XORShiftRandom(seed)
+
+ // warm up the JIT
+ million.times {
+ javaRand.nextInt
+ xorRand.nextInt
+ }
+
+ /* Return results as a map instead of just printing to screen
+ in case the user wants to do something with them */
+ Map("javaTime" -> timeIt(javaRand.nextInt, numIters),
+ "xorTime" -> timeIt(xorRand.nextInt, numIters))
+
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/09bdfe3b/core/src/test/scala/org/apache/spark/util/XORShiftRandomSuite.scala
----------------------------------------------------------------------
diff --git a/core/src/test/scala/org/apache/spark/util/XORShiftRandomSuite.scala b/core/src/test/scala/org/apache/spark/util/XORShiftRandomSuite.scala
new file mode 100644
index 0000000..1691cb4
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/util/XORShiftRandomSuite.scala
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util
+
+import java.util.Random
+import org.scalatest.FlatSpec
+import org.scalatest.FunSuite
+import org.scalatest.matchers.ShouldMatchers
+import org.apache.spark.util.Utils.{TimesInt, intToTimesInt, timeIt}
+
+class XORShiftRandomSuite extends FunSuite with ShouldMatchers {
+
+ def fixture = new {
+ val seed = 1L
+ val xorRand = new XORShiftRandom(seed)
+ val hundMil = 1e8.toInt
+ }
+
+ /*
+ * This test is based on a chi-squared test for randomness. The values are hard-coded
+ * so as not to create Spark's dependency on apache.commons.math3 just to call one
+ * method for calculating the exact p-value for a given number of random numbers
+ * and bins. In case one would want to move to a full-fledged test based on
+ * apache.commons.math3, the relevant class is here:
+ * org.apache.commons.math3.stat.inference.ChiSquareTest
+ */
+ test ("XORShift generates valid random numbers") {
+
+ val f = fixture
+
+ val numBins = 10
+ // create 10 bins
+ val bins = Array.fill(numBins)(0)
+
+ // populate bins based on modulus of the random number
+ f.hundMil.times(bins(math.abs(f.xorRand.nextInt) % 10) += 1)
+
+ /* since the seed is deterministic, until the algorithm is changed, we know the result will be
+ * exactly this: Array(10004908, 9993136, 9994600, 10000744, 10000091, 10002474, 10002272,
+ * 10000790, 10002286, 9998699), so the test will never fail at the prespecified (5%)
+ * significance level. However, should the RNG implementation change, the test should still
+ * pass at the same significance level. The chi-squared test done in R gave the following
+ * results:
+ * > chisq.test(c(10004908, 9993136, 9994600, 10000744, 10000091, 10002474, 10002272,
+ * 10000790, 10002286, 9998699))
+ * Chi-squared test for given probabilities
+ * data: c(10004908, 9993136, 9994600, 10000744, 10000091, 10002474, 10002272, 10000790,
+ * 10002286, 9998699)
+ * X-squared = 11.975, df = 9, p-value = 0.2147
+ * Note that the p-value was ~0.22. The test will fail if alpha < 0.05, which for 100 million
+ * random numbers
+ * and 10 bins will happen at X-squared of ~16.9196. So, the test will fail if X-squared
+ * is greater than or equal to that number.
+ */
+ val binSize = f.hundMil/numBins
+ val xSquared = bins.map(x => math.pow((binSize - x), 2)/binSize).sum
+ xSquared should be < (16.9196)
+
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/09bdfe3b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index edbf77d..56bcb6c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -18,7 +18,7 @@
package org.apache.spark.mllib.clustering
import scala.collection.mutable.ArrayBuffer
-import scala.util.Random
+import org.apache.spark.util.{XORShiftRandom => Random}
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
[5/5] git commit: Merge pull request #185 from
mkolod/random-number-generator
Posted by ma...@apache.org.
Merge pull request #185 from mkolod/random-number-generator
XORShift RNG with unit tests and benchmark
This patch was introduced to address SPARK-950 - the discussion below the ticket explains not only the rationale, but also the design and testing decisions: https://spark-project.atlassian.net/browse/SPARK-950
To run unit test, start SBT console and type:
compile
test-only org.apache.spark.util.XORShiftRandomSuite
To run benchmark, type:
project core
console
Once the Scala console starts, type:
org.apache.spark.util.XORShiftRandom.benchmark(100000000)
XORShiftRandom is also an object with a main method taking the
number of iterations as an argument, so you can also run it
from the command line.
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/65de73c7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/65de73c7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/65de73c7
Branch: refs/heads/master
Commit: 65de73c7f8529be363e9ae814d0d4eab0da112aa
Parents: 972171b 2272465
Author: Matei Zaharia <ma...@eecs.berkeley.edu>
Authored: Sun Nov 24 15:52:33 2013 -0800
Committer: Matei Zaharia <ma...@eecs.berkeley.edu>
Committed: Sun Nov 24 15:52:33 2013 -0800
----------------------------------------------------------------------
.../scala/org/apache/spark/util/Utils.scala | 24 +++++
.../org/apache/spark/util/XORShiftRandom.scala | 94 ++++++++++++++++++++
.../apache/spark/util/XORShiftRandomSuite.scala | 76 ++++++++++++++++
.../apache/spark/mllib/clustering/KMeans.scala | 11 +--
4 files changed, 200 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
[4/5] git commit: Make XORShiftRandom explicit in KMeans and roll it
back for RDD
Posted by ma...@apache.org.
Make XORShiftRandom explicit in KMeans and roll it back for RDD
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/22724659
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/22724659
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/22724659
Branch: refs/heads/master
Commit: 22724659db8d711492f58c90d530be2f4a5b3de9
Parents: bcc6ed3
Author: Marek Kolodziej <mk...@gmail.com>
Authored: Wed Nov 20 07:03:36 2013 -0500
Committer: Marek Kolodziej <mk...@gmail.com>
Committed: Wed Nov 20 07:03:36 2013 -0500
----------------------------------------------------------------------
core/src/main/scala/org/apache/spark/rdd/RDD.scala | 4 +++-
.../scala/org/apache/spark/mllib/clustering/KMeans.scala | 8 ++++----
2 files changed, 7 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/22724659/core/src/main/scala/org/apache/spark/rdd/RDD.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index e738bfb..6e88be6 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -17,6 +17,8 @@
package org.apache.spark.rdd
+import java.util.Random
+
import scala.collection.Map
import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.mutable.ArrayBuffer
@@ -36,7 +38,7 @@ import org.apache.spark.partial.CountEvaluator
import org.apache.spark.partial.GroupedCountEvaluator
import org.apache.spark.partial.PartialResult
import org.apache.spark.storage.StorageLevel
-import org.apache.spark.util.{Utils, BoundedPriorityQueue, XORShiftRandom => Random}
+import org.apache.spark.util.{Utils, BoundedPriorityQueue}
import org.apache.spark.SparkContext._
import org.apache.spark._
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/22724659/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index f09ea9e..0dee939 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -26,7 +26,7 @@ import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.apache.spark.Logging
import org.apache.spark.mllib.util.MLUtils
-import org.apache.spark.util.{XORShiftRandom => Random}
+import org.apache.spark.util.XORShiftRandom
@@ -196,7 +196,7 @@ class KMeans private (
*/
private def initRandom(data: RDD[Array[Double]]): Array[ClusterCenters] = {
// Sample all the cluster centers in one pass to avoid repeated scans
- val sample = data.takeSample(true, runs * k, new Random().nextInt()).toSeq
+ val sample = data.takeSample(true, runs * k, new XORShiftRandom().nextInt()).toSeq
Array.tabulate(runs)(r => sample.slice(r * k, (r + 1) * k).toArray)
}
@@ -211,7 +211,7 @@ class KMeans private (
*/
private def initKMeansParallel(data: RDD[Array[Double]]): Array[ClusterCenters] = {
// Initialize each run's center to a random point
- val seed = new Random().nextInt()
+ val seed = new XORShiftRandom().nextInt()
val sample = data.takeSample(true, runs, seed).toSeq
val centers = Array.tabulate(runs)(r => ArrayBuffer(sample(r)))
@@ -223,7 +223,7 @@ class KMeans private (
for (r <- 0 until runs) yield (r, KMeans.pointCost(centerArrays(r), point))
}.reduceByKey(_ + _).collectAsMap()
val chosen = data.mapPartitionsWithIndex { (index, points) =>
- val rand = new Random(seed ^ (step << 16) ^ index)
+ val rand = new XORShiftRandom(seed ^ (step << 16) ^ index)
for {
p <- points
r <- 0 until runs
[2/5] git commit: Updates to reflect pull request code review
Posted by ma...@apache.org.
Updates to reflect pull request code review
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/99cfe89c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/99cfe89c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/99cfe89c
Branch: refs/heads/master
Commit: 99cfe89c688ee1499d2723d8ea909651995abe86
Parents: 09bdfe3
Author: Marek Kolodziej <mk...@gmail.com>
Authored: Mon Nov 18 22:00:36 2013 -0500
Committer: Marek Kolodziej <mk...@gmail.com>
Committed: Mon Nov 18 22:00:36 2013 -0500
----------------------------------------------------------------------
.../main/scala/org/apache/spark/rdd/RDD.scala | 4 +-
.../scala/org/apache/spark/util/Utils.scala | 43 ++++++---------
.../org/apache/spark/util/XORShiftRandom.scala | 55 +++++++++++++++-----
.../apache/spark/util/XORShiftRandomSuite.scala | 10 ++--
.../apache/spark/mllib/clustering/KMeans.scala | 5 +-
5 files changed, 69 insertions(+), 48 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/99cfe89c/core/src/main/scala/org/apache/spark/rdd/RDD.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index dd9c32f..e738bfb 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -17,8 +17,6 @@
package org.apache.spark.rdd
-import org.apache.spark.util.{XORShiftRandom => Random}
-
import scala.collection.Map
import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.mutable.ArrayBuffer
@@ -38,7 +36,7 @@ import org.apache.spark.partial.CountEvaluator
import org.apache.spark.partial.GroupedCountEvaluator
import org.apache.spark.partial.PartialResult
import org.apache.spark.storage.StorageLevel
-import org.apache.spark.util.{Utils, BoundedPriorityQueue}
+import org.apache.spark.util.{Utils, BoundedPriorityQueue, XORShiftRandom => Random}
import org.apache.spark.SparkContext._
import org.apache.spark._
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/99cfe89c/core/src/main/scala/org/apache/spark/util/Utils.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index 2df7108..b98a810 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -818,42 +818,33 @@ private[spark] object Utils extends Logging {
hashAbs
}
- /* Returns a copy of the system properties that is thread-safe to iterator over. */
+ /** Returns a copy of the system properties that is thread-safe to iterator over. */
def getSystemProperties(): Map[String, String] = {
return System.getProperties().clone()
.asInstanceOf[java.util.Properties].toMap[String, String]
}
- /* Used for performance tersting along with the intToTimesInt() and timeIt methods
- * It uses a while loop instead of a for comprehension since the JIT will
- * optimize the while loop better than the "for" closure
- * e.g.
- * import org.apache.spark.util.Utils.{TimesInt, intToTimesInt, timeIt}
- * import java.util.Random
- * val rand = new Random()
- * timeIt(rand.nextDouble, 10000000)
+ /**
+ * Method executed for repeating a task for side effects.
+ * Unlike a for comprehension, it permits JVM JIT optimization
*/
- class TimesInt(i: Int) {
- def times(f: => Unit) = {
- var x = 1
- while (x <= i) {
- f
- x += 1
+ def times(numIters: Int)(f: => Unit): Unit = {
+ var i = 0
+ while (i < numIters) {
+ f
+ i += 1
}
- }
}
-
- /* Used in conjunction with TimesInt since it's Scala 2.9.3
- * instead of 2.10 and we don't have implicit classes */
- implicit def intToTimesInt(i: Int) = new TimesInt(i)
-
- /* See TimesInt for use example */
- def timeIt(f: => Unit, iters: Int): Long = {
+ /**
+ * Timing method based on iterations that permit JVM JIT optimization.
+ * @param numIters number of iterations
+ * @param f function to be executed
+ */
+ def timeIt(numIters: Int)(f: => Unit): Long = {
val start = System.currentTimeMillis
- iters.times(f)
+ times(numIters)(f)
System.currentTimeMillis - start
-
}
-
+
}
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/99cfe89c/core/src/main/scala/org/apache/spark/util/XORShiftRandom.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/util/XORShiftRandom.scala b/core/src/main/scala/org/apache/spark/util/XORShiftRandom.scala
index 3c189c1..d443595 100644
--- a/core/src/main/scala/org/apache/spark/util/XORShiftRandom.scala
+++ b/core/src/main/scala/org/apache/spark/util/XORShiftRandom.scala
@@ -18,18 +18,28 @@
package org.apache.spark.util
import java.util.{Random => JavaRandom}
-import Utils.{TimesInt, intToTimesInt, timeIt}
+import org.apache.spark.util.Utils.timeIt
+/**
+ * This class implements a XORShift random number generator algorithm
+ * Source:
+ * Marsaglia, G. (2003). Xorshift RNGs. Journal of Statistical Software, Vol. 8, Issue 14.
+ * @see <a href="http://www.jstatsoft.org/v08/i14/paper">Paper</a>
+ * This implementation is approximately 3.5 times faster than
+ * {@link java.util.Random java.util.Random}, partly because of the algorithm, but also due
+ * to renouncing thread safety. JDK's implementation uses an AtomicLong seed, this class
+ * uses a regular Long. We can forgo thread safety since we use a new instance of the RNG
+ * for each thread.
+ */
class XORShiftRandom(init: Long) extends JavaRandom(init) {
def this() = this(System.nanoTime)
- var seed = init
+ private var seed = init
// we need to just override next - this will be called by nextInt, nextDouble,
// nextGaussian, nextLong, etc.
- override protected def next(bits: Int): Int = {
-
+ override protected def next(bits: Int): Int = {
var nextSeed = seed ^ (seed << 21)
nextSeed ^= (nextSeed >>> 35)
nextSeed ^= (nextSeed << 4)
@@ -38,25 +48,46 @@ class XORShiftRandom(init: Long) extends JavaRandom(init) {
}
}
+/** Contains benchmark method and main method to run benchmark of the RNG */
object XORShiftRandom {
+ /**
+ * Main method for running benchmark
+ * @param args takes one argument - the number of random numbers to generate
+ */
+ def main(args: Array[String]): Unit = {
+ if (args.length != 1) {
+ println("Benchmark of XORShiftRandom vis-a-vis java.util.Random")
+ println("Usage: XORShiftRandom number_of_random_numbers_to_generate")
+ System.exit(1)
+ }
+ println(benchmark(args(0).toInt))
+ }
+
+ /**
+ * @param numIters Number of random numbers to generate while running the benchmark
+ * @return Map of execution times for {@link java.util.Random java.util.Random}
+ * and XORShift
+ */
def benchmark(numIters: Int) = {
val seed = 1L
val million = 1e6.toInt
val javaRand = new JavaRandom(seed)
val xorRand = new XORShiftRandom(seed)
-
- // warm up the JIT
- million.times {
- javaRand.nextInt
- xorRand.nextInt
+
+ // this is just to warm up the JIT - we're not timing anything
+ timeIt(1e6.toInt) {
+ javaRand.nextInt()
+ xorRand.nextInt()
}
+ val iters = timeIt(numIters)(_)
+
/* Return results as a map instead of just printing to screen
- in case the user wants to do something with them */
- Map("javaTime" -> timeIt(javaRand.nextInt, numIters),
- "xorTime" -> timeIt(xorRand.nextInt, numIters))
+ in case the user wants to do something with them */
+ Map("javaTime" -> iters {javaRand.nextInt()},
+ "xorTime" -> iters {xorRand.nextInt()})
}
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/99cfe89c/core/src/test/scala/org/apache/spark/util/XORShiftRandomSuite.scala
----------------------------------------------------------------------
diff --git a/core/src/test/scala/org/apache/spark/util/XORShiftRandomSuite.scala b/core/src/test/scala/org/apache/spark/util/XORShiftRandomSuite.scala
index 1691cb4..b78367b 100644
--- a/core/src/test/scala/org/apache/spark/util/XORShiftRandomSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/XORShiftRandomSuite.scala
@@ -21,7 +21,7 @@ import java.util.Random
import org.scalatest.FlatSpec
import org.scalatest.FunSuite
import org.scalatest.matchers.ShouldMatchers
-import org.apache.spark.util.Utils.{TimesInt, intToTimesInt, timeIt}
+import org.apache.spark.util.Utils.times
class XORShiftRandomSuite extends FunSuite with ShouldMatchers {
@@ -48,7 +48,7 @@ class XORShiftRandomSuite extends FunSuite with ShouldMatchers {
val bins = Array.fill(numBins)(0)
// populate bins based on modulus of the random number
- f.hundMil.times(bins(math.abs(f.xorRand.nextInt) % 10) += 1)
+ times(f.hundMil) {bins(math.abs(f.xorRand.nextInt) % 10) += 1}
/* since the seed is deterministic, until the algorithm is changed, we know the result will be
* exactly this: Array(10004908, 9993136, 9994600, 10000744, 10000091, 10002474, 10002272,
@@ -67,9 +67,9 @@ class XORShiftRandomSuite extends FunSuite with ShouldMatchers {
* and 10 bins will happen at X-squared of ~16.9196. So, the test will fail if X-squared
* is greater than or equal to that number.
*/
- val binSize = f.hundMil/numBins
- val xSquared = bins.map(x => math.pow((binSize - x), 2)/binSize).sum
- xSquared should be < (16.9196)
+ val binSize = f.hundMil/numBins
+ val xSquared = bins.map(x => math.pow((binSize - x), 2)/binSize).sum
+ xSquared should be < (16.9196)
}
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/99cfe89c/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index 56bcb6c..f09ea9e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -18,15 +18,16 @@
package org.apache.spark.mllib.clustering
import scala.collection.mutable.ArrayBuffer
-import org.apache.spark.util.{XORShiftRandom => Random}
+
+import org.jblas.DoubleMatrix
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.apache.spark.Logging
import org.apache.spark.mllib.util.MLUtils
+import org.apache.spark.util.{XORShiftRandom => Random}
-import org.jblas.DoubleMatrix
/**
[3/5] git commit: Formatting and scoping (private[spark]) updates
Posted by ma...@apache.org.
Formatting and scoping (private[spark]) updates
Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/bcc6ed30
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/bcc6ed30
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/bcc6ed30
Branch: refs/heads/master
Commit: bcc6ed30bf7189ebf0226f212b4e39830b830b6e
Parents: 99cfe89
Author: Marek Kolodziej <mk...@gmail.com>
Authored: Tue Nov 19 20:50:38 2013 -0500
Committer: Marek Kolodziej <mk...@gmail.com>
Committed: Tue Nov 19 20:50:38 2013 -0500
----------------------------------------------------------------------
core/src/main/scala/org/apache/spark/util/Utils.scala | 2 +-
core/src/main/scala/org/apache/spark/util/XORShiftRandom.scala | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/bcc6ed30/core/src/main/scala/org/apache/spark/util/Utils.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index b98a810..a79e64e 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -833,7 +833,7 @@ private[spark] object Utils extends Logging {
while (i < numIters) {
f
i += 1
- }
+ }
}
/**
http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/bcc6ed30/core/src/main/scala/org/apache/spark/util/XORShiftRandom.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/util/XORShiftRandom.scala b/core/src/main/scala/org/apache/spark/util/XORShiftRandom.scala
index d443595..e9907e6 100644
--- a/core/src/main/scala/org/apache/spark/util/XORShiftRandom.scala
+++ b/core/src/main/scala/org/apache/spark/util/XORShiftRandom.scala
@@ -31,7 +31,7 @@ import org.apache.spark.util.Utils.timeIt
* uses a regular Long. We can forgo thread safety since we use a new instance of the RNG
* for each thread.
*/
-class XORShiftRandom(init: Long) extends JavaRandom(init) {
+private[spark] class XORShiftRandom(init: Long) extends JavaRandom(init) {
def this() = this(System.nanoTime)
@@ -49,7 +49,7 @@ class XORShiftRandom(init: Long) extends JavaRandom(init) {
}
/** Contains benchmark method and main method to run benchmark of the RNG */
-object XORShiftRandom {
+private[spark] object XORShiftRandom {
/**
* Main method for running benchmark