Posted to commits@spark.apache.org by sr...@apache.org on 2016/09/04 11:40:52 UTC

spark git commit: [SPARK-17311][MLLIB] Standardize Python-Java MLlib API to accept optional long seeds in all cases

Repository: spark
Updated Branches:
  refs/heads/master e75c162e9 -> cdeb97a8c


[SPARK-17311][MLLIB] Standardize Python-Java MLlib API to accept optional long seeds in all cases

## What changes were proposed in this pull request?

Related to https://github.com/apache/spark/pull/14524 -- just the 'fix' rather than a behavior change.

- PythonMLlibAPI methods that take a seed now consistently accept a `java.lang.Long`, so the Python API can pass "no seed" (null)
- .mllib's Word2Vec was the odd one out in that its Python side picked its own random seed. It now defaults to None, letting the Scala implementation pick a seed (see the usage sketch after this list)
- BisectingKMeansModel arguably should not hard-code a seed either, for consistency with .mllib, but I have left it as is
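
Roughly what the change means on the Python side, as a minimal usage sketch (not part of this commit; the toy sentences and SparkContext setup are made up for illustration):

```python
from pyspark import SparkContext
from pyspark.mllib.feature import Word2Vec

sc = SparkContext(appName="word2vec-seed-example")
sentences = sc.parallelize([["hello", "spark"], ["hello", "mllib"]])

# No seed set: the JVM receives null for the seed and the Scala
# implementation chooses one itself (previously the Python side always
# generated a random seed and passed it along).
model_default = Word2Vec().setVectorSize(10).setMinCount(1).fit(sentences)

# Explicit seed: passed through as a long, for reproducible vectors.
model_seeded = Word2Vec().setVectorSize(10).setMinCount(1).setSeed(42).fit(sentences)

sc.stop()
```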

## How was this patch tested?

Existing tests

Author: Sean Owen <so...@cloudera.com>

Closes #14826 from srowen/SPARK-16832.2.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cdeb97a8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cdeb97a8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cdeb97a8

Branch: refs/heads/master
Commit: cdeb97a8cd26e3282cc2a4f126242ed2199f3898
Parents: e75c162
Author: Sean Owen <so...@cloudera.com>
Authored: Sun Sep 4 12:40:51 2016 +0100
Committer: Sean Owen <so...@cloudera.com>
Committed: Sun Sep 4 12:40:51 2016 +0100

----------------------------------------------------------------------
 .../spark/mllib/api/python/PythonMLLibAPI.scala | 20 +++++++++++---------
 python/pyspark/mllib/feature.py                 |  4 ++--
 2 files changed, 13 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/cdeb97a8/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index a80cca7..2ed6c6b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -126,13 +126,13 @@ private[python] class PythonMLLibAPI extends Serializable {
       k: Int,
       maxIterations: Int,
       minDivisibleClusterSize: Double,
-      seed: Long): BisectingKMeansModel = {
-    new BisectingKMeans()
+      seed: java.lang.Long): BisectingKMeansModel = {
+    val kmeans = new BisectingKMeans()
       .setK(k)
       .setMaxIterations(maxIterations)
       .setMinDivisibleClusterSize(minDivisibleClusterSize)
-      .setSeed(seed)
-      .run(data)
+    if (seed != null) kmeans.setSeed(seed)
+    kmeans.run(data)
   }
 
   /**
@@ -678,7 +678,7 @@ private[python] class PythonMLLibAPI extends Serializable {
       learningRate: Double,
       numPartitions: Int,
       numIterations: Int,
-      seed: Long,
+      seed: java.lang.Long,
       minCount: Int,
       windowSize: Int): Word2VecModelWrapper = {
     val word2vec = new Word2Vec()
@@ -686,9 +686,9 @@ private[python] class PythonMLLibAPI extends Serializable {
       .setLearningRate(learningRate)
       .setNumPartitions(numPartitions)
       .setNumIterations(numIterations)
-      .setSeed(seed)
       .setMinCount(minCount)
       .setWindowSize(windowSize)
+    if (seed != null) word2vec.setSeed(seed)
     try {
       val model = word2vec.fit(dataJRDD.rdd.persist(StorageLevel.MEMORY_AND_DISK_SER))
       new Word2VecModelWrapper(model)
@@ -751,7 +751,7 @@ private[python] class PythonMLLibAPI extends Serializable {
       impurityStr: String,
       maxDepth: Int,
       maxBins: Int,
-      seed: Int): RandomForestModel = {
+      seed: java.lang.Long): RandomForestModel = {
 
     val algo = Algo.fromString(algoStr)
     val impurity = Impurities.fromString(impurityStr)
@@ -763,11 +763,13 @@ private[python] class PythonMLLibAPI extends Serializable {
       maxBins = maxBins,
       categoricalFeaturesInfo = categoricalFeaturesInfo.asScala.toMap)
     val cached = data.rdd.persist(StorageLevel.MEMORY_AND_DISK)
+    // Only done because methods below want an int, not an optional Long
+    val intSeed = getSeedOrDefault(seed).toInt
     try {
       if (algo == Algo.Classification) {
-        RandomForest.trainClassifier(cached, strategy, numTrees, featureSubsetStrategy, seed)
+        RandomForest.trainClassifier(cached, strategy, numTrees, featureSubsetStrategy, intSeed)
       } else {
-        RandomForest.trainRegressor(cached, strategy, numTrees, featureSubsetStrategy, seed)
+        RandomForest.trainRegressor(cached, strategy, numTrees, featureSubsetStrategy, intSeed)
       }
     } finally {
       cached.unpersist(blocking = false)

http://git-wip-us.apache.org/repos/asf/spark/blob/cdeb97a8/python/pyspark/mllib/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 324ba97..b32d0c7 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -600,7 +600,7 @@ class Word2Vec(object):
         self.learningRate = 0.025
         self.numPartitions = 1
         self.numIterations = 1
-        self.seed = random.randint(0, sys.maxsize)
+        self.seed = None
         self.minCount = 5
         self.windowSize = 5
 
@@ -675,7 +675,7 @@ class Word2Vec(object):
             raise TypeError("data should be an RDD of list of string")
         jmodel = callMLlibFunc("trainWord2VecModel", data, int(self.vectorSize),
                                float(self.learningRate), int(self.numPartitions),
-                               int(self.numIterations), int(self.seed),
+                               int(self.numIterations), self.seed,
                                int(self.minCount), int(self.windowSize))
         return Word2VecModel(jmodel)
 

