You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by jk...@apache.org on 2018/05/07 21:49:36 UTC
spark git commit: [SPARK-15750][MLLIB][PYSPARK] Constructing FPGrowth
fails when no numPartitions specified in pyspark
Repository: spark
Updated Branches:
refs/heads/master d83e96372 -> 56a52e0a5
[SPARK-15750][MLLIB][PYSPARK] Constructing FPGrowth fails when no numPartitions specified in pyspark
## What changes were proposed in this pull request?
Change the FPGrowth constructor from private to private[spark]. If numPartitions is not specified, the default value -1 is used. However, -1 is only valid in the FPGrowth constructor, not in setNumPartitions. So this change calls the constructor directly rather than using the setter methods.
## How was this patch tested?
A unit test is added.
Author: Jeff Zhang <zj...@apache.org>
Closes #13493 from zjffdu/SPARK-15750.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/56a52e0a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/56a52e0a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/56a52e0a
Branch: refs/heads/master
Commit: 56a52e0a58fc82ea69e47d0d8c4f905565be7c8b
Parents: d83e963
Author: Jeff Zhang <zj...@apache.org>
Authored: Mon May 7 14:47:58 2018 -0700
Committer: Joseph K. Bradley <jo...@databricks.com>
Committed: Mon May 7 14:47:58 2018 -0700
----------------------------------------------------------------------
.../apache/spark/mllib/api/python/PythonMLLibAPI.scala | 5 +----
.../scala/org/apache/spark/mllib/fpm/FPGrowth.scala | 2 +-
python/pyspark/mllib/tests.py | 12 ++++++++++++
3 files changed, 14 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/56a52e0a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index b32d3f2..db3f074 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -572,10 +572,7 @@ private[python] class PythonMLLibAPI extends Serializable {
data: JavaRDD[java.lang.Iterable[Any]],
minSupport: Double,
numPartitions: Int): FPGrowthModel[Any] = {
- val fpg = new FPGrowth()
- .setMinSupport(minSupport)
- .setNumPartitions(numPartitions)
-
+ val fpg = new FPGrowth(minSupport, numPartitions)
val model = fpg.run(data.rdd.map(_.asScala.toArray))
new FPGrowthModelWrapper(model)
}
http://git-wip-us.apache.org/repos/asf/spark/blob/56a52e0a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
index f6b1143..4f2b7e6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
@@ -162,7 +162,7 @@ object FPGrowthModel extends Loader[FPGrowthModel[_]] {
*
*/
@Since("1.3.0")
-class FPGrowth private (
+class FPGrowth private[spark] (
private var minSupport: Double,
private var numPartitions: Int) extends Logging with Serializable {
http://git-wip-us.apache.org/repos/asf/spark/blob/56a52e0a/python/pyspark/mllib/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 14d788b..4c2ce13 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -57,6 +57,7 @@ from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _
DenseMatrix, SparseMatrix, Vectors, Matrices, MatrixUDT
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.mllib.classification import StreamingLogisticRegressionWithSGD
+from pyspark.mllib.fpm import FPGrowth
from pyspark.mllib.recommendation import Rating
from pyspark.mllib.regression import LabeledPoint, StreamingLinearRegressionWithSGD
from pyspark.mllib.random import RandomRDDs
@@ -1762,6 +1763,17 @@ class DimensionalityReductionTests(MLlibTestCase):
self.assertEqualUpToSign(pcs.toArray()[:, k - 1], expected_pcs[:, k - 1])
+class FPGrowthTest(MLlibTestCase):
+
+ def test_fpgrowth(self):
+ data = [["a", "b", "c"], ["a", "b", "d", "e"], ["a", "c", "e"], ["a", "c", "f"]]
+ rdd = self.sc.parallelize(data, 2)
+ model1 = FPGrowth.train(rdd, 0.6, 2)
+ # use default data partition number when numPartitions is not specified
+ model2 = FPGrowth.train(rdd, 0.6)
+ self.assertEqual(sorted(model1.freqItemsets().collect()),
+ sorted(model2.freqItemsets().collect()))
+
if __name__ == "__main__":
from pyspark.mllib.tests import *
if not _have_scipy:
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org