You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sr...@apache.org on 2018/02/10 16:46:49 UTC
spark git commit: [SPARK-23344][PYTHON][ML] Add distanceMeasure param
to KMeans
Repository: spark
Updated Branches:
refs/heads/master 97a224a85 -> 0783876c8
[SPARK-23344][PYTHON][ML] Add distanceMeasure param to KMeans
## What changes were proposed in this pull request?
SPARK-22119 introduced a new parameter for KMeans, i.e. `distanceMeasure`. This PR also adds it to the Python interface.
## How was this patch tested?
Added unit tests (UTs).
Author: Marco Gaido <ma...@gmail.com>
Closes #20520 from mgaido91/SPARK-23344.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0783876c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0783876c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0783876c
Branch: refs/heads/master
Commit: 0783876c81f212e1422a1b7786c26e3ac8e84f9f
Parents: 97a224a
Author: Marco Gaido <ma...@gmail.com>
Authored: Sat Feb 10 10:46:45 2018 -0600
Committer: Sean Owen <so...@cloudera.com>
Committed: Sat Feb 10 10:46:45 2018 -0600
----------------------------------------------------------------------
python/pyspark/ml/clustering.py | 32 +++++++++++++++++++++++++++-----
python/pyspark/ml/tests.py | 18 ++++++++++++++++++
2 files changed, 45 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/0783876c/python/pyspark/ml/clustering.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 66fb005..6448b76 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -403,17 +403,23 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
typeConverter=TypeConverters.toString)
initSteps = Param(Params._dummy(), "initSteps", "The number of steps for k-means|| " +
"initialization mode. Must be > 0.", typeConverter=TypeConverters.toInt)
+ distanceMeasure = Param(Params._dummy(), "distanceMeasure", "The distance measure. " +
+ "Supported options: 'euclidean' and 'cosine'.",
+ typeConverter=TypeConverters.toString)
@keyword_only
def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
- initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None):
+ initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None,
+ distanceMeasure="euclidean"):
"""
__init__(self, featuresCol="features", predictionCol="prediction", k=2, \
- initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None)
+ initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None, \
+ distanceMeasure="euclidean")
"""
super(KMeans, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.KMeans", self.uid)
- self._setDefault(k=2, initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20)
+ self._setDefault(k=2, initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20,
+ distanceMeasure="euclidean")
kwargs = self._input_kwargs
self.setParams(**kwargs)
@@ -423,10 +429,12 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
@keyword_only
@since("1.5.0")
def setParams(self, featuresCol="features", predictionCol="prediction", k=2,
- initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None):
+ initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None,
+ distanceMeasure="euclidean"):
"""
setParams(self, featuresCol="features", predictionCol="prediction", k=2, \
- initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None)
+ initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None, \
+ distanceMeasure="euclidean")
Sets params for KMeans.
"""
@@ -475,6 +483,20 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
"""
return self.getOrDefault(self.initSteps)
+ @since("2.4.0")
+ def setDistanceMeasure(self, value):
+ """
+ Sets the value of :py:attr:`distanceMeasure`.
+ """
+ return self._set(distanceMeasure=value)
+
+ @since("2.4.0")
+ def getDistanceMeasure(self):
+ """
+ Gets the value of `distanceMeasure`
+ """
+ return self.getOrDefault(self.distanceMeasure)
+
class BisectingKMeansModel(JavaModel, JavaMLWritable, JavaMLReadable):
"""
http://git-wip-us.apache.org/repos/asf/spark/blob/0783876c/python/pyspark/ml/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index 75d0478..6d67372 100755
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -418,6 +418,9 @@ class ParamTests(PySparkTestCase):
self.assertEqual(algo.getK(), 10)
algo.setInitSteps(10)
self.assertEqual(algo.getInitSteps(), 10)
+ self.assertEqual(algo.getDistanceMeasure(), "euclidean")
+ algo.setDistanceMeasure("cosine")
+ self.assertEqual(algo.getDistanceMeasure(), "cosine")
def test_hasseed(self):
noSeedSpecd = TestParams()
@@ -1620,6 +1623,21 @@ class TrainingSummaryTest(SparkSessionTestCase):
self.assertEqual(s.k, 2)
+class KMeansTests(SparkSessionTestCase):
+
+ def test_kmeans_cosine_distance(self):
+ data = [(Vectors.dense([1.0, 1.0]),), (Vectors.dense([10.0, 10.0]),),
+ (Vectors.dense([1.0, 0.5]),), (Vectors.dense([10.0, 4.4]),),
+ (Vectors.dense([-1.0, 1.0]),), (Vectors.dense([-100.0, 90.0]),)]
+ df = self.spark.createDataFrame(data, ["features"])
+ kmeans = KMeans(k=3, seed=1, distanceMeasure="cosine")
+ model = kmeans.fit(df)
+ result = model.transform(df).collect()
+ self.assertTrue(result[0].prediction == result[1].prediction)
+ self.assertTrue(result[2].prediction == result[3].prediction)
+ self.assertTrue(result[4].prediction == result[5].prediction)
+
+
class OneVsRestTests(SparkSessionTestCase):
def test_copy(self):
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org