You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sr...@apache.org on 2018/02/21 18:39:39 UTC

spark git commit: [SPARK-23217][ML][PYTHON] Add distanceMeasure param to ClusteringEvaluator Python API

Repository: spark
Updated Branches:
  refs/heads/master c8c4441df -> e836c27ce


[SPARK-23217][ML][PYTHON] Add distanceMeasure param to ClusteringEvaluator Python API

## What changes were proposed in this pull request?

This PR adds the `distanceMeasure` param to `ClusteringEvaluator` in the Python API. This allows the user to specify `cosine` as the distance measure, in addition to the default `squaredEuclidean`.

## How was this patch tested?

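Added a unit test (`test_clustering_evaluator_with_cosine_distance` in `python/pyspark/ml/tests.py`) and added `pyspark.ml.evaluation` to the modules checked by `DefaultValuesTests`.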

Author: Marco Gaido <ma...@gmail.com>

Closes #20627 from mgaido91/SPARK-23217_python.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e836c27c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e836c27c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e836c27c

Branch: refs/heads/master
Commit: e836c27ce011ca9aef822bef6320b4a7059ec343
Parents: c8c4441
Author: Marco Gaido <ma...@gmail.com>
Authored: Wed Feb 21 12:39:36 2018 -0600
Committer: Sean Owen <so...@cloudera.com>
Committed: Wed Feb 21 12:39:36 2018 -0600

----------------------------------------------------------------------
 python/pyspark/ml/evaluation.py | 28 +++++++++++++++++++++++-----
 python/pyspark/ml/tests.py      | 16 ++++++++++++++--
 2 files changed, 37 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/e836c27c/python/pyspark/ml/evaluation.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
index 0cbce9b..695d8ab 100644
--- a/python/pyspark/ml/evaluation.py
+++ b/python/pyspark/ml/evaluation.py
@@ -362,18 +362,21 @@ class ClusteringEvaluator(JavaEvaluator, HasPredictionCol, HasFeaturesCol,
     metricName = Param(Params._dummy(), "metricName",
                        "metric name in evaluation (silhouette)",
                        typeConverter=TypeConverters.toString)
+    distanceMeasure = Param(Params._dummy(), "distanceMeasure", "The distance measure. " +
+                            "Supported options: 'squaredEuclidean' and 'cosine'.",
+                            typeConverter=TypeConverters.toString)
 
     @keyword_only
     def __init__(self, predictionCol="prediction", featuresCol="features",
-                 metricName="silhouette"):
+                 metricName="silhouette", distanceMeasure="squaredEuclidean"):
         """
         __init__(self, predictionCol="prediction", featuresCol="features", \
-                 metricName="silhouette")
+                 metricName="silhouette", distanceMeasure="squaredEuclidean")
         """
         super(ClusteringEvaluator, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.evaluation.ClusteringEvaluator", self.uid)
-        self._setDefault(metricName="silhouette")
+        self._setDefault(metricName="silhouette", distanceMeasure="squaredEuclidean")
         kwargs = self._input_kwargs
         self._set(**kwargs)
 
@@ -394,15 +397,30 @@ class ClusteringEvaluator(JavaEvaluator, HasPredictionCol, HasFeaturesCol,
     @keyword_only
     @since("2.3.0")
     def setParams(self, predictionCol="prediction", featuresCol="features",
-                  metricName="silhouette"):
+                  metricName="silhouette", distanceMeasure="squaredEuclidean"):
         """
         setParams(self, predictionCol="prediction", featuresCol="features", \
-                  metricName="silhouette")
+                  metricName="silhouette", distanceMeasure="squaredEuclidean")
         Sets params for clustering evaluator.
         """
         kwargs = self._input_kwargs
         return self._set(**kwargs)
 
+    @since("2.4.0")
+    def setDistanceMeasure(self, value):
+        """
+        Sets the value of :py:attr:`distanceMeasure`.
+        """
+        return self._set(distanceMeasure=value)
+
+    @since("2.4.0")
+    def getDistanceMeasure(self):
+        """
+        Gets the value of `distanceMeasure`
+        """
+        return self.getOrDefault(self.distanceMeasure)
+
+
 if __name__ == "__main__":
     import doctest
     import tempfile

http://git-wip-us.apache.org/repos/asf/spark/blob/e836c27c/python/pyspark/ml/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index 6d67372..1168859 100755
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -51,7 +51,7 @@ from pyspark.ml import Estimator, Model, Pipeline, PipelineModel, Transformer, U
 from pyspark.ml.classification import *
 from pyspark.ml.clustering import *
 from pyspark.ml.common import _java2py, _py2java
-from pyspark.ml.evaluation import BinaryClassificationEvaluator, \
+from pyspark.ml.evaluation import BinaryClassificationEvaluator, ClusteringEvaluator, \
     MulticlassClassificationEvaluator, RegressionEvaluator
 from pyspark.ml.feature import *
 from pyspark.ml.fpm import FPGrowth, FPGrowthModel
@@ -541,6 +541,15 @@ class EvaluatorTests(SparkSessionTestCase):
         self.assertEqual(evaluator._java_obj.getMetricName(), "r2")
         self.assertEqual(evaluatorCopy._java_obj.getMetricName(), "mae")
 
+    def test_clustering_evaluator_with_cosine_distance(self):
+        featureAndPredictions = map(lambda x: (Vectors.dense(x[0]), x[1]),
+                                    [([1.0, 1.0], 1.0), ([10.0, 10.0], 1.0), ([1.0, 0.5], 2.0),
+                                     ([10.0, 4.4], 2.0), ([-1.0, 1.0], 3.0), ([-100.0, 90.0], 3.0)])
+        dataset = self.spark.createDataFrame(featureAndPredictions, ["features", "prediction"])
+        evaluator = ClusteringEvaluator(predictionCol="prediction", distanceMeasure="cosine")
+        self.assertEqual(evaluator.getDistanceMeasure(), "cosine")
+        self.assertTrue(np.isclose(evaluator.evaluate(dataset),  0.992671213, atol=1e-5))
+
 
 class FeatureTests(SparkSessionTestCase):
 
@@ -1961,11 +1970,14 @@ class DefaultValuesTests(PySparkTestCase):
         import pyspark.ml.feature
         import pyspark.ml.classification
         import pyspark.ml.clustering
+        import pyspark.ml.evaluation
         import pyspark.ml.pipeline
         import pyspark.ml.recommendation
         import pyspark.ml.regression
+
         modules = [pyspark.ml.feature, pyspark.ml.classification, pyspark.ml.clustering,
-                   pyspark.ml.pipeline, pyspark.ml.recommendation, pyspark.ml.regression]
+                   pyspark.ml.evaluation, pyspark.ml.pipeline, pyspark.ml.recommendation,
+                   pyspark.ml.regression]
         for module in modules:
             for name, cls in inspect.getmembers(module, inspect.isclass):
                 if not name.endswith('Model') and issubclass(cls, JavaParams)\


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org