Posted to commits@spark.apache.org by ho...@apache.org on 2018/07/20 16:19:08 UTC

spark git commit: [SPARK-23451][ML] Deprecate KMeans.computeCost

Repository: spark
Updated Branches:
  refs/heads/master e0b638321 -> cc4d64bb1


[SPARK-23451][ML] Deprecate KMeans.computeCost

## What changes were proposed in this pull request?

Deprecate `KMeans.computeCost`, which was introduced as a temporary fix and is no longer needed now that `ClusteringEvaluator` is available.
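
For reference, a minimal sketch of the recommended replacement, assuming a `dataset` with a "features" vector column is already in scope. Note that `ClusteringEvaluator` computes the silhouette score (higher is better), not the sum of squared distances returned by `computeCost`, so the two values are not directly comparable.

```
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.evaluation.ClusteringEvaluator

// `dataset` is assumed to be a DataFrame with a "features" vector column.
val model = new KMeans().setK(2).setSeed(1L).fit(dataset)
val predictions = model.transform(dataset)

// Evaluate the clustering with the silhouette metric (the default).
val evaluator = new ClusteringEvaluator()
val silhouette = evaluator.evaluate(predictions)
```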

## How was this patch tested?

Manual test (deprecation warning displayed).
Scala
```
...
scala> model.computeCost(dataset)
warning: there was one deprecation warning; re-run with -deprecation for details
res1: Double = 0.0
```

Python
```
>>> import warnings
>>> warnings.simplefilter('always', DeprecationWarning)
...
>>> model.computeCost(df)
/Users/mgaido/apache/spark/python/pyspark/ml/clustering.py:330: DeprecationWarning: Deprecated in 2.4.0. It will be removed in 3.0.0. Use ClusteringEvaluator instead.
  " instead.", DeprecationWarning)
```

Author: Marco Gaido <ma...@gmail.com>

Closes #20629 from mgaido91/SPARK-23451.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cc4d64bb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cc4d64bb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cc4d64bb

Branch: refs/heads/master
Commit: cc4d64bb16987eb5a41d4198bf4a5882e549a94f
Parents: e0b6383
Author: Marco Gaido <ma...@gmail.com>
Authored: Fri Jul 20 09:18:57 2018 -0700
Committer: Holden Karau <ho...@pigscanfly.ca>
Committed: Fri Jul 20 09:18:57 2018 -0700

----------------------------------------------------------------------
 .../org/apache/spark/ml/clustering/KMeans.scala  | 19 ++++++++++++++++---
 .../apache/spark/mllib/clustering/KMeans.scala   |  2 +-
 .../spark/mllib/clustering/KMeansModel.scala     | 11 +++++++----
 .../apache/spark/ml/clustering/KMeansSuite.scala |  2 ++
 python/pyspark/ml/clustering.py                  | 19 ++++++++++++++++++-
 5 files changed, 44 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/cc4d64bb/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
index f40037a..6f4a30d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -145,8 +145,12 @@ class KMeansModel private[ml] (
   /**
    * Return the K-means cost (sum of squared distances of points to their nearest center) for this
    * model on the given data.
+   *
+   * @deprecated This method is deprecated and will be removed in 3.0.0. Use ClusteringEvaluator
+   *             instead. You can also get the cost on the training dataset in the summary.
    */
-  // TODO: Replace the temp fix when we have proper evaluators defined for clustering.
+  @deprecated("This method is deprecated and will be removed in 3.0.0. Use ClusteringEvaluator " +
+    "instead. You can also get the cost on the training dataset in the summary.", "2.4.0")
   @Since("2.0.0")
   def computeCost(dataset: Dataset[_]): Double = {
     SchemaUtils.validateVectorCompatibleColumn(dataset.schema, getFeaturesCol)
@@ -356,7 +360,12 @@ class KMeans @Since("1.5.0") (
     val parentModel = algo.run(instances, Option(instr))
     val model = copyValues(new KMeansModel(uid, parentModel).setParent(this))
     val summary = new KMeansSummary(
-      model.transform(dataset), $(predictionCol), $(featuresCol), $(k), parentModel.numIter)
+      model.transform(dataset),
+      $(predictionCol),
+      $(featuresCol),
+      $(k),
+      parentModel.numIter,
+      parentModel.trainingCost)
 
     model.setSummary(Some(summary))
     instr.logNamedValue("clusterSizes", summary.clusterSizes)
@@ -389,6 +398,8 @@ object KMeans extends DefaultParamsReadable[KMeans] {
  * @param featuresCol  Name for column of features in `predictions`.
  * @param k  Number of clusters.
  * @param numIter  Number of iterations.
+ * @param trainingCost K-means cost (sum of squared distances to the nearest centroid for all
+ *                     points in the training dataset). This is equivalent to sklearn's inertia.
  */
 @Since("2.0.0")
 @Experimental
@@ -397,4 +408,6 @@ class KMeansSummary private[clustering] (
     predictionCol: String,
     featuresCol: String,
     k: Int,
-    numIter: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k, numIter)
+    numIter: Int,
+    @Since("2.4.0") val trainingCost: Double)
+  extends ClusteringSummary(predictions, predictionCol, featuresCol, k, numIter)

http://git-wip-us.apache.org/repos/asf/spark/blob/cc4d64bb/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index 4f554f4..55df8a3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -348,7 +348,7 @@ class KMeans private (
 
     logInfo(s"The cost is $cost.")
 
-    new KMeansModel(centers.map(_.vector), distanceMeasure, iteration)
+    new KMeansModel(centers.map(_.vector), distanceMeasure, cost, iteration)
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/spark/blob/cc4d64bb/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
index e3a88b4..d5c8188 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
@@ -38,6 +38,7 @@ import org.apache.spark.sql.{Row, SparkSession}
 @Since("0.8.0")
 class KMeansModel (@Since("1.0.0") val clusterCenters: Array[Vector],
   @Since("2.4.0") val distanceMeasure: String,
+  @Since("2.4.0") val trainingCost: Double,
   private[spark] val numIter: Int)
   extends Saveable with Serializable with PMMLExportable {
 
@@ -49,11 +50,11 @@ class KMeansModel (@Since("1.0.0") val clusterCenters: Array[Vector],
 
   @Since("2.4.0")
   private[spark] def this(clusterCenters: Array[Vector], distanceMeasure: String) =
-    this(clusterCenters: Array[Vector], distanceMeasure, -1)
+    this(clusterCenters: Array[Vector], distanceMeasure, 0.0, -1)
 
   @Since("1.1.0")
   def this(clusterCenters: Array[Vector]) =
-    this(clusterCenters: Array[Vector], DistanceMeasure.EUCLIDEAN)
+    this(clusterCenters: Array[Vector], DistanceMeasure.EUCLIDEAN, 0.0, -1)
 
   /**
    * A Java-friendly constructor that takes an Iterable of Vectors.
@@ -187,7 +188,8 @@ object KMeansModel extends Loader[KMeansModel] {
       val spark = SparkSession.builder().sparkContext(sc).getOrCreate()
       val metadata = compact(render(
         ("class" -> thisClassName) ~ ("version" -> thisFormatVersion)
-         ~ ("k" -> model.k) ~ ("distanceMeasure" -> model.distanceMeasure)))
+         ~ ("k" -> model.k) ~ ("distanceMeasure" -> model.distanceMeasure)
+         ~ ("trainingCost" -> model.trainingCost)))
       sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path))
       val dataRDD = sc.parallelize(model.clusterCentersWithNorm.zipWithIndex).map { case (p, id) =>
         Cluster(id, p.vector)
@@ -207,7 +209,8 @@ object KMeansModel extends Loader[KMeansModel] {
       val localCentroids = centroids.rdd.map(Cluster.apply).collect()
       assert(k == localCentroids.length)
       val distanceMeasure = (metadata \ "distanceMeasure").extract[String]
-      new KMeansModel(localCentroids.sortBy(_.id).map(_.point), distanceMeasure)
+      val trainingCost = (metadata \ "trainingCost").extract[Double]
+      new KMeansModel(localCentroids.sortBy(_.id).map(_.point), distanceMeasure, trainingCost, -1)
     }
   }
 }
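
Because `trainingCost` is now written into the model metadata, it survives a save/load round trip. A sketch, assuming a `SparkContext` `sc` and a trained `mllib` `KMeansModel` `model`; the path is hypothetical:

```
import org.apache.spark.mllib.clustering.KMeansModel

// trainingCost is persisted in the metadata JSON and restored on load;
// numIter is not persisted, so loaded models report -1 iterations.
model.save(sc, "/tmp/kmeans-model")  // hypothetical path
val restored = KMeansModel.load(sc, "/tmp/kmeans-model")
assert(restored.trainingCost == model.trainingCost)
```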

http://git-wip-us.apache.org/repos/asf/spark/blob/cc4d64bb/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
index 829c90f..9b0b526 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
@@ -131,6 +131,8 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
       assert(summary.predictions.columns.contains(c))
     }
     assert(summary.cluster.columns === Array(predictionColName))
+    assert(summary.trainingCost < 0.1)
+    assert(model.computeCost(dataset) == summary.trainingCost)
     val clusterSizes = summary.clusterSizes
     assert(clusterSizes.length === k)
     assert(clusterSizes.sum === numRows)

http://git-wip-us.apache.org/repos/asf/spark/blob/cc4d64bb/python/pyspark/ml/clustering.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 2f06600..8a58d83 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -16,6 +16,7 @@
 #
 
 import sys
+import warnings
 
 from pyspark import since, keyword_only
 from pyspark.ml.util import *
@@ -303,7 +304,15 @@ class KMeansSummary(ClusteringSummary):
 
     .. versionadded:: 2.1.0
     """
-    pass
+
+    @property
+    @since("2.4.0")
+    def trainingCost(self):
+        """
+        K-means cost (sum of squared distances to the nearest centroid for all points in the
+        training dataset). This is equivalent to sklearn's inertia.
+        """
+        return self._call_java("trainingCost")
 
 
 class KMeansModel(JavaModel, JavaMLWritable, JavaMLReadable):
@@ -323,7 +332,13 @@ class KMeansModel(JavaModel, JavaMLWritable, JavaMLReadable):
         """
         Return the K-means cost (sum of squared distances of points to their nearest center)
         for this model on the given data.
+
+        .. note:: Deprecated in 2.4.0. It will be removed in 3.0.0. Use ClusteringEvaluator instead.
+           You can also get the cost on the training dataset in the summary.
         """
+        warnings.warn("Deprecated in 2.4.0. It will be removed in 3.0.0. Use ClusteringEvaluator "
+                      "instead. You can also get the cost on the training dataset in the summary.",
+                      DeprecationWarning)
         return self._call_java("computeCost", dataset)
 
     @property
@@ -379,6 +394,8 @@ class KMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPredictionCol
     2
     >>> summary.clusterSizes
     [2, 2]
+    >>> summary.trainingCost
+    2.000...
     >>> kmeans_path = temp_path + "/kmeans"
     >>> kmeans.save(kmeans_path)
     >>> kmeans2 = KMeans.load(kmeans_path)

