You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ho...@apache.org on 2018/07/20 16:19:08 UTC
spark git commit: [SPARK-23451][ML] Deprecate KMeans.computeCost
Repository: spark
Updated Branches:
refs/heads/master e0b638321 -> cc4d64bb1
[SPARK-23451][ML] Deprecate KMeans.computeCost
## What changes were proposed in this pull request?
Deprecate `KMeans.computeCost`, which was introduced as a temporary fix and is no longer needed now that `ClusteringEvaluator` has been introduced.
## How was this patch tested?
Manual test (verified that the deprecation warning is displayed).
Scala
```
...
scala> model.computeCost(dataset)
warning: there was one deprecation warning; re-run with -deprecation for details
res1: Double = 0.0
```
Python
```
>>> import warnings
>>> warnings.simplefilter('always', DeprecationWarning)
...
>>> model.computeCost(df)
/Users/mgaido/apache/spark/python/pyspark/ml/clustering.py:330: DeprecationWarning: Deprecated in 2.4.0. It will be removed in 3.0.0. Use ClusteringEvaluator instead.
" instead.", DeprecationWarning)
```
Author: Marco Gaido <ma...@gmail.com>
Closes #20629 from mgaido91/SPARK-23451.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cc4d64bb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cc4d64bb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cc4d64bb
Branch: refs/heads/master
Commit: cc4d64bb16987eb5a41d4198bf4a5882e549a94f
Parents: e0b6383
Author: Marco Gaido <ma...@gmail.com>
Authored: Fri Jul 20 09:18:57 2018 -0700
Committer: Holden Karau <ho...@pigscanfly.ca>
Committed: Fri Jul 20 09:18:57 2018 -0700
----------------------------------------------------------------------
.../org/apache/spark/ml/clustering/KMeans.scala | 19 ++++++++++++++++---
.../apache/spark/mllib/clustering/KMeans.scala | 2 +-
.../spark/mllib/clustering/KMeansModel.scala | 11 +++++++----
.../apache/spark/ml/clustering/KMeansSuite.scala | 2 ++
python/pyspark/ml/clustering.py | 19 ++++++++++++++++++-
5 files changed, 44 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/cc4d64bb/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
index f40037a..6f4a30d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -145,8 +145,12 @@ class KMeansModel private[ml] (
/**
* Return the K-means cost (sum of squared distances of points to their nearest center) for this
* model on the given data.
+ *
+ * @deprecated This method is deprecated and will be removed in 3.0.0. Use ClusteringEvaluator
+ * instead. You can also get the cost on the training dataset in the summary.
*/
- // TODO: Replace the temp fix when we have proper evaluators defined for clustering.
+ @deprecated("This method is deprecated and will be removed in 3.0.0. Use ClusteringEvaluator " +
+ "instead. You can also get the cost on the training dataset in the summary.", "2.4.0")
@Since("2.0.0")
def computeCost(dataset: Dataset[_]): Double = {
SchemaUtils.validateVectorCompatibleColumn(dataset.schema, getFeaturesCol)
@@ -356,7 +360,12 @@ class KMeans @Since("1.5.0") (
val parentModel = algo.run(instances, Option(instr))
val model = copyValues(new KMeansModel(uid, parentModel).setParent(this))
val summary = new KMeansSummary(
- model.transform(dataset), $(predictionCol), $(featuresCol), $(k), parentModel.numIter)
+ model.transform(dataset),
+ $(predictionCol),
+ $(featuresCol),
+ $(k),
+ parentModel.numIter,
+ parentModel.trainingCost)
model.setSummary(Some(summary))
instr.logNamedValue("clusterSizes", summary.clusterSizes)
@@ -389,6 +398,8 @@ object KMeans extends DefaultParamsReadable[KMeans] {
* @param featuresCol Name for column of features in `predictions`.
* @param k Number of clusters.
* @param numIter Number of iterations.
+ * @param trainingCost K-means cost (sum of squared distances to the nearest centroid for all
+ * points in the training dataset). This is equivalent to sklearn's inertia.
*/
@Since("2.0.0")
@Experimental
@@ -397,4 +408,6 @@ class KMeansSummary private[clustering] (
predictionCol: String,
featuresCol: String,
k: Int,
- numIter: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k, numIter)
+ numIter: Int,
+ @Since("2.4.0") val trainingCost: Double)
+ extends ClusteringSummary(predictions, predictionCol, featuresCol, k, numIter)
http://git-wip-us.apache.org/repos/asf/spark/blob/cc4d64bb/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index 4f554f4..55df8a3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -348,7 +348,7 @@ class KMeans private (
logInfo(s"The cost is $cost.")
- new KMeansModel(centers.map(_.vector), distanceMeasure, iteration)
+ new KMeansModel(centers.map(_.vector), distanceMeasure, cost, iteration)
}
/**
http://git-wip-us.apache.org/repos/asf/spark/blob/cc4d64bb/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
index e3a88b4..d5c8188 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
@@ -38,6 +38,7 @@ import org.apache.spark.sql.{Row, SparkSession}
@Since("0.8.0")
class KMeansModel (@Since("1.0.0") val clusterCenters: Array[Vector],
@Since("2.4.0") val distanceMeasure: String,
+ @Since("2.4.0") val trainingCost: Double,
private[spark] val numIter: Int)
extends Saveable with Serializable with PMMLExportable {
@@ -49,11 +50,11 @@ class KMeansModel (@Since("1.0.0") val clusterCenters: Array[Vector],
@Since("2.4.0")
private[spark] def this(clusterCenters: Array[Vector], distanceMeasure: String) =
- this(clusterCenters: Array[Vector], distanceMeasure, -1)
+ this(clusterCenters: Array[Vector], distanceMeasure, 0.0, -1)
@Since("1.1.0")
def this(clusterCenters: Array[Vector]) =
- this(clusterCenters: Array[Vector], DistanceMeasure.EUCLIDEAN)
+ this(clusterCenters: Array[Vector], DistanceMeasure.EUCLIDEAN, 0.0, -1)
/**
* A Java-friendly constructor that takes an Iterable of Vectors.
@@ -187,7 +188,8 @@ object KMeansModel extends Loader[KMeansModel] {
val spark = SparkSession.builder().sparkContext(sc).getOrCreate()
val metadata = compact(render(
("class" -> thisClassName) ~ ("version" -> thisFormatVersion)
- ~ ("k" -> model.k) ~ ("distanceMeasure" -> model.distanceMeasure)))
+ ~ ("k" -> model.k) ~ ("distanceMeasure" -> model.distanceMeasure)
+ ~ ("trainingCost" -> model.trainingCost)))
sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path))
val dataRDD = sc.parallelize(model.clusterCentersWithNorm.zipWithIndex).map { case (p, id) =>
Cluster(id, p.vector)
@@ -207,7 +209,8 @@ object KMeansModel extends Loader[KMeansModel] {
val localCentroids = centroids.rdd.map(Cluster.apply).collect()
assert(k == localCentroids.length)
val distanceMeasure = (metadata \ "distanceMeasure").extract[String]
- new KMeansModel(localCentroids.sortBy(_.id).map(_.point), distanceMeasure)
+ val trainingCost = (metadata \ "trainingCost").extract[Double]
+ new KMeansModel(localCentroids.sortBy(_.id).map(_.point), distanceMeasure, trainingCost, -1)
}
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/cc4d64bb/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
index 829c90f..9b0b526 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
@@ -131,6 +131,8 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
assert(summary.predictions.columns.contains(c))
}
assert(summary.cluster.columns === Array(predictionColName))
+ assert(summary.trainingCost < 0.1)
+ assert(model.computeCost(dataset) == summary.trainingCost)
val clusterSizes = summary.clusterSizes
assert(clusterSizes.length === k)
assert(clusterSizes.sum === numRows)
http://git-wip-us.apache.org/repos/asf/spark/blob/cc4d64bb/python/pyspark/ml/clustering.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 2f06600..8a58d83 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -16,6 +16,7 @@
#
import sys
+import warnings
from pyspark import since, keyword_only
from pyspark.ml.util import *
@@ -303,7 +304,15 @@ class KMeansSummary(ClusteringSummary):
.. versionadded:: 2.1.0
"""
- pass
+
+ @property
+ @since("2.4.0")
+ def trainingCost(self):
+ """
+ K-means cost (sum of squared distances to the nearest centroid for all points in the
+ training dataset). This is equivalent to sklearn's inertia.
+ """
+ return self._call_java("trainingCost")
class KMeansModel(JavaModel, JavaMLWritable, JavaMLReadable):
@@ -323,7 +332,13 @@ class KMeansModel(JavaModel, JavaMLWritable, JavaMLReadable):
"""
Return the K-means cost (sum of squared distances of points to their nearest center)
for this model on the given data.
+
+ ..note:: Deprecated in 2.4.0. It will be removed in 3.0.0. Use ClusteringEvaluator instead.
+ You can also get the cost on the training dataset in the summary.
"""
+ warnings.warn("Deprecated in 2.4.0. It will be removed in 3.0.0. Use ClusteringEvaluator "
+ "instead. You can also get the cost on the training dataset in the summary.",
+ DeprecationWarning)
return self._call_java("computeCost", dataset)
@property
@@ -379,6 +394,8 @@ class KMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPredictionCol
2
>>> summary.clusterSizes
[2, 2]
+ >>> summary.trainingCost
+ 2.000...
>>> kmeans_path = temp_path + "/kmeans"
>>> kmeans.save(kmeans_path)
>>> kmeans2 = KMeans.load(kmeans_path)
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org