You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ho...@apache.org on 2018/07/13 18:24:32 UTC
spark git commit: [SPARK-23528][ML] Add numIter to ClusteringSummary
Repository: spark
Updated Branches:
refs/heads/master 3bcb1b481 -> 3b6005b8a
[SPARK-23528][ML] Add numIter to ClusteringSummary
## What changes were proposed in this pull request?
Added the number of iterations in `ClusteringSummary`. This is helpful information for evaluating how to modify the parameters in order to get a better model.
## How was this patch tested?
Modified existing unit tests.
Author: Marco Gaido <ma...@gmail.com>
Closes #20701 from mgaido91/SPARK-23528.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3b6005b8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3b6005b8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3b6005b8
Branch: refs/heads/master
Commit: 3b6005b8a276e646c0785d924f139a48238a7c87
Parents: 3bcb1b4
Author: Marco Gaido <ma...@gmail.com>
Authored: Fri Jul 13 11:23:42 2018 -0700
Committer: Holden Karau <ho...@pigscanfly.ca>
Committed: Fri Jul 13 11:23:42 2018 -0700
----------------------------------------------------------------------
.../org/apache/spark/ml/clustering/BisectingKMeans.scala | 6 ++++--
.../org/apache/spark/ml/clustering/ClusteringSummary.scala | 6 ++++--
.../org/apache/spark/ml/clustering/GaussianMixture.scala | 8 +++++---
.../main/scala/org/apache/spark/ml/clustering/KMeans.scala | 6 ++++--
.../scala/org/apache/spark/mllib/clustering/KMeans.scala | 2 +-
.../org/apache/spark/mllib/clustering/KMeansModel.scala | 9 +++++++--
.../apache/spark/ml/clustering/BisectingKMeansSuite.scala | 1 +
.../apache/spark/ml/clustering/GaussianMixtureSuite.scala | 1 +
.../scala/org/apache/spark/ml/clustering/KMeansSuite.scala | 1 +
project/MimaExcludes.scala | 5 +++++
10 files changed, 33 insertions(+), 12 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/3b6005b8/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
index 9c96145..de56447 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
@@ -274,7 +274,7 @@ class BisectingKMeans @Since("2.0.0") (
val parentModel = bkm.run(rdd)
val model = copyValues(new BisectingKMeansModel(uid, parentModel).setParent(this))
val summary = new BisectingKMeansSummary(
- model.transform(dataset), $(predictionCol), $(featuresCol), $(k))
+ model.transform(dataset), $(predictionCol), $(featuresCol), $(k), $(maxIter))
model.setSummary(Some(summary))
instr.logNamedValue("clusterSizes", summary.clusterSizes)
instr.logSuccess(model)
@@ -304,6 +304,7 @@ object BisectingKMeans extends DefaultParamsReadable[BisectingKMeans] {
* @param predictionCol Name for column of predicted clusters in `predictions`.
* @param featuresCol Name for column of features in `predictions`.
* @param k Number of clusters.
+ * @param numIter Number of iterations.
*/
@Since("2.1.0")
@Experimental
@@ -311,4 +312,5 @@ class BisectingKMeansSummary private[clustering] (
predictions: DataFrame,
predictionCol: String,
featuresCol: String,
- k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k)
+ k: Int,
+ numIter: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k, numIter)
http://git-wip-us.apache.org/repos/asf/spark/blob/3b6005b8/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala
index 44e832b..7da4c43 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala
@@ -17,7 +17,7 @@
package org.apache.spark.ml.clustering
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.sql.{DataFrame, Row}
/**
@@ -28,13 +28,15 @@ import org.apache.spark.sql.{DataFrame, Row}
* @param predictionCol Name for column of predicted clusters in `predictions`.
* @param featuresCol Name for column of features in `predictions`.
* @param k Number of clusters.
+ * @param numIter Number of iterations.
*/
@Experimental
class ClusteringSummary private[clustering] (
@transient val predictions: DataFrame,
val predictionCol: String,
val featuresCol: String,
- val k: Int) extends Serializable {
+ val k: Int,
+ @Since("2.4.0") val numIter: Int) extends Serializable {
/**
* Cluster centers of the transformed data.
http://git-wip-us.apache.org/repos/asf/spark/blob/3b6005b8/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 64ecc1e..dae64ba 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -423,7 +423,7 @@ class GaussianMixture @Since("2.0.0") (
val model = copyValues(new GaussianMixtureModel(uid, weights, gaussianDists)).setParent(this)
val summary = new GaussianMixtureSummary(model.transform(dataset),
- $(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood)
+ $(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood, iter)
model.setSummary(Some(summary))
instr.logNamedValue("logLikelihood", logLikelihood)
instr.logNamedValue("clusterSizes", summary.clusterSizes)
@@ -687,6 +687,7 @@ private class ExpectationAggregator(
* @param featuresCol Name for column of features in `predictions`.
* @param k Number of clusters.
* @param logLikelihood Total log-likelihood for this model on the given data.
+ * @param numIter Number of iterations.
*/
@Since("2.0.0")
@Experimental
@@ -696,8 +697,9 @@ class GaussianMixtureSummary private[clustering] (
@Since("2.0.0") val probabilityCol: String,
featuresCol: String,
k: Int,
- @Since("2.2.0") val logLikelihood: Double)
- extends ClusteringSummary(predictions, predictionCol, featuresCol, k) {
+ @Since("2.2.0") val logLikelihood: Double,
+ numIter: Int)
+ extends ClusteringSummary(predictions, predictionCol, featuresCol, k, numIter) {
/**
* Probability of each cluster.
http://git-wip-us.apache.org/repos/asf/spark/blob/3b6005b8/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
index 1704412..f40037a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -356,7 +356,7 @@ class KMeans @Since("1.5.0") (
val parentModel = algo.run(instances, Option(instr))
val model = copyValues(new KMeansModel(uid, parentModel).setParent(this))
val summary = new KMeansSummary(
- model.transform(dataset), $(predictionCol), $(featuresCol), $(k))
+ model.transform(dataset), $(predictionCol), $(featuresCol), $(k), parentModel.numIter)
model.setSummary(Some(summary))
instr.logNamedValue("clusterSizes", summary.clusterSizes)
@@ -388,6 +388,7 @@ object KMeans extends DefaultParamsReadable[KMeans] {
* @param predictionCol Name for column of predicted clusters in `predictions`.
* @param featuresCol Name for column of features in `predictions`.
* @param k Number of clusters.
+ * @param numIter Number of iterations.
*/
@Since("2.0.0")
@Experimental
@@ -395,4 +396,5 @@ class KMeansSummary private[clustering] (
predictions: DataFrame,
predictionCol: String,
featuresCol: String,
- k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k)
+ k: Int,
+ numIter: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k, numIter)
http://git-wip-us.apache.org/repos/asf/spark/blob/3b6005b8/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index b5b1be3..37ae8b1 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -348,7 +348,7 @@ class KMeans private (
logInfo(s"The cost is $cost.")
- new KMeansModel(centers.map(_.vector), distanceMeasure)
+ new KMeansModel(centers.map(_.vector), distanceMeasure, iteration)
}
/**
http://git-wip-us.apache.org/repos/asf/spark/blob/3b6005b8/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
index a78c21e..e3a88b4 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
@@ -36,8 +36,9 @@ import org.apache.spark.sql.{Row, SparkSession}
* A clustering model for K-means. Each point belongs to the cluster with the closest center.
*/
@Since("0.8.0")
-class KMeansModel @Since("2.4.0") (@Since("1.0.0") val clusterCenters: Array[Vector],
- @Since("2.4.0") val distanceMeasure: String)
+class KMeansModel (@Since("1.0.0") val clusterCenters: Array[Vector],
+ @Since("2.4.0") val distanceMeasure: String,
+ private[spark] val numIter: Int)
extends Saveable with Serializable with PMMLExportable {
private val distanceMeasureInstance: DistanceMeasure =
@@ -46,6 +47,10 @@ class KMeansModel @Since("2.4.0") (@Since("1.0.0") val clusterCenters: Array[Vec
private val clusterCentersWithNorm =
if (clusterCenters == null) null else clusterCenters.map(new VectorWithNorm(_))
+ @Since("2.4.0")
+ private[spark] def this(clusterCenters: Array[Vector], distanceMeasure: String) =
+ this(clusterCenters: Array[Vector], distanceMeasure, -1)
+
@Since("1.1.0")
def this(clusterCenters: Array[Vector]) =
this(clusterCenters: Array[Vector], DistanceMeasure.EUCLIDEAN)
http://git-wip-us.apache.org/repos/asf/spark/blob/3b6005b8/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
index 81842af..1b7780e 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
@@ -133,6 +133,7 @@ class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest {
assert(clusterSizes.length === k)
assert(clusterSizes.sum === numRows)
assert(clusterSizes.forall(_ >= 0))
+ assert(summary.numIter == 20)
model.setSummary(None)
assert(!model.hasSummary)
http://git-wip-us.apache.org/repos/asf/spark/blob/3b6005b8/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index 0b91f50..13bed9d 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -145,6 +145,7 @@ class GaussianMixtureSuite extends MLTest with DefaultReadWriteTest {
assert(clusterSizes.length === k)
assert(clusterSizes.sum === numRows)
assert(clusterSizes.forall(_ >= 0))
+ assert(summary.numIter == 2)
model.setSummary(None)
assert(!model.hasSummary)
http://git-wip-us.apache.org/repos/asf/spark/blob/3b6005b8/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
index 2569e7a..829c90f 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
@@ -135,6 +135,7 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
assert(clusterSizes.length === k)
assert(clusterSizes.sum === numRows)
assert(clusterSizes.forall(_ >= 0))
+ assert(summary.numIter == 1)
model.setSummary(None)
assert(!model.hasSummary)
http://git-wip-us.apache.org/repos/asf/spark/blob/3b6005b8/project/MimaExcludes.scala
----------------------------------------------------------------------
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index eeb097e..8f96bb0 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -36,6 +36,11 @@ object MimaExcludes {
// Exclude rules for 2.4.x
lazy val v24excludes = v23excludes ++ Seq(
+ // [SPARK-23528] Add numIter to ClusteringSummary
+ ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.ClusteringSummary.this"),
+ ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.KMeansSummary.this"),
+ ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.BisectingKMeansSummary.this"),
+
// [SPARK-6237][NETWORK] Network-layer changes to allow stream upload
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.network.netty.NettyBlockRpcServer.receive"),
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org