Posted to commits@spark.apache.org by jk...@apache.org on 2016/10/26 18:48:57 UTC
spark git commit: [MINOR][ML] Refactor clustering summary.
Repository: spark
Updated Branches:
refs/heads/master 7d10631c1 -> ea3605e82
[MINOR][ML] Refactor clustering summary.
## What changes were proposed in this pull request?
Abstract ```ClusteringSummary``` out of ```KMeansSummary```, ```GaussianMixtureSummary``` and ```BisectingKMeansSummary```, and eliminate the duplicated code.
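Not part of this commit, but for context, a minimal usage sketch assuming an existing `SparkSession` named `spark` and the pre-existing `KMeansModel.summary` accessor (not shown in this diff); the refactor does not change the user-facing API, the fields below are simply inherited from the new base class:

```scala
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.linalg.Vectors

// Toy dataset with a "features" column, as expected by the clustering estimators.
val dataset = spark.createDataFrame(Seq(
  Tuple1(Vectors.dense(0.0, 0.0)),
  Tuple1(Vectors.dense(0.1, 0.1)),
  Tuple1(Vectors.dense(9.0, 9.0)),
  Tuple1(Vectors.dense(9.1, 9.1))
)).toDF("features")

val kmeansModel = new KMeans().setK(2).setSeed(1L).fit(dataset)
val summary = kmeansModel.summary

// These members are now inherited from ClusteringSummary rather than
// being redefined on KMeansSummary.
println(summary.k)                          // 2
println(summary.clusterSizes.mkString(",")) // e.g. "2,2"
summary.cluster.show()                      // DataFrame with only the prediction column
```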
## How was this patch tested?
Existing tests.
Author: Yanbo Liang <yb...@gmail.com>
Closes #15555 from yanboliang/clustering-summary.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ea3605e8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ea3605e8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ea3605e8
Branch: refs/heads/master
Commit: ea3605e82545031a00235ee0f449e1e2418674e8
Parents: 7d10631
Author: Yanbo Liang <yb...@gmail.com>
Authored: Wed Oct 26 11:48:54 2016 -0700
Committer: Joseph K. Bradley <jo...@databricks.com>
Committed: Wed Oct 26 11:48:54 2016 -0700
----------------------------------------------------------------------
.../spark/ml/clustering/BisectingKMeans.scala | 36 +++----------
.../spark/ml/clustering/ClusteringSummary.scala | 54 ++++++++++++++++++++
.../spark/ml/clustering/GaussianMixture.scala | 37 ++++----------
.../org/apache/spark/ml/clustering/KMeans.scala | 36 +++----------
4 files changed, 80 insertions(+), 83 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/ea3605e8/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
index ef2d918..2718dd9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
@@ -288,35 +288,15 @@ object BisectingKMeans extends DefaultParamsReadable[BisectingKMeans] {
* :: Experimental ::
* Summary of BisectingKMeans.
*
- * @param predictions [[DataFrame]] produced by [[BisectingKMeansModel.transform()]]
- * @param predictionCol Name for column of predicted clusters in `predictions`
- * @param featuresCol Name for column of features in `predictions`
- * @param k Number of clusters
+ * @param predictions [[DataFrame]] produced by [[BisectingKMeansModel.transform()]].
+ * @param predictionCol Name for column of predicted clusters in `predictions`.
+ * @param featuresCol Name for column of features in `predictions`.
+ * @param k Number of clusters.
*/
@Since("2.1.0")
@Experimental
class BisectingKMeansSummary private[clustering] (
- @Since("2.1.0") @transient val predictions: DataFrame,
- @Since("2.1.0") val predictionCol: String,
- @Since("2.1.0") val featuresCol: String,
- @Since("2.1.0") val k: Int) extends Serializable {
-
- /**
- * Cluster centers of the transformed data.
- */
- @Since("2.1.0")
- @transient lazy val cluster: DataFrame = predictions.select(predictionCol)
-
- /**
- * Size of (number of data points in) each cluster.
- */
- @Since("2.1.0")
- lazy val clusterSizes: Array[Long] = {
- val sizes = Array.fill[Long](k)(0)
- cluster.groupBy(predictionCol).count().select(predictionCol, "count").collect().foreach {
- case Row(cluster: Int, count: Long) => sizes(cluster) = count
- }
- sizes
- }
-
-}
+ predictions: DataFrame,
+ predictionCol: String,
+ featuresCol: String,
+ k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k)
http://git-wip-us.apache.org/repos/asf/spark/blob/ea3605e8/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala
new file mode 100644
index 0000000..8b5f525
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.clustering
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.sql.{DataFrame, Row}
+
+/**
+ * :: Experimental ::
+ * Summary of clustering algorithms.
+ *
+ * @param predictions [[DataFrame]] produced by model.transform().
+ * @param predictionCol Name for column of predicted clusters in `predictions`.
+ * @param featuresCol Name for column of features in `predictions`.
+ * @param k Number of clusters.
+ */
+@Experimental
+class ClusteringSummary private[clustering] (
+ @transient val predictions: DataFrame,
+ val predictionCol: String,
+ val featuresCol: String,
+ val k: Int) extends Serializable {
+
+ /**
+ * Cluster centers of the transformed data.
+ */
+ @transient lazy val cluster: DataFrame = predictions.select(predictionCol)
+
+ /**
+ * Size of (number of data points in) each cluster.
+ */
+ lazy val clusterSizes: Array[Long] = {
+ val sizes = Array.fill[Long](k)(0)
+ cluster.groupBy(predictionCol).count().select(predictionCol, "count").collect().foreach {
+ case Row(cluster: Int, count: Long) => sizes(cluster) = count
+ }
+ sizes
+ }
+}
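To make the shared aggregation concrete, here is an illustrative standalone sketch (not part of this commit) of the same groupBy/count logic that backs `clusterSizes`, assuming a `SparkSession` named `spark` and a toy `prediction` column:

```scala
import org.apache.spark.sql.Row

// Toy "predictions" with k = 3 clusters; note cluster 2 has no points.
val toyPredictions = spark.createDataFrame(Seq(
  (0, 1.0), (0, 2.0), (1, 3.0)
)).toDF("prediction", "feature")

val k = 3
val sizes = Array.fill[Long](k)(0)
toyPredictions.groupBy("prediction").count().select("prediction", "count").collect().foreach {
  case Row(cluster: Int, count: Long) => sizes(cluster) = count
}
// sizes == Array(2L, 1L, 0L): clusters with no assigned points keep size 0.
```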
http://git-wip-us.apache.org/repos/asf/spark/blob/ea3605e8/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 69f060a..e3cb92f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -356,42 +356,25 @@ object GaussianMixture extends DefaultParamsReadable[GaussianMixture] {
* :: Experimental ::
* Summary of GaussianMixture.
*
- * @param predictions [[DataFrame]] produced by [[GaussianMixtureModel.transform()]]
- * @param predictionCol Name for column of predicted clusters in `predictions`
- * @param probabilityCol Name for column of predicted probability of each cluster in `predictions`
- * @param featuresCol Name for column of features in `predictions`
- * @param k Number of clusters
+ * @param predictions [[DataFrame]] produced by [[GaussianMixtureModel.transform()]].
+ * @param predictionCol Name for column of predicted clusters in `predictions`.
+ * @param probabilityCol Name for column of predicted probability of each cluster
+ * in `predictions`.
+ * @param featuresCol Name for column of features in `predictions`.
+ * @param k Number of clusters.
*/
@Since("2.0.0")
@Experimental
class GaussianMixtureSummary private[clustering] (
- @Since("2.0.0") @transient val predictions: DataFrame,
- @Since("2.0.0") val predictionCol: String,
+ predictions: DataFrame,
+ predictionCol: String,
@Since("2.0.0") val probabilityCol: String,
- @Since("2.0.0") val featuresCol: String,
- @Since("2.0.0") val k: Int) extends Serializable {
-
- /**
- * Cluster centers of the transformed data.
- */
- @Since("2.0.0")
- @transient lazy val cluster: DataFrame = predictions.select(predictionCol)
+ featuresCol: String,
+ k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k) {
/**
* Probability of each cluster.
*/
@Since("2.0.0")
@transient lazy val probability: DataFrame = predictions.select(probabilityCol)
-
- /**
- * Size of (number of data points in) each cluster.
- */
- @Since("2.0.0")
- lazy val clusterSizes: Array[Long] = {
- val sizes = Array.fill[Long](k)(0)
- cluster.groupBy(predictionCol).count().select(predictionCol, "count").collect().foreach {
- case Row(cluster: Int, count: Long) => sizes(cluster) = count
- }
- sizes
- }
}
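Again purely illustrative (not part of this commit), reusing the toy `dataset` from the earlier sketch and assuming the pre-existing `GaussianMixtureModel.summary` accessor: `probability` stays specific to `GaussianMixtureSummary`, while `cluster` and `clusterSizes` now come from `ClusteringSummary`:

```scala
import org.apache.spark.ml.clustering.GaussianMixture

val gmModel = new GaussianMixture().setK(2).setSeed(1L).fit(dataset)
val gmSummary = gmModel.summary

gmSummary.probability.show()                   // per-cluster membership probabilities (GMM-specific)
println(gmSummary.clusterSizes.mkString(","))  // inherited from ClusteringSummary
```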
http://git-wip-us.apache.org/repos/asf/spark/blob/ea3605e8/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
index 0d2405b..05ed322 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -346,35 +346,15 @@ object KMeans extends DefaultParamsReadable[KMeans] {
* :: Experimental ::
* Summary of KMeans.
*
- * @param predictions [[DataFrame]] produced by [[KMeansModel.transform()]]
- * @param predictionCol Name for column of predicted clusters in `predictions`
- * @param featuresCol Name for column of features in `predictions`
- * @param k Number of clusters
+ * @param predictions [[DataFrame]] produced by [[KMeansModel.transform()]].
+ * @param predictionCol Name for column of predicted clusters in `predictions`.
+ * @param featuresCol Name for column of features in `predictions`.
+ * @param k Number of clusters.
*/
@Since("2.0.0")
@Experimental
class KMeansSummary private[clustering] (
- @Since("2.0.0") @transient val predictions: DataFrame,
- @Since("2.0.0") val predictionCol: String,
- @Since("2.0.0") val featuresCol: String,
- @Since("2.0.0") val k: Int) extends Serializable {
-
- /**
- * Cluster centers of the transformed data.
- */
- @Since("2.0.0")
- @transient lazy val cluster: DataFrame = predictions.select(predictionCol)
-
- /**
- * Size of (number of data points in) each cluster.
- */
- @Since("2.0.0")
- lazy val clusterSizes: Array[Long] = {
- val sizes = Array.fill[Long](k)(0)
- cluster.groupBy(predictionCol).count().select(predictionCol, "count").collect().foreach {
- case Row(cluster: Int, count: Long) => sizes(cluster) = count
- }
- sizes
- }
-
-}
+ predictions: DataFrame,
+ predictionCol: String,
+ featuresCol: String,
+ k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k)