You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by yl...@apache.org on 2017/01/19 11:46:44 UTC
spark git commit: [SPARK-14272][ML] Add Loglikelihood in
GaussianMixtureSummary
Repository: spark
Updated Branches:
refs/heads/master 2e6256002 -> 8ccca9170
[SPARK-14272][ML] Add Loglikelihood in GaussianMixtureSummary
## What changes were proposed in this pull request?
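Add a `logLikelihood` field to `GaussianMixtureSummary` (accessible via the fitted model's `summary`), exposing the total log-likelihood of the model on the training data in both the Scala and Python APIs.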
## How was this patch tested?
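Added an assertion to `GaussianMixtureSuite` that checks `summary.logLikelihood` against a reference value (-46.89499), and added a Python doctest for `summary.logLikelihood`.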
Author: Zheng RuiFeng <ru...@foxmail.com>
Author: Ruifeng Zheng <ru...@foxmail.com>
Closes #12064 from zhengruifeng/gmm_metric.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8ccca917
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8ccca917
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8ccca917
Branch: refs/heads/master
Commit: 8ccca9170f983f74a7482f67206dae070c77b419
Parents: 2e62560
Author: Zheng RuiFeng <ru...@foxmail.com>
Authored: Thu Jan 19 03:46:37 2017 -0800
Committer: Yanbo Liang <yb...@gmail.com>
Committed: Thu Jan 19 03:46:37 2017 -0800
----------------------------------------------------------------------
.../org/apache/spark/ml/clustering/GaussianMixture.scala | 7 +++++--
.../main/scala/org/apache/spark/mllib/util/MLUtils.scala | 2 +-
.../apache/spark/ml/clustering/GaussianMixtureSuite.scala | 7 +++++++
project/MimaExcludes.scala | 5 ++++-
python/pyspark/ml/clustering.py | 10 ++++++++++
5 files changed, 27 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/8ccca917/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index a7bb413..db5fff5 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -416,7 +416,7 @@ class GaussianMixture @Since("2.0.0") (
val model = copyValues(new GaussianMixtureModel(uid, weights, gaussianDists)).setParent(this)
val summary = new GaussianMixtureSummary(model.transform(dataset),
- $(predictionCol), $(probabilityCol), $(featuresCol), $(k))
+ $(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood)
model.setSummary(Some(summary))
instr.logSuccess(model)
model
@@ -674,6 +674,7 @@ private class ExpectationAggregator(
* in `predictions`.
* @param featuresCol Name for column of features in `predictions`.
* @param k Number of clusters.
+ * @param logLikelihood Total log-likelihood for this model on the given data.
*/
@Since("2.0.0")
@Experimental
@@ -682,7 +683,9 @@ class GaussianMixtureSummary private[clustering] (
predictionCol: String,
@Since("2.0.0") val probabilityCol: String,
featuresCol: String,
- k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k) {
+ k: Int,
+ @Since("2.2.0") val logLikelihood: Double)
+ extends ClusteringSummary(predictions, predictionCol, featuresCol, k) {
/**
* Probability of each cluster.
http://git-wip-us.apache.org/repos/asf/spark/blob/8ccca917/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index de66c7c..95f904d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -34,7 +34,7 @@ import org.apache.spark.storage.StorageLevel
import org.apache.spark.util.random.BernoulliCellSampler
/**
- * Helper methods to load, save and pre-process data used in ML Lib.
+ * Helper methods to load, save and pre-process data used in MLLib.
*/
@Since("0.8.0")
object MLUtils extends Logging {
http://git-wip-us.apache.org/repos/asf/spark/blob/8ccca917/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index a362aee..e54eb27 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -207,6 +207,10 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
[,1] [,2]
[1,] 0.2961543 0.160783
[2,] 0.1607830 1.008878
+
+ model$loglik
+
+ [1] -46.89499
*/
val weights = Array(0.5333333, 0.4666667)
val means = Array(Vectors.dense(10.363673, 9.897081), Vectors.dense(0.11731091, -0.06192351))
@@ -219,6 +223,9 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
val expected = new GaussianMixtureModel("dummy", weights, gaussians)
val actual = new GaussianMixture().setK(2).setSeed(seed).fit(rDataset)
modelEquals(expected, actual)
+
+ val llk = actual.summary.logLikelihood
+ assert(llk ~== -46.89499 absTol 1E-5)
}
test("upper triangular matrix unpacking") {
http://git-wip-us.apache.org/repos/asf/spark/blob/8ccca917/project/MimaExcludes.scala
----------------------------------------------------------------------
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index e0ee00e..bf62821 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -46,7 +46,10 @@ object MimaExcludes {
ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.streaming.scheduler.StreamingListener.onStreamingStarted"),
// [SPARK-19148][SQL] do not expose the external table concept in Catalog
- ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.createTable")
+ ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.createTable"),
+
+ // [SPARK-14272][ML] Add logLikelihood in GaussianMixtureSummary
+ ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.GaussianMixtureSummary.this")
)
// Exclude rules for 2.1.x
http://git-wip-us.apache.org/repos/asf/spark/blob/8ccca917/python/pyspark/ml/clustering.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 25f97f5..c6c1a00 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -175,6 +175,8 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
3
>>> summary.clusterSizes
[2, 2, 2]
+ >>> summary.logLikelihood
+ 8.14636...
>>> weights = model.weights
>>> len(weights)
3
@@ -281,6 +283,14 @@ class GaussianMixtureSummary(ClusteringSummary):
"""
return self._call_java("probability")
+ @property
+ @since("2.2.0")
+ def logLikelihood(self):
+ """
+ Total log-likelihood for this model on the given data.
+ """
+ return self._call_java("logLikelihood")
+
class KMeansSummary(ClusteringSummary):
"""
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org