You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by me...@apache.org on 2014/10/30 20:01:05 UTC
git commit: SPARK-4111 [MLlib] add regression metrics
Repository: spark
Updated Branches:
refs/heads/master c7ad08520 -> d9327192e
SPARK-4111 [MLlib] add regression metrics
Add RegressionMetrics.scala, which provides regression metrics for model evaluation, together with the corresponding test suite RegressionMetricsSuite.scala.
Author: Yanbo Liang <ya...@gmail.com>
Author: liangyanbo <li...@meituan.com>
Closes #2978 from yanbohappy/regression_metrics and squashes the following commits:
730d0a9 [Yanbo Liang] more clearly annotation
3d0bec1 [Yanbo Liang] rename and keep code style
a8ad3e3 [Yanbo Liang] simplify code for keeping style
d454909 [Yanbo Liang] rename parameter and function names, delete unused columns, add reference
2e56282 [liangyanbo] rename r2_score() and remove unused column
43bb12b [liangyanbo] add regression metrics
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d9327192
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d9327192
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d9327192
Branch: refs/heads/master
Commit: d9327192eee7f18e92381c59a42b0e1770f1f8f4
Parents: c7ad085
Author: Yanbo Liang <ya...@gmail.com>
Authored: Thu Oct 30 12:00:56 2014 -0700
Committer: Xiangrui Meng <me...@databricks.com>
Committed: Thu Oct 30 12:00:56 2014 -0700
----------------------------------------------------------------------
.../mllib/evaluation/RegressionMetrics.scala | 89 ++++++++++++++++++++
.../evaluation/RegressionMetricsSuite.scala | 52 ++++++++++++
2 files changed, 141 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/d9327192/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala
new file mode 100644
index 0000000..693117d
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.evaluation
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.rdd.RDD
+import org.apache.spark.Logging
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, MultivariateOnlineSummarizer}
+
+/**
+ * :: Experimental ::
+ * Computes standard evaluation metrics for a regression model.
+ *
+ * @param predictionAndObservations an RDD of (prediction, observation) pairs.
+ */
+@Experimental
+class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extends Logging {
+
+  /**
+   * Summary statistics of the observations (column 0) and of the errors (column 1).
+   */
+  private lazy val summary: MultivariateStatisticalSummary = {
+    val observationAndError = predictionAndObservations.map {
+      case (prediction, observation) => Vectors.dense(observation, observation - prediction)
+    }
+    observationAndError.aggregate(new MultivariateOnlineSummarizer())(
+      (acc, v) => acc.add(v),
+      (left, right) => left.merge(right)
+    )
+  }
+
+  /**
+   * Returns the explained variance regression score, computed as
+   * explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
+   * Reference: [[http://en.wikipedia.org/wiki/Explained_variation]]
+   */
+  def explainedVariance: Double = {
+    val errorVariance = summary.variance(1)
+    val observationVariance = summary.variance(0)
+    1 - errorVariance / observationVariance
+  }
+
+  /**
+   * Returns the mean absolute error, i.e. the average absolute (l1-norm) loss.
+   */
+  def meanAbsoluteError: Double = summary.normL1(1) / summary.count
+
+  /**
+   * Returns the mean squared error, i.e. the average squared (quadratic) loss.
+   * It is obtained by squaring the root mean squared error of the error column.
+   */
+  def meanSquaredError: Double = {
+    val errorNorm = summary.normL2(1)
+    val root = errorNorm / math.sqrt(summary.count)
+    root * root
+  }
+
+  /**
+   * Returns the root mean squared error: the l2-norm of the errors divided by
+   * the square root of the number of pairs.
+   */
+  def rootMeanSquaredError: Double = summary.normL2(1) / math.sqrt(summary.count)
+
+  /**
+   * Returns R^2^, the coefficient of determination (1 - SSerr / SStot).
+   * Reference: [[http://en.wikipedia.org/wiki/Coefficient_of_determination]]
+   */
+  def r2: Double = {
+    val residualSumOfSquares = math.pow(summary.normL2(1), 2)
+    val totalSumOfSquares = summary.variance(0) * (summary.count - 1)
+    1 - residualSumOfSquares / totalSumOfSquares
+  }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/d9327192/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala
new file mode 100644
index 0000000..5396d7b
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.evaluation
+
+import org.scalatest.FunSuite
+
+import org.apache.spark.mllib.util.LocalSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
+
+class RegressionMetricsSuite extends FunSuite with LocalSparkContext {
+
+  test("regression metrics") {
+    val pairs = Seq((2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0))
+    val metrics = new RegressionMetrics(sc.parallelize(pairs, 2))
+    val eps = 1E-5
+    assert(metrics.explainedVariance ~== 0.95717 absTol eps,
+      "explained variance regression score mismatch")
+    assert(metrics.meanAbsoluteError ~== 0.5 absTol eps, "mean absolute error mismatch")
+    assert(metrics.meanSquaredError ~== 0.375 absTol eps, "mean squared error mismatch")
+    assert(metrics.rootMeanSquaredError ~== 0.61237 absTol eps,
+      "root mean squared error mismatch")
+    assert(metrics.r2 ~== 0.94861 absTol eps, "r2 score mismatch")
+  }
+
+  test("regression metrics with complete fitting") {
+    val pairs = Seq((3.0, 3.0), (0.0, 0.0), (2.0, 2.0), (8.0, 8.0))
+    val metrics = new RegressionMetrics(sc.parallelize(pairs, 2))
+    val eps = 1E-5
+    assert(metrics.explainedVariance ~== 1.0 absTol eps,
+      "explained variance regression score mismatch")
+    assert(metrics.meanAbsoluteError ~== 0.0 absTol eps, "mean absolute error mismatch")
+    assert(metrics.meanSquaredError ~== 0.0 absTol eps, "mean squared error mismatch")
+    assert(metrics.rootMeanSquaredError ~== 0.0 absTol eps,
+      "root mean squared error mismatch")
+    assert(metrics.r2 ~== 1.0 absTol eps, "r2 score mismatch")
+  }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org