You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by jk...@apache.org on 2015/07/17 23:00:57 UTC
spark git commit: [SPARK-9090] [ML] Fix definition of residual in LinearRegressionSummary, EnsembleTestHelper, and SquaredError

Repository: spark
Updated Branches:
  refs/heads/master ad0954f6d -> 6da106969


[SPARK-9090] [ML] Fix definition of residual in LinearRegressionSummary, EnsembleTestHelper, and SquaredError

Make the definition of residuals in Spark consistent with the literature. We have been using `prediction - label` for residuals, but the literature usually defines `residual = label - prediction`.

Author: Feynman Liang <fl...@databricks.com>

Closes #7435 from feynmanliang/SPARK-9090-Fix-LinearRegressionSummary-Residuals and squashes the following commits:

f4b39d8 [Feynman Liang] Fix doc
bc12a92 [Feynman Liang] Tweak EnsembleTestHelper and SquaredError residuals
63f0d60 [Feynman Liang] Fix definition of residual


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6da10696
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6da10696
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6da10696

Branch: refs/heads/master
Commit: 6da1069696186572c66cbd83947c1a1dbd2bc827
Parents: ad0954f
Author: Feynman Liang <fl...@databricks.com>
Authored: Fri Jul 17 14:00:53 2015 -0700
Committer: Joseph K. Bradley <jo...@databricks.com>
Committed: Fri Jul 17 14:00:53 2015 -0700

----------------------------------------------------------------------
 .../scala/org/apache/spark/ml/regression/LinearRegression.scala  | 4 ++--
 .../scala/org/apache/spark/mllib/tree/loss/SquaredError.scala    | 4 ++--
 .../org/apache/spark/ml/regression/LinearRegressionSuite.scala   | 4 ++--
 .../scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala   | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/6da10696/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 8fc9860..89718e0 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -355,9 +355,9 @@ class LinearRegressionSummary private[regression] (
    */
   val r2: Double = metrics.r2
 
-  /** Residuals (predicted value - label value) */
+  /** Residuals (label - predicted value) */
   @transient lazy val residuals: DataFrame = {
-    val t = udf { (pred: Double, label: Double) => pred - label}
+    val t = udf { (pred: Double, label: Double) => label - pred }
     predictions.select(t(col(predictionCol), col(labelCol)).as("residuals"))
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/6da10696/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala
index a5582d3..011a5d5 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala
@@ -42,11 +42,11 @@ object SquaredError extends Loss {
    * @return Loss gradient
    */
   override def gradient(prediction: Double, label: Double): Double = {
-    2.0 * (prediction - label)
+    - 2.0 * (label - prediction)
   }
 
   override private[mllib] def computeError(prediction: Double, label: Double): Double = {
-    val err = prediction - label
+    val err = label - prediction
     err * err
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/6da10696/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index cf120cf..374002c 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -302,7 +302,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
       .map { case Row(features: DenseVector, label: Double) =>
       val prediction =
         features(0) * model.weights(0) + features(1) * model.weights(1) + model.intercept
-      prediction - label
+      label - prediction
     }
       .zip(model.summary.residuals.map(_.getDouble(0)))
       .collect()
@@ -314,7 +314,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        Use the following R code to generate model training results.
 
        predictions <- predict(fit, newx=features)
-       residuals <- predictions - label
+       residuals <- label - predictions
        > mean(residuals^2) # MSE
        [1] 0.009720325
        > mean(abs(residuals)) # MAD

http://git-wip-us.apache.org/repos/asf/spark/blob/6da10696/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala
index 8972c22..334bf37 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala
@@ -70,7 +70,7 @@ object EnsembleTestHelper {
       metricName: String = "mse") {
     val predictions = input.map(x => model.predict(x.features))
     val errors = predictions.zip(input.map(_.label)).map { case (prediction, label) =>
-      prediction - label
+      label - prediction
     }
     val metric = metricName match {
       case "mse" =>


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org