You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by jk...@apache.org on 2015/06/20 22:02:06 UTC
spark git commit: [SPARK-8468] [ML] Take the negative of some metrics
in RegressionEvaluator to get correct cross validation
Repository: spark
Updated Branches:
refs/heads/master 1b6fe9b1a -> 0b8995168
[SPARK-8468] [ML] Take the negative of some metrics in RegressionEvaluator to get correct cross validation
JIRA: https://issues.apache.org/jira/browse/SPARK-8468
Author: Liang-Chi Hsieh <vi...@gmail.com>
Closes #6905 from viirya/cv_min and squashes the following commits:
930d3db [Liang-Chi Hsieh] Fix python unit test and add document.
d632135 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into cv_min
16e3b2c [Liang-Chi Hsieh] Take the negative instead of reciprocal.
c3dd8d9 [Liang-Chi Hsieh] For comments.
b5f52c1 [Liang-Chi Hsieh] Add param to CrossValidator for choosing whether to maximize evaluation value.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0b899516
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0b899516
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0b899516
Branch: refs/heads/master
Commit: 0b8995168f02bb55afb0a5b7dbdb941c3c89cb4c
Parents: 1b6fe9b
Author: Liang-Chi Hsieh <vi...@gmail.com>
Authored: Sat Jun 20 13:01:59 2015 -0700
Committer: Joseph K. Bradley <jo...@databricks.com>
Committed: Sat Jun 20 13:01:59 2015 -0700
----------------------------------------------------------------------
.../ml/evaluation/RegressionEvaluator.scala | 10 ++++--
.../org/apache/spark/ml/param/params.scala | 2 +-
.../evaluation/RegressionEvaluatorSuite.scala | 4 +--
.../spark/ml/tuning/CrossValidatorSuite.scala | 35 ++++++++++++++++++--
python/pyspark/ml/evaluation.py | 8 +++--
5 files changed, 48 insertions(+), 11 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/0b899516/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
index 8670e96..01c000b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
@@ -37,6 +37,10 @@ final class RegressionEvaluator(override val uid: String)
/**
* param for metric name in evaluation (supports `"rmse"` (default), `"mse"`, `"r2"`, and `"mae"`)
+ *
+ * Because we will maximize evaluation value (ref: `CrossValidator`),
+ * when we evaluate a metric that is needed to minimize (e.g., `"rmse"`, `"mse"`, `"mae"`),
+ * we take and output the negative of this metric.
* @group param
*/
val metricName: Param[String] = {
@@ -70,13 +74,13 @@ final class RegressionEvaluator(override val uid: String)
val metrics = new RegressionMetrics(predictionAndLabels)
val metric = $(metricName) match {
case "rmse" =>
- metrics.rootMeanSquaredError
+ -metrics.rootMeanSquaredError
case "mse" =>
- metrics.meanSquaredError
+ -metrics.meanSquaredError
case "r2" =>
metrics.r2
case "mae" =>
- metrics.meanAbsoluteError
+ -metrics.meanAbsoluteError
}
metric
}
http://git-wip-us.apache.org/repos/asf/spark/blob/0b899516/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index 15ebad8..50c0d85 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -297,7 +297,7 @@ class DoubleArrayParam(parent: Params, name: String, doc: String, isValid: Array
/**
* :: Experimental ::
- * A param amd its value.
+ * A param and its value.
*/
@Experimental
case class ParamPair[T](param: Param[T], value: T) {
http://git-wip-us.apache.org/repos/asf/spark/blob/0b899516/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
index aa722da..5b20378 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
@@ -63,7 +63,7 @@ class RegressionEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext
// default = rmse
val evaluator = new RegressionEvaluator()
- assert(evaluator.evaluate(predictions) ~== 0.1019382 absTol 0.001)
+ assert(evaluator.evaluate(predictions) ~== -0.1019382 absTol 0.001)
// r2 score
evaluator.setMetricName("r2")
@@ -71,6 +71,6 @@ class RegressionEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext
// mae
evaluator.setMetricName("mae")
- assert(evaluator.evaluate(predictions) ~== 0.08036075 absTol 0.001)
+ assert(evaluator.evaluate(predictions) ~== -0.08036075 absTol 0.001)
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/0b899516/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
index 36af4b3..db64511 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
@@ -20,11 +20,12 @@ package org.apache.spark.ml.tuning
import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.classification.LogisticRegression
-import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator}
+import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator, RegressionEvaluator}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasInputCol
+import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.mllib.classification.LogisticRegressionSuite.generateLogisticInput
-import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.types.StructType
@@ -58,6 +59,36 @@ class CrossValidatorSuite extends SparkFunSuite with MLlibTestSparkContext {
assert(cvModel.avgMetrics.length === lrParamMaps.length)
}
+ test("cross validation with linear regression") {
+ val dataset = sqlContext.createDataFrame(
+ sc.parallelize(LinearDataGenerator.generateLinearInput(
+ 6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 100, 42, 0.1), 2))
+
+ val trainer = new LinearRegression
+ val lrParamMaps = new ParamGridBuilder()
+ .addGrid(trainer.regParam, Array(1000.0, 0.001))
+ .addGrid(trainer.maxIter, Array(0, 10))
+ .build()
+ val eval = new RegressionEvaluator()
+ val cv = new CrossValidator()
+ .setEstimator(trainer)
+ .setEstimatorParamMaps(lrParamMaps)
+ .setEvaluator(eval)
+ .setNumFolds(3)
+ val cvModel = cv.fit(dataset)
+ val parent = cvModel.bestModel.parent.asInstanceOf[LinearRegression]
+ assert(parent.getRegParam === 0.001)
+ assert(parent.getMaxIter === 10)
+ assert(cvModel.avgMetrics.length === lrParamMaps.length)
+
+ eval.setMetricName("r2")
+ val cvModel2 = cv.fit(dataset)
+ val parent2 = cvModel2.bestModel.parent.asInstanceOf[LinearRegression]
+ assert(parent2.getRegParam === 0.001)
+ assert(parent2.getMaxIter === 10)
+ assert(cvModel2.avgMetrics.length === lrParamMaps.length)
+ }
+
test("validateParams should check estimatorParamMaps") {
import CrossValidatorSuite._
http://git-wip-us.apache.org/repos/asf/spark/blob/0b899516/python/pyspark/ml/evaluation.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
index d8ddb78..595593a 100644
--- a/python/pyspark/ml/evaluation.py
+++ b/python/pyspark/ml/evaluation.py
@@ -160,13 +160,15 @@ class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol):
...
>>> evaluator = RegressionEvaluator(predictionCol="raw")
>>> evaluator.evaluate(dataset)
- 2.842...
+ -2.842...
>>> evaluator.evaluate(dataset, {evaluator.metricName: "r2"})
0.993...
>>> evaluator.evaluate(dataset, {evaluator.metricName: "mae"})
- 2.649...
+ -2.649...
"""
- # a placeholder to make it appear in the generated doc
+ # Because we will maximize evaluation value (ref: `CrossValidator`),
+ # when we evaluate a metric that is needed to minimize (e.g., `"rmse"`, `"mse"`, `"mae"`),
+ # we take and output the negative of this metric.
metricName = Param(Params._dummy(), "metricName",
"metric name in evaluation (mse|rmse|r2|mae)")
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org