You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by jk...@apache.org on 2017/04/05 00:05:00 UTC
spark git commit: [SPARK-20183][ML] Added outlierRatio arg to
MLTestingUtils.testOutliersWithSmallWeights
Repository: spark
Updated Branches:
refs/heads/master 295747e59 -> a59759e6c
[SPARK-20183][ML] Added outlierRatio arg to MLTestingUtils.testOutliersWithSmallWeights
## What changes were proposed in this pull request?
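This is a small piece of https://github.com/apache/spark/pull/16722, which will ultimately add sample weights to decision trees. It adds an outlierRatio argument to MLTestingUtils.testOutliersWithSmallWeights, replacing the previously hard-coded count of outlier copies per instance, to allow more flexibility in testing outliers, since linear models and trees behave differently.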
Note: The primary author when this is committed should be sethah since this is taken from his code.
## How was this patch tested?
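Existing tests. All updated callers pass outlierRatio = 3, which matches the previously hard-coded value, so behavior is unchanged.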
Author: Joseph K. Bradley <jo...@databricks.com>
Closes #17501 from jkbradley/SPARK-20183.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a59759e6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a59759e6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a59759e6
Branch: refs/heads/master
Commit: a59759e6c059617b2fc8102cbf41acc5d409b34a
Parents: 295747e
Author: Seth Hendrickson <sh...@us.ibm.com>
Authored: Tue Apr 4 17:04:41 2017 -0700
Committer: Joseph K. Bradley <jo...@databricks.com>
Committed: Tue Apr 4 17:04:41 2017 -0700
----------------------------------------------------------------------
.../org/apache/spark/ml/classification/LinearSVCSuite.scala | 2 +-
.../spark/ml/classification/LogisticRegressionSuite.scala | 2 +-
.../org/apache/spark/ml/classification/NaiveBayesSuite.scala | 2 +-
.../org/apache/spark/ml/regression/LinearRegressionSuite.scala | 3 ++-
.../test/scala/org/apache/spark/ml/util/MLTestingUtils.scala | 5 +++--
5 files changed, 8 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/a59759e6/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala
index 4c63a2a..c763a4c 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala
@@ -164,7 +164,7 @@ class LinearSVCSuite extends SparkFunSuite with MLlibTestSparkContext with Defau
MLTestingUtils.testArbitrarilyScaledWeights[LinearSVCModel, LinearSVC](
dataset.as[LabeledPoint], estimator, modelEquals)
MLTestingUtils.testOutliersWithSmallWeights[LinearSVCModel, LinearSVC](
- dataset.as[LabeledPoint], estimator, 2, modelEquals)
+ dataset.as[LabeledPoint], estimator, 2, modelEquals, outlierRatio = 3)
MLTestingUtils.testOversamplingVsWeighting[LinearSVCModel, LinearSVC](
dataset.as[LabeledPoint], estimator, modelEquals, 42L)
}
http://git-wip-us.apache.org/repos/asf/spark/blob/a59759e6/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index 1b64480..f0648d0 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -1874,7 +1874,7 @@ class LogisticRegressionSuite
MLTestingUtils.testArbitrarilyScaledWeights[LogisticRegressionModel, LogisticRegression](
dataset.as[LabeledPoint], estimator, modelEquals)
MLTestingUtils.testOutliersWithSmallWeights[LogisticRegressionModel, LogisticRegression](
- dataset.as[LabeledPoint], estimator, numClasses, modelEquals)
+ dataset.as[LabeledPoint], estimator, numClasses, modelEquals, outlierRatio = 3)
MLTestingUtils.testOversamplingVsWeighting[LogisticRegressionModel, LogisticRegression](
dataset.as[LabeledPoint], estimator, modelEquals, seed)
}
http://git-wip-us.apache.org/repos/asf/spark/blob/a59759e6/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
index 4d5d299..d41c5b5 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
@@ -178,7 +178,7 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext with Defa
MLTestingUtils.testArbitrarilyScaledWeights[NaiveBayesModel, NaiveBayes](
dataset.as[LabeledPoint], estimatorNoSmoothing, modelEquals)
MLTestingUtils.testOutliersWithSmallWeights[NaiveBayesModel, NaiveBayes](
- dataset.as[LabeledPoint], estimatorWithSmoothing, numClasses, modelEquals)
+ dataset.as[LabeledPoint], estimatorWithSmoothing, numClasses, modelEquals, outlierRatio = 3)
MLTestingUtils.testOversamplingVsWeighting[NaiveBayesModel, NaiveBayes](
dataset.as[LabeledPoint], estimatorWithSmoothing, modelEquals, seed)
}
http://git-wip-us.apache.org/repos/asf/spark/blob/a59759e6/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index 6a51e75..c6a267b 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -842,7 +842,8 @@ class LinearRegressionSuite
MLTestingUtils.testArbitrarilyScaledWeights[LinearRegressionModel, LinearRegression](
datasetWithStrongNoise.as[LabeledPoint], estimator, modelEquals)
MLTestingUtils.testOutliersWithSmallWeights[LinearRegressionModel, LinearRegression](
- datasetWithStrongNoise.as[LabeledPoint], estimator, numClasses, modelEquals)
+ datasetWithStrongNoise.as[LabeledPoint], estimator, numClasses, modelEquals,
+ outlierRatio = 3)
MLTestingUtils.testOversamplingVsWeighting[LinearRegressionModel, LinearRegression](
datasetWithStrongNoise.as[LabeledPoint], estimator, modelEquals, seed)
}
http://git-wip-us.apache.org/repos/asf/spark/blob/a59759e6/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala b/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala
index f1ed568..578f31c 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala
@@ -260,12 +260,13 @@ object MLTestingUtils extends SparkFunSuite {
data: Dataset[LabeledPoint],
estimator: E with HasWeightCol,
numClasses: Int,
- modelEquals: (M, M) => Unit): Unit = {
+ modelEquals: (M, M) => Unit,
+ outlierRatio: Int): Unit = {
import data.sqlContext.implicits._
val outlierDS = data.withColumn("weight", lit(1.0)).as[Instance].flatMap {
case Instance(l, w, f) =>
val outlierLabel = if (numClasses == 0) -l else numClasses - l - 1
- List.fill(3)(Instance(outlierLabel, 0.0001, f)) ++ List(Instance(l, w, f))
+ List.fill(outlierRatio)(Instance(outlierLabel, 0.0001, f)) ++ List(Instance(l, w, f))
}
val trueModel = estimator.set(estimator.weightCol, "").fit(data)
val outlierModel = estimator.set(estimator.weightCol, "weight").fit(outlierDS)
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org