Posted to commits@spark.apache.org by db...@apache.org on 2015/10/30 10:59:06 UTC

spark git commit: [SPARK-11207] [ML] Add test cases for solver selection of LinearRegres…

Repository: spark
Updated Branches:
  refs/heads/master eb59b94c4 -> 86d65265f


[SPARK-11207] [ML] Add test cases for solver selection of LinearRegres…

…sion as a follow-up. This is the follow-up work for SPARK-10668.

* Fix minor style issues.
* Add a test case for checking whether the solver is selected properly (see the sketch below).

Author: Lewuathe <le...@me.com>
Author: lewuathe <le...@me.com>

Closes #9180 from Lewuathe/SPARK-11207.
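
For context, the selection rule these tests exercise can be summarized with a small sketch. This is illustrative only: chooseSolver and maxFeaturesForNormal are hypothetical names, not Spark internals, and the 4096-feature threshold and the L1 restriction are taken from the test comments in the diff below.

// Hypothetical sketch of the "auto" solver selection rule under test: the
// normal-equation solver handles small, L1-free problems; everything else
// falls back to l-bfgs.
def chooseSolver(solver: String, numFeatures: Int, elasticNetParam: Double): String = {
  val maxFeaturesForNormal = 4096 // assumed threshold, per the test comments below
  if (solver == "normal" ||
      (solver == "auto" && elasticNetParam == 0.0 && numFeatures <= maxFeaturesForNormal)) {
    "normal"
  } else {
    "l-bfgs"
  }
}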


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/86d65265
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/86d65265
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/86d65265

Branch: refs/heads/master
Commit: 86d65265fcab7edab88a7bdb10acba47da95bcb3
Parents: eb59b94
Author: Lewuathe <le...@me.com>
Authored: Fri Oct 30 02:59:05 2015 -0700
Committer: DB Tsai <db...@netflix.com>
Committed: Fri Oct 30 02:59:05 2015 -0700

----------------------------------------------------------------------
 .../spark/mllib/util/LinearDataGenerator.scala  |  54 +++++-
 .../ml/regression/LinearRegressionSuite.scala   | 172 +++++++++++--------
 2 files changed, 144 insertions(+), 82 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/86d65265/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
index d0ba454..6ff07ee 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
@@ -77,13 +77,11 @@ object LinearDataGenerator {
       nPoints: Int,
       seed: Int,
       eps: Double = 0.1): Seq[LabeledPoint] = {
-    generateLinearInput(intercept, weights,
-      Array.fill[Double](weights.length)(0.0),
-      Array.fill[Double](weights.length)(1.0 / 3.0),
-      nPoints, seed, eps)}
+    generateLinearInput(intercept, weights, Array.fill[Double](weights.length)(0.0),
+      Array.fill[Double](weights.length)(1.0 / 3.0), nPoints, seed, eps)
+  }
 
   /**
-   *
    * @param intercept Data intercept
    * @param weights  Weights to be applied.
    * @param xMean the mean of the generated features. Often, if the features are not properly
@@ -104,16 +102,49 @@ object LinearDataGenerator {
       nPoints: Int,
       seed: Int,
       eps: Double): Seq[LabeledPoint] = {
+    generateLinearInput(intercept, weights, xMean, xVariance, nPoints, seed, eps, 0.0)
+  }
+
 
+  /**
+   * @param intercept Data intercept
+   * @param weights  Weights to be applied.
+   * @param xMean the mean of the generated features. Often, if the features are not
+   *              properly standardized, a poorly implemented algorithm will have
+   *              difficulty converging.
+   * @param xVariance the variance of the generated features.
+   * @param nPoints Number of points in sample.
+   * @param seed Random seed
+   * @param eps Epsilon scaling factor.
+   * @param sparsity The ratio of zero elements. If it is 0.0, LabeledPoints with
+   *                 DenseVector are returned.
+   * @return Seq of input.
+   */
+  @Since("1.6.0")
+  def generateLinearInput(
+      intercept: Double,
+      weights: Array[Double],
+      xMean: Array[Double],
+      xVariance: Array[Double],
+      nPoints: Int,
+      seed: Int,
+      eps: Double,
+      sparsity: Double): Seq[LabeledPoint] = {
+    require(0.0 <= sparsity && sparsity <= 1.0)
     val rnd = new Random(seed)
     val x = Array.fill[Array[Double]](nPoints)(
       Array.fill[Double](weights.length)(rnd.nextDouble()))
 
+    val sparseRnd = new Random(seed)
     x.foreach { v =>
       var i = 0
       val len = v.length
       while (i < len) {
-        v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)
+        if (sparseRnd.nextDouble() < sparsity) {
+          v(i) = 0.0
+        } else {
+          v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)
+        }
         i += 1
       }
     }
@@ -121,7 +152,16 @@ object LinearDataGenerator {
     val y = x.map { xi =>
       blas.ddot(weights.length, xi, 1, weights, 1) + intercept + eps * rnd.nextGaussian()
     }
-    y.zip(x).map(p => LabeledPoint(p._1, Vectors.dense(p._2)))
+
+    y.zip(x).map { p =>
+      if (sparsity == 0.0) {
+        // Return LabeledPoints with DenseVector
+        LabeledPoint(p._1, Vectors.dense(p._2))
+      } else {
+        // Return LabeledPoints with SparseVector
+        LabeledPoint(p._1, Vectors.dense(p._2).toSparse)
+      }
+    }
   }
 
   /**
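
To illustrate the new overload added above, here is a minimal usage sketch. The argument values are arbitrary; because sparsity is non-zero, each returned LabeledPoint carries a SparseVector.

import org.apache.spark.mllib.util.LinearDataGenerator

// Generate 100 points where roughly 70% of feature entries are zeroed out.
// Because sparsity != 0.0, each LabeledPoint holds a SparseVector.
val points = LinearDataGenerator.generateLinearInput(
  intercept = 0.0,
  weights = Array(1.0, 0.5, -2.0),
  xMean = Array(0.0, 0.0, 0.0),
  xVariance = Array(1.0, 1.0, 1.0),
  nPoints = 100,
  seed = 42,
  eps = 0.1,
  sparsity = 0.7)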

http://git-wip-us.apache.org/repos/asf/spark/blob/86d65265/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index a6e0c72..a2a5c0b 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -32,8 +32,9 @@ import org.apache.spark.sql.{DataFrame, Row}
 class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   private val seed: Int = 42
-  @transient var dataset: DataFrame = _
-  @transient var datasetWithoutIntercept: DataFrame = _
+  @transient var datasetWithDenseFeature: DataFrame = _
+  @transient var datasetWithDenseFeatureWithoutIntercept: DataFrame = _
+  @transient var datasetWithSparseFeature: DataFrame = _
 
   /*
      In `LinearRegressionSuite`, we will make sure that the model trained by SparkML
@@ -49,16 +50,29 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
    */
   override def beforeAll(): Unit = {
     super.beforeAll()
-    dataset = sqlContext.createDataFrame(
+    datasetWithDenseFeature = sqlContext.createDataFrame(
       sc.parallelize(LinearDataGenerator.generateLinearInput(
-        6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, seed, 0.1), 2))
+        intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3),
+        xVariance = Array(0.7, 1.2), nPoints = 10000, seed, eps = 0.1), 2))
     /*
       datasetWithDenseFeatureWithoutIntercept is not needed for correctness testing but is
       useful for illustrating training a model without an intercept
      */
-    datasetWithoutIntercept = sqlContext.createDataFrame(
+    datasetWithDenseFeatureWithoutIntercept = sqlContext.createDataFrame(
       sc.parallelize(LinearDataGenerator.generateLinearInput(
-        0.0, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, seed, 0.1), 2))
+        intercept = 0.0, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3),
+        xVariance = Array(0.7, 1.2), nPoints = 10000, seed, eps = 0.1), 2))
+
+    val r = new Random(seed)
+    // When the feature size is larger than 4096, the l-bfgs optimizer is chosen
+    // as the solver of linear regression in the case of "auto" mode.
+    val featureSize = 4100
+    datasetWithSparseFeature = sqlContext.createDataFrame(
+      sc.parallelize(LinearDataGenerator.generateLinearInput(
+        intercept = 0.0, weights = Seq.fill(featureSize)(r.nextDouble).toArray,
+        xMean = Seq.fill(featureSize)(r.nextDouble).toArray,
+        xVariance = Seq.fill(featureSize)(r.nextDouble).toArray, nPoints = 200,
+        seed, eps = 0.1, sparsity = 0.7), 2))
   }
 
   test("params") {
@@ -77,19 +91,19 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     assert(lir.getFitIntercept)
     assert(lir.getStandardization)
     assert(lir.getSolver == "auto")
-    val model = lir.fit(dataset)
+    val model = lir.fit(datasetWithDenseFeature)
 
     // copied model must have the same parent.
     MLTestingUtils.checkCopy(model)
 
-    model.transform(dataset)
+    model.transform(datasetWithDenseFeature)
       .select("label", "prediction")
       .collect()
     assert(model.getFeaturesCol === "features")
     assert(model.getPredictionCol === "prediction")
     assert(model.intercept !== 0.0)
     assert(model.hasParent)
-    val numFeatures = dataset.select("features").first().getAs[Vector](0).size
+    val numFeatures = datasetWithDenseFeature.select("features").first().getAs[Vector](0).size
     assert(model.numFeatures === numFeatures)
   }
 
@@ -98,8 +112,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
       val trainer1 = new LinearRegression().setSolver(solver)
      // Without regularization, the result should be the same regardless of standardization
       val trainer2 = (new LinearRegression).setStandardization(false).setSolver(solver)
-      val model1 = trainer1.fit(dataset)
-      val model2 = trainer2.fit(dataset)
+      val model1 = trainer1.fit(datasetWithDenseFeature)
+      val model2 = trainer2.fit(datasetWithDenseFeature)
 
       /*
          Using the following R code to load the data and train the model using glmnet package.
@@ -124,7 +138,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
       assert(model2.intercept ~== interceptR relTol 1E-3)
       assert(model2.weights ~= weightsR relTol 1E-3)
 
-      model1.transform(dataset).select("features", "prediction").collect().foreach {
+      model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach {
         case Row(features: DenseVector, prediction1: Double) =>
           val prediction2 =
             features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept
@@ -139,10 +153,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
       // Without regularization the results should be the same
       val trainer2 = (new LinearRegression).setFitIntercept(false).setStandardization(false)
         .setSolver(solver)
-      val model1 = trainer1.fit(dataset)
-      val modelWithoutIntercept1 = trainer1.fit(datasetWithoutIntercept)
-      val model2 = trainer2.fit(dataset)
-      val modelWithoutIntercept2 = trainer2.fit(datasetWithoutIntercept)
+      val model1 = trainer1.fit(datasetWithDenseFeature)
+      val modelWithoutIntercept1 = trainer1.fit(datasetWithDenseFeatureWithoutIntercept)
+      val model2 = trainer2.fit(datasetWithDenseFeature)
+      val modelWithoutIntercept2 = trainer2.fit(datasetWithDenseFeatureWithoutIntercept)
 
       /*
          weights <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0,
@@ -186,19 +200,15 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
       val trainer2 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57)
         .setSolver(solver).setStandardization(false)
 
-      var model1: LinearRegressionModel = null
-      var model2: LinearRegressionModel = null
-
      // The normal solver does not support the L1-only regularization case.
       if (solver == "normal") {
         intercept[IllegalArgumentException] {
-            trainer1.fit(dataset)
-            trainer2.fit(dataset)
+            trainer1.fit(datasetWithDenseFeature)
+            trainer2.fit(datasetWithDenseFeature)
           }
       } else {
-        model1 = trainer1.fit(dataset)
-        model2 = trainer2.fit(dataset)
-
+        val model1 = trainer1.fit(datasetWithDenseFeature)
+        val model2 = trainer2.fit(datasetWithDenseFeature)
 
         /*
            weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57))
@@ -230,11 +240,12 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
         assert(model2.intercept ~== interceptR2 relTol 1E-3)
         assert(model2.weights ~= weightsR2 relTol 1E-3)
 
-        model1.transform(dataset).select("features", "prediction").collect().foreach {
-          case Row(features: DenseVector, prediction1: Double) =>
-            val prediction2 =
-              features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept
-            assert(prediction1 ~== prediction2 relTol 1E-5)
+        model1.transform(datasetWithDenseFeature).select("features", "prediction")
+          .collect().foreach {
+            case Row(features: DenseVector, prediction1: Double) =>
+              val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) +
+                model1.intercept
+              assert(prediction1 ~== prediction2 relTol 1E-5)
         }
       }
     }
@@ -247,18 +258,15 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
       val trainer2 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57)
         .setFitIntercept(false).setStandardization(false).setSolver(solver)
 
-      var model1: LinearRegressionModel = null
-      var model2: LinearRegressionModel = null
-
      // The normal solver does not support the L1-only regularization case.
       if (solver == "normal") {
         intercept[IllegalArgumentException] {
-            trainer1.fit(dataset)
-            trainer2.fit(dataset)
+            trainer1.fit(datasetWithDenseFeature)
+            trainer2.fit(datasetWithDenseFeature)
           }
       } else {
-        model1 = trainer1.fit(dataset)
-        model2 = trainer2.fit(dataset)
+        val model1 = trainer1.fit(datasetWithDenseFeature)
+        val model2 = trainer2.fit(datasetWithDenseFeature)
 
         /*
            weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57,
@@ -292,11 +300,12 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
         assert(model2.intercept ~== interceptR2 absTol 1E-3)
         assert(model2.weights ~= weightsR2 relTol 1E-3)
 
-        model1.transform(dataset).select("features", "prediction").collect().foreach {
-          case Row(features: DenseVector, prediction1: Double) =>
-            val prediction2 =
-              features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept
-            assert(prediction1 ~== prediction2 relTol 1E-5)
+        model1.transform(datasetWithDenseFeature).select("features", "prediction")
+          .collect().foreach {
+            case Row(features: DenseVector, prediction1: Double) =>
+              val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) +
+                model1.intercept
+              assert(prediction1 ~== prediction2 relTol 1E-5)
         }
       }
     }
@@ -308,8 +317,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
         .setSolver(solver)
       val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3)
         .setStandardization(false).setSolver(solver)
-      val model1 = trainer1.fit(dataset)
-      val model2 = trainer2.fit(dataset)
+      val model1 = trainer1.fit(datasetWithDenseFeature)
+      val model2 = trainer2.fit(datasetWithDenseFeature)
 
       /*
          weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3))
@@ -342,7 +351,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
       assert(model2.intercept ~== interceptR2 relTol 1E-3)
       assert(model2.weights ~= weightsR2 relTol 1E-3)
 
-      model1.transform(dataset).select("features", "prediction").collect().foreach {
+      model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach {
         case Row(features: DenseVector, prediction1: Double) =>
           val prediction2 =
             features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept
@@ -357,8 +366,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
         .setFitIntercept(false).setSolver(solver)
       val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3)
         .setFitIntercept(false).setStandardization(false).setSolver(solver)
-      val model1 = trainer1.fit(dataset)
-      val model2 = trainer2.fit(dataset)
+      val model1 = trainer1.fit(datasetWithDenseFeature)
+      val model2 = trainer2.fit(datasetWithDenseFeature)
 
       /*
          weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3,
@@ -392,7 +401,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
       assert(model2.intercept ~== interceptR2 absTol 1E-3)
       assert(model2.weights ~= weightsR2 relTol 1E-3)
 
-      model1.transform(dataset).select("features", "prediction").collect().foreach {
+      model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach {
         case Row(features: DenseVector, prediction1: Double) =>
           val prediction2 =
             features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept
@@ -408,18 +417,15 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
       val trainer2 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6)
         .setStandardization(false).setSolver(solver)
 
-      var model1: LinearRegressionModel = null
-      var model2: LinearRegressionModel = null
-
      // The normal solver does not support a non-zero elastic-net parameter.
       if (solver == "normal") {
         intercept[IllegalArgumentException] {
-            trainer1.fit(dataset)
-            trainer2.fit(dataset)
+            trainer1.fit(datasetWithDenseFeature)
+            trainer2.fit(datasetWithDenseFeature)
           }
       } else {
-        model1 = trainer1.fit(dataset)
-        model2 = trainer2.fit(dataset)
+        val model1 = trainer1.fit(datasetWithDenseFeature)
+        val model2 = trainer2.fit(datasetWithDenseFeature)
 
         /*
            weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6))
@@ -452,10 +458,11 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
         assert(model2.intercept ~== interceptR2 relTol 1E-3)
         assert(model2.weights ~= weightsR2 relTol 1E-3)
 
-        model1.transform(dataset).select("features", "prediction").collect().foreach {
+        model1.transform(datasetWithDenseFeature).select("features", "prediction")
+          .collect().foreach {
           case Row(features: DenseVector, prediction1: Double) =>
-            val prediction2 =
-              features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept
+            val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) +
+              model1.intercept
             assert(prediction1 ~== prediction2 relTol 1E-5)
         }
       }
@@ -469,18 +476,15 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
       val trainer2 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6)
         .setFitIntercept(false).setStandardization(false).setSolver(solver)
 
-      var model1: LinearRegressionModel = null
-      var model2: LinearRegressionModel = null
-
      // The normal solver does not support a non-zero elastic-net parameter.
       if (solver == "normal") {
         intercept[IllegalArgumentException] {
-            trainer1.fit(dataset)
-            trainer2.fit(dataset)
+            trainer1.fit(datasetWithDenseFeature)
+            trainer2.fit(datasetWithDenseFeature)
           }
       } else {
-        model1 = trainer1.fit(dataset)
-        model2 = trainer2.fit(dataset)
+        val model1 = trainer1.fit(datasetWithDenseFeature)
+        val model2 = trainer2.fit(datasetWithDenseFeature)
 
         /*
            weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6,
@@ -514,10 +518,11 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
         assert(model2.intercept ~== interceptR2 absTol 1E-3)
         assert(model2.weights ~= weightsR2 relTol 1E-3)
 
-        model1.transform(dataset).select("features", "prediction").collect().foreach {
+        model1.transform(datasetWithDenseFeature).select("features", "prediction")
+          .collect().foreach {
           case Row(features: DenseVector, prediction1: Double) =>
-            val prediction2 =
-              features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept
+            val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) +
+              model1.intercept
             assert(prediction1 ~== prediction2 relTol 1E-5)
         }
       }
@@ -527,27 +532,26 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
   test("linear regression model training summary") {
     Seq("auto", "l-bfgs", "normal").foreach { solver =>
       val trainer = new LinearRegression().setSolver(solver)
-      val model = trainer.fit(dataset)
+      val model = trainer.fit(datasetWithDenseFeature)
       val trainerNoPredictionCol = trainer.setPredictionCol("")
-      val modelNoPredictionCol = trainerNoPredictionCol.fit(dataset)
-
+      val modelNoPredictionCol = trainerNoPredictionCol.fit(datasetWithDenseFeature)
 
       // Training results for the model should be available
       assert(model.hasSummary)
       assert(modelNoPredictionCol.hasSummary)
 
       // Schema should be a superset of the input dataset
-      assert((dataset.schema.fieldNames.toSet + "prediction").subsetOf(
+      assert((datasetWithDenseFeature.schema.fieldNames.toSet + "prediction").subsetOf(
         model.summary.predictions.schema.fieldNames.toSet))
       // Validate that we re-insert a prediction column for evaluation
       val modelNoPredictionColFieldNames
       = modelNoPredictionCol.summary.predictions.schema.fieldNames
-      assert((dataset.schema.fieldNames.toSet).subsetOf(
+      assert((datasetWithDenseFeature.schema.fieldNames.toSet).subsetOf(
         modelNoPredictionColFieldNames.toSet))
       assert(modelNoPredictionColFieldNames.exists(s => s.startsWith("prediction_")))
 
       // Residuals in [[LinearRegressionResults]] should equal those manually computed
-      val expectedResiduals = dataset.select("features", "label")
+      val expectedResiduals = datasetWithDenseFeature.select("features", "label")
         .map { case Row(features: DenseVector, label: Double) =>
         val prediction =
           features(0) * model.weights(0) + features(1) * model.weights(1) + model.intercept
@@ -585,6 +589,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
             .objectiveHistory
             .sliding(2)
             .forall(x => x(0) >= x(1)))
+      } else {
+        // To verify that the normal solver is used here.
+        assert(model.summary.objectiveHistory.length == 1)
+        assert(model.summary.objectiveHistory(0) == 0.0)
       }
     }
   }
@@ -592,10 +600,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
   test("linear regression model testset evaluation summary") {
     Seq("auto", "l-bfgs", "normal").foreach { solver =>
       val trainer = new LinearRegression().setSolver(solver)
-      val model = trainer.fit(dataset)
+      val model = trainer.fit(datasetWithDenseFeature)
 
       // Evaluating on training dataset should yield results summary equal to training summary
-      val testSummary = model.evaluate(dataset)
+      val testSummary = model.evaluate(datasetWithDenseFeature)
       assert(model.summary.meanSquaredError ~== testSummary.meanSquaredError relTol 1E-5)
       assert(model.summary.r2 ~== testSummary.r2 relTol 1E-5)
       model.summary.residuals.select("residuals").collect()
@@ -693,4 +701,18 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
       assert(model4a0.weights ~== model4b.weights absTol 1E-3)
     }
   }
+
+  test("linear regression model with l-bfgs with big feature datasets") {
+    val trainer = new LinearRegression().setSolver("auto")
+    val model = trainer.fit(datasetWithSparseFeature)
+
+    // Training results for the model should be available
+    assert(model.hasSummary)
+    // When l-bfgs is used as the optimizer, the objective history can be retrieved.
+    assert(
+      model.summary
+        .objectiveHistory
+        .sliding(2)
+        .forall(x => x(0) >= x(1)))
+  }
 }
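
A closing note on the assertions above: which solver actually ran can be inferred from the training summary. Here is a minimal sketch, assuming the behavior the tests assert — the normal solver reports a single 0.0 objective value, while l-bfgs records one non-increasing value per iteration. usedNormalSolver is a hypothetical helper, not a Spark API.

import org.apache.spark.ml.regression.LinearRegressionModel

// Hypothetical helper mirroring the assertions above: a single 0.0 entry in
// objectiveHistory indicates the normal solver; l-bfgs leaves a longer,
// non-increasing history.
def usedNormalSolver(model: LinearRegressionModel): Boolean = {
  val history = model.summary.objectiveHistory
  history.length == 1 && history(0) == 0.0
}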

