Posted to commits@spark.apache.org by sr...@apache.org on 2015/12/08 12:08:30 UTC

spark git commit: [SPARK-11439][ML] Optimization of creating sparse feature without dense one

Repository: spark
Updated Branches:
  refs/heads/master 708129187 -> 037b7e76a


[SPARK-11439][ML] Optimization of creating sparse feature without dense one

LinearDataGenerator no longer materializes an intermediate dense vector when generating sparse features; non-zero indices are sampled and the SparseVector is built directly.
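
A minimal sketch of the idea (illustrative names and values, not the committed code verbatim): instead of filling a dense array and converting it with toSparse, sample only the surviving indices and build the SparseVector directly.

    import scala.util.Random
    import org.apache.spark.mllib.linalg.Vectors

    // Sample only the retained indices; no dense intermediate is allocated.
    val rnd = new Random(42)              // seed chosen for illustration
    val numFeatures = 1000
    val density = 0.01                    // fraction of entries kept non-zero
    val indices = (0 until numFeatures).filter(_ => rnd.nextDouble() < density)
    val values = indices.map(_ => rnd.nextGaussian())
    val features = Vectors.sparse(numFeatures, indices.toArray, values.toArray)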

Author: Nakul Jindal <nj...@us.ibm.com>

Closes #9756 from nakul02/SPARK-11439_sparse_without_creating_dense_feature.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/037b7e76
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/037b7e76
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/037b7e76

Branch: refs/heads/master
Commit: 037b7e76a7f8b59e031873a768d81417dd180472
Parents: 7081291
Author: Nakul Jindal <nj...@us.ibm.com>
Authored: Tue Dec 8 11:08:27 2015 +0000
Committer: Sean Owen <so...@cloudera.com>
Committed: Tue Dec 8 11:08:27 2015 +0000

----------------------------------------------------------------------
 .../spark/mllib/util/LinearDataGenerator.scala  |  44 ++--
 .../evaluation/RegressionEvaluatorSuite.scala   |   6 +-
 .../ml/regression/LinearRegressionSuite.scala   | 214 +++++++++++--------
 3 files changed, 142 insertions(+), 122 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/037b7e76/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
index 6ff07ee..094528e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
@@ -24,7 +24,7 @@ import com.github.fommil.netlib.BLAS.{getInstance => blas}
 
 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.{DeveloperApi, Since}
-import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.linalg.{BLAS, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.rdd.RDD
 
@@ -131,35 +131,27 @@ object LinearDataGenerator {
       eps: Double,
       sparsity: Double): Seq[LabeledPoint] = {
     require(0.0 <= sparsity && sparsity <= 1.0)
-    val rnd = new Random(seed)
-    val x = Array.fill[Array[Double]](nPoints)(
-      Array.fill[Double](weights.length)(rnd.nextDouble()))
-
-    val sparseRnd = new Random(seed)
-    x.foreach { v =>
-      var i = 0
-      val len = v.length
-      while (i < len) {
-        if (sparseRnd.nextDouble() < sparsity) {
-          v(i) = 0.0
-        } else {
-          v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)
-        }
-        i += 1
-      }
-    }
 
-    val y = x.map { xi =>
-      blas.ddot(weights.length, xi, 1, weights, 1) + intercept + eps * rnd.nextGaussian()
-    }
+    val rnd = new Random(seed)
+    def rndElement(i: Int) = {(rnd.nextDouble() - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)}
 
-    y.zip(x).map { p =>
-      if (sparsity == 0.0) {
+    if (sparsity == 0.0) {
+      (0 until nPoints).map { _ =>
+        val features = Vectors.dense(weights.indices.map { rndElement(_) }.toArray)
+        val label = BLAS.dot(Vectors.dense(weights), features) +
+          intercept + eps * rnd.nextGaussian()
         // Return LabeledPoints with DenseVector
-        LabeledPoint(p._1, Vectors.dense(p._2))
-      } else {
+        LabeledPoint(label, features)
+      }
+    } else {
+      (0 until nPoints).map { _ =>
+        val indices = weights.indices.filter { _ => rnd.nextDouble() <= sparsity}
+        val values = indices.map { rndElement(_) }
+        val features = Vectors.sparse(weights.length, indices.toArray, values.toArray)
+        val label = BLAS.dot(Vectors.dense(weights), features) +
+          intercept + eps * rnd.nextGaussian()
         // Return LabeledPoints with SparseVector
-        LabeledPoint(p._1, Vectors.dense(p._2).toSparse)
+        LabeledPoint(label, features)
       }
     }
   }
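
For reference, a hedged usage sketch of the generator after this change, assuming the generateLinearInput overload whose tail parameters (eps, sparsity) appear in the hunk above. The argument values are borrowed from the test suites below and are purely illustrative.

    import org.apache.spark.mllib.util.LinearDataGenerator

    // With a non-zero sparsity the returned LabeledPoints carry SparseVector
    // features built directly, without an intermediate dense array.
    val points = LinearDataGenerator.generateLinearInput(
      intercept = 6.3,
      weights = Array(4.7, 7.2),
      xMean = Array(0.9, -1.3),
      xVariance = Array(0.7, 1.2),
      nPoints = 10000,
      seed = 42,
      eps = 0.1,
      sparsity = 0.5)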

http://git-wip-us.apache.org/repos/asf/spark/blob/037b7e76/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
index 60886bf..954d3be 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
@@ -65,15 +65,15 @@ class RegressionEvaluatorSuite
 
     // default = rmse
     val evaluator = new RegressionEvaluator()
-    assert(evaluator.evaluate(predictions) ~== 0.1019382 absTol 0.001)
+    assert(evaluator.evaluate(predictions) ~== 0.1013829 absTol 0.01)
 
     // r2 score
     evaluator.setMetricName("r2")
-    assert(evaluator.evaluate(predictions) ~== 0.9998196 absTol 0.001)
+    assert(evaluator.evaluate(predictions) ~== 0.9998387 absTol 0.01)
 
     // mae
     evaluator.setMetricName("mae")
-    assert(evaluator.evaluate(predictions) ~== 0.08036075 absTol 0.001)
+    assert(evaluator.evaluate(predictions) ~== 0.08399089 absTol 0.01)
   }
 
   test("read/write") {
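
For readers unfamiliar with the evaluator under test, a short sketch of the API exercised above; a "predictions" DataFrame with "label" and "prediction" columns is assumed.

    import org.apache.spark.ml.evaluation.RegressionEvaluator

    val evaluator = new RegressionEvaluator()   // default metric is rmse
    val rmse = evaluator.evaluate(predictions)
    val r2   = evaluator.setMetricName("r2").evaluate(predictions)
    val mae  = evaluator.setMetricName("mae").evaluate(predictions)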

http://git-wip-us.apache.org/repos/asf/spark/blob/037b7e76/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index 2bdc0e1..2f3e703 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -42,6 +42,7 @@ class LinearRegressionSuite
      In `LinearRegressionSuite`, we will make sure that the model trained by SparkML
      is the same as the one trained by R's glmnet package. The following instruction
      describes how to reproduce the data in R.
+     In a spark-shell, use the following code:
 
      import org.apache.spark.mllib.util.LinearDataGenerator
      val data =
@@ -184,15 +185,15 @@ class LinearRegressionSuite
           3 x 1 sparse Matrix of class "dgCMatrix"
                                    s0
          (Intercept)         .
-         as.numeric.data.V2. 6.995908
-         as.numeric.data.V3. 5.275131
+         as.numeric.data.V2. 6.973403
+         as.numeric.data.V3. 5.284370
        */
-      val coefficientsR = Vectors.dense(6.995908, 5.275131)
+      val coefficientsR = Vectors.dense(6.973403, 5.284370)
 
-      assert(model1.intercept ~== 0 absTol 1E-3)
-      assert(model1.coefficients ~= coefficientsR relTol 1E-3)
-      assert(model2.intercept ~== 0 absTol 1E-3)
-      assert(model2.coefficients ~= coefficientsR relTol 1E-3)
+      assert(model1.intercept ~== 0 absTol 1E-2)
+      assert(model1.coefficients ~= coefficientsR relTol 1E-2)
+      assert(model2.intercept ~== 0 absTol 1E-2)
+      assert(model2.coefficients ~= coefficientsR relTol 1E-2)
 
       /*
          Then again with the data with no intercept:
@@ -235,14 +236,14 @@ class LinearRegressionSuite
            > coefficients
             3 x 1 sparse Matrix of class "dgCMatrix"
                                     s0
-           (Intercept)         6.24300
-           as.numeric.data.V2. 4.024821
-           as.numeric.data.V3. 6.679841
+           (Intercept)       6.242284
+           as.numeric.d1.V2. 4.019605
+           as.numeric.d1.V3. 6.679538
          */
-        val interceptR1 = 6.24300
-        val coefficientsR1 = Vectors.dense(4.024821, 6.679841)
-        assert(model1.intercept ~== interceptR1 relTol 1E-3)
-        assert(model1.coefficients ~= coefficientsR1 relTol 1E-3)
+        val interceptR1 = 6.242284
+        val coefficientsR1 = Vectors.dense(4.019605, 6.679538)
+        assert(model1.intercept ~== interceptR1 relTol 1E-2)
+        assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
 
         /*
            coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 1.0,
@@ -296,14 +297,14 @@ class LinearRegressionSuite
             3 x 1 sparse Matrix of class "dgCMatrix"
                                      s0
            (Intercept)          .
-           as.numeric.data.V2. 6.299752
-           as.numeric.data.V3. 4.772913
+           as.numeric.data.V2. 6.272927
+           as.numeric.data.V3. 4.782604
          */
         val interceptR1 = 0.0
-        val coefficientsR1 = Vectors.dense(6.299752, 4.772913)
+        val coefficientsR1 = Vectors.dense(6.272927, 4.782604)
 
-        assert(model1.intercept ~== interceptR1 absTol 1E-3)
-        assert(model1.coefficients ~= coefficientsR1 relTol 1E-3)
+        assert(model1.intercept ~== interceptR1 absTol 1E-2)
+        assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
 
         /*
            coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 1.0,
@@ -312,14 +313,14 @@ class LinearRegressionSuite
             3 x 1 sparse Matrix of class "dgCMatrix"
                                      s0
            (Intercept)         .
-           as.numeric.data.V2. 6.232193
-           as.numeric.data.V3. 4.764229
+           as.numeric.data.V2. 6.207817
+           as.numeric.data.V3. 4.775780
          */
         val interceptR2 = 0.0
-        val coefficientsR2 = Vectors.dense(6.232193, 4.764229)
+        val coefficientsR2 = Vectors.dense(6.207817, 4.775780)
 
-        assert(model2.intercept ~== interceptR2 absTol 1E-3)
-        assert(model2.coefficients ~= coefficientsR2 relTol 1E-3)
+        assert(model2.intercept ~== interceptR2 absTol 1E-2)
+        assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)
 
         model1.transform(datasetWithDenseFeature).select("features", "prediction")
           .collect().foreach {
@@ -347,15 +348,15 @@ class LinearRegressionSuite
          > coefficients
           3 x 1 sparse Matrix of class "dgCMatrix"
                                    s0
-         (Intercept)         5.269376
-         as.numeric.data.V2. 3.736216
-         as.numeric.data.V3. 5.712356)
+         (Intercept)       5.260103
+         as.numeric.d1.V2. 3.725522
+         as.numeric.d1.V3. 5.711203
        */
-      val interceptR1 = 5.269376
-      val coefficientsR1 = Vectors.dense(3.736216, 5.712356)
+      val interceptR1 = 5.260103
+      val coefficientsR1 = Vectors.dense(3.725522, 5.711203)
 
-      assert(model1.intercept ~== interceptR1 relTol 1E-3)
-      assert(model1.coefficients ~= coefficientsR1 relTol 1E-3)
+      assert(model1.intercept ~== interceptR1 relTol 1E-2)
+      assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
 
       /*
          coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3,
@@ -363,15 +364,15 @@ class LinearRegressionSuite
          > coefficients
           3 x 1 sparse Matrix of class "dgCMatrix"
                                    s0
-         (Intercept)         5.791109
-         as.numeric.data.V2. 3.435466
-         as.numeric.data.V3. 5.910406
+         (Intercept)       5.790885
+         as.numeric.d1.V2. 3.432373
+         as.numeric.d1.V3. 5.919196
        */
-      val interceptR2 = 5.791109
-      val coefficientsR2 = Vectors.dense(3.435466, 5.910406)
+      val interceptR2 = 5.790885
+      val coefficientsR2 = Vectors.dense(3.432373, 5.919196)
 
-      assert(model2.intercept ~== interceptR2 relTol 1E-3)
-      assert(model2.coefficients ~= coefficientsR2 relTol 1E-3)
+      assert(model2.intercept ~== interceptR2 relTol 1E-2)
+      assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)
 
       model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach {
         case Row(features: DenseVector, prediction1: Double) =>
@@ -398,15 +399,15 @@ class LinearRegressionSuite
          > coefficients
           3 x 1 sparse Matrix of class "dgCMatrix"
                                    s0
-         (Intercept)         .
-         as.numeric.data.V2. 5.522875
-         as.numeric.data.V3. 4.214502
+         (Intercept)       .
+         as.numeric.d1.V2. 5.493430
+         as.numeric.d1.V3. 4.223082
        */
       val interceptR1 = 0.0
-      val coefficientsR1 = Vectors.dense(5.522875, 4.214502)
+      val coefficientsR1 = Vectors.dense(5.493430, 4.223082)
 
-      assert(model1.intercept ~== interceptR1 absTol 1E-3)
-      assert(model1.coefficients ~= coefficientsR1 relTol 1E-3)
+      assert(model1.intercept ~== interceptR1 absTol 1E-2)
+      assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
 
       /*
          coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3,
@@ -415,14 +416,14 @@ class LinearRegressionSuite
           3 x 1 sparse Matrix of class "dgCMatrix"
                                    s0
          (Intercept)         .
-         as.numeric.data.V2. 5.263704
-         as.numeric.data.V3. 4.187419
+         as.numeric.d1.V2. 5.244324
+         as.numeric.d1.V3. 4.203106
        */
       val interceptR2 = 0.0
-      val coefficientsR2 = Vectors.dense(5.263704, 4.187419)
+      val coefficientsR2 = Vectors.dense(5.244324, 4.203106)
 
-      assert(model2.intercept ~== interceptR2 absTol 1E-3)
-      assert(model2.coefficients ~= coefficientsR2 relTol 1E-3)
+      assert(model2.intercept ~== interceptR2 absTol 1E-2)
+      assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)
 
       model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach {
         case Row(features: DenseVector, prediction1: Double) =>
@@ -457,15 +458,15 @@ class LinearRegressionSuite
            > coefficients
             3 x 1 sparse Matrix of class "dgCMatrix"
                                      s0
-           (Intercept)         6.324108
-           as.numeric.data.V2. 3.168435
-           as.numeric.data.V3. 5.200403
+           (Intercept)       5.689855
+           as.numeric.d1.V2. 3.661181
+           as.numeric.d1.V3. 6.000274
          */
-        val interceptR1 = 5.696056
-        val coefficientsR1 = Vectors.dense(3.670489, 6.001122)
+        val interceptR1 = 5.689855
+        val coefficientsR1 = Vectors.dense(3.661181, 6.000274)
 
-        assert(model1.intercept ~== interceptR1 relTol 1E-3)
-        assert(model1.coefficients ~= coefficientsR1 relTol 1E-3)
+        assert(model1.intercept ~== interceptR1 relTol 1E-2)
+        assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
 
         /*
            coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6
@@ -473,15 +474,15 @@ class LinearRegressionSuite
            > coefficients
             3 x 1 sparse Matrix of class "dgCMatrix"
                                      s0
-           (Intercept)         6.114723
-           as.numeric.data.V2. 3.409937
-           as.numeric.data.V3. 6.146531
+           (Intercept)       6.113890
+           as.numeric.d1.V2. 3.407021
+           as.numeric.d1.V3. 6.152512
          */
-        val interceptR2 = 6.114723
-        val coefficientsR2 = Vectors.dense(3.409937, 6.146531)
+        val interceptR2 = 6.113890
+        val coefficientsR2 = Vectors.dense(3.407021, 6.152512)
 
-        assert(model2.intercept ~== interceptR2 relTol 1E-3)
-        assert(model2.coefficients ~= coefficientsR2 relTol 1E-3)
+        assert(model2.intercept ~== interceptR2 relTol 1E-2)
+        assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)
 
         model1.transform(datasetWithDenseFeature).select("features", "prediction")
           .collect().foreach {
@@ -518,15 +519,15 @@ class LinearRegressionSuite
            > coefficients
             3 x 1 sparse Matrix of class "dgCMatrix"
                                       s0
-           (Intercept)         .
-           as.numeric.dataM.V2. 5.673348
-           as.numeric.dataM.V3. 4.322251
+           (Intercept)       .
+           as.numeric.d1.V2. 5.643748
+           as.numeric.d1.V3. 4.331519
          */
         val interceptR1 = 0.0
-        val coefficientsR1 = Vectors.dense(5.673348, 4.322251)
+        val coefficientsR1 = Vectors.dense(5.643748, 4.331519)
 
-        assert(model1.intercept ~== interceptR1 absTol 1E-3)
-        assert(model1.coefficients ~= coefficientsR1 relTol 1E-3)
+        assert(model1.intercept ~== interceptR1 absTol 1E-2)
+        assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
 
         /*
            coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.3,
@@ -535,14 +536,15 @@ class LinearRegressionSuite
             3 x 1 sparse Matrix of class "dgCMatrix"
                                      s0
            (Intercept)         .
-           as.numeric.data.V2. 5.477988
-           as.numeric.data.V3. 4.297622
+           as.numeric.d1.V2. 5.455902
+           as.numeric.d1.V3. 4.312266
+
          */
         val interceptR2 = 0.0
-        val coefficientsR2 = Vectors.dense(5.477988, 4.297622)
+        val coefficientsR2 = Vectors.dense(5.455902, 4.312266)
 
-        assert(model2.intercept ~== interceptR2 absTol 1E-3)
-        assert(model2.coefficients ~= coefficientsR2 relTol 1E-3)
+        assert(model2.intercept ~== interceptR2 absTol 1E-2)
+        assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)
 
         model1.transform(datasetWithDenseFeature).select("features", "prediction")
           .collect().foreach {
@@ -592,21 +594,47 @@ class LinearRegressionSuite
       }
 
       /*
-         Use the following R code to generate model training results.
-
-         predictions <- predict(fit, newx=features)
-         residuals <- label - predictions
-         > mean(residuals^2) # MSE
-         [1] 0.009720325
-         > mean(abs(residuals)) # MAD
-         [1] 0.07863206
-         > cor(predictions, label)^2# r^2
-                 [,1]
-         s0 0.9998749
+         # Use the following R code to generate model training results.
+
+         # path/part-00000 is the file generated by running LinearDataGenerator.generateLinearInput
+         # as described before the beforeAll() method.
+         d1 <- read.csv("path/part-00000", header=FALSE, stringsAsFactors=FALSE)
+         fit <- glm(V1 ~ V2 + V3, data = d1, family = "gaussian")
+         f1 <- data.frame(as.numeric(d1$V2), as.numeric(d1$V3))
+         names(f1)[1] = c("V2")
+         names(f1)[2] = c("V3")
+         predictions <- predict(fit, newdata=f1)
+         l1 <- as.numeric(d1$V1)
+
+         residuals <- l1 - predictions
+         > mean(residuals^2)           # MSE
+         [1] 0.00985449
+         > mean(abs(residuals))        # MAD
+         [1] 0.07961668
+         > cor(predictions, l1)^2   # r^2
+         [1] 0.9998737
+
+         > summary(fit)
+
+          Call:
+          glm(formula = V1 ~ V2 + V3, family = "gaussian", data = d1)
+
+          Deviance Residuals:
+               Min        1Q    Median        3Q       Max
+          -0.47082  -0.06797   0.00002   0.06725   0.34635
+
+          Coefficients:
+                       Estimate Std. Error t value Pr(>|t|)
+          (Intercept) 6.3022157  0.0018600    3388   <2e-16 ***
+          V2          4.6982442  0.0011805    3980   <2e-16 ***
+          V3          7.1994344  0.0009044    7961   <2e-16 ***
+          ---
+
+          ....
        */
-      assert(model.summary.meanSquaredError ~== 0.00972035 relTol 1E-5)
-      assert(model.summary.meanAbsoluteError ~== 0.07863206 relTol 1E-5)
-      assert(model.summary.r2 ~== 0.9998749 relTol 1E-5)
+      assert(model.summary.meanSquaredError ~== 0.00985449 relTol 1E-4)
+      assert(model.summary.meanAbsoluteError ~== 0.07961668 relTol 1E-4)
+      assert(model.summary.r2 ~== 0.9998737 relTol 1E-4)
 
       // Normal solver uses "WeightedLeastSquares". This algorithm does not generate
       // objective history because it does not run through iterations.
@@ -621,14 +649,14 @@ class LinearRegressionSuite
        // To clarify that the normal solver is used here.
         assert(model.summary.objectiveHistory.length == 1)
         assert(model.summary.objectiveHistory(0) == 0.0)
-        val devianceResidualsR = Array(-0.35566, 0.34504)
-        val seCoefR = Array(0.0011756, 0.0009032, 0.0018489)
-        val tValsR = Array(3998, 7971, 3407)
+        val devianceResidualsR = Array(-0.47082, 0.34635)
+        val seCoefR = Array(0.0011805, 0.0009044, 0.0018600)
+        val tValsR = Array(3980, 7961, 3388)
         val pValsR = Array(0, 0, 0)
         model.summary.devianceResiduals.zip(devianceResidualsR).foreach { x =>
-          assert(x._1 ~== x._2 absTol 1E-5) }
+          assert(x._1 ~== x._2 absTol 1E-4) }
         model.summary.coefficientStandardErrors.zip(seCoefR).foreach{ x =>
-          assert(x._1 ~== x._2 absTol 1E-5) }
+          assert(x._1 ~== x._2 absTol 1E-4) }
         model.summary.tValues.map(_.round).zip(tValsR).foreach{ x => assert(x._1 === x._2) }
         model.summary.pValues.map(_.round).zip(pValsR).foreach{ x => assert(x._1 === x._2) }
       }
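
As a cross-check on the summary metrics asserted above, the R residual computations translate to Scala roughly as follows. This is a sketch: "model" and "dataset" are assumed to be the fitted LinearRegressionModel and its training DataFrame from the suite.

    import org.apache.spark.sql.Row

    // Recompute MSE and MAD from predictions, mirroring mean(residuals^2)
    // and mean(abs(residuals)) in the R snippet above.
    val residuals = model.transform(dataset)
      .select("label", "prediction")
      .collect()
      .map { case Row(label: Double, prediction: Double) => label - prediction }
    val mse = residuals.map(r => r * r).sum / residuals.length
    val mad = residuals.map(math.abs).sum / residuals.length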

