You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by rx...@apache.org on 2014/03/18 23:14:22 UTC
git commit: [SPARK-1260]: faster construction of features with intercept

Repository: spark
Updated Branches:
  refs/heads/master 79e547fe5 -> e108b9ab9


[SPARK-1260]: faster construction of features with intercept

The current implementation uses `Array(1.0, features: _*)` to construct a new array with intercept. This is not efficient for big arrays because `Array.apply` uses a for loop that iterates over the arguments. `Array.+:` is a better choice here.

Also, I don't see a reason to set initial weights to ones. So I set them to zeros.

JIRA: https://spark-project.atlassian.net/browse/SPARK-1260

Author: Xiangrui Meng <me...@databricks.com>

Closes #161 from mengxr/sgd and squashes the following commits:

b5cfc53 [Xiangrui Meng] set default weights to zeros
a1439c2 [Xiangrui Meng] faster construction of features with intercept


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e108b9ab
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e108b9ab
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e108b9ab

Branch: refs/heads/master
Commit: e108b9ab94c4310ec56ef0eda99bb904133f942d
Parents: 79e547f
Author: Xiangrui Meng <me...@databricks.com>
Authored: Tue Mar 18 15:14:13 2014 -0700
Committer: Reynold Xin <rx...@apache.org>
Committed: Tue Mar 18 15:14:13 2014 -0700

----------------------------------------------------------------------
 .../spark/mllib/regression/GeneralizedLinearAlgorithm.scala  | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/e108b9ab/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
index f98b0b5..b962153 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
@@ -119,7 +119,7 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
    */
   def run(input: RDD[LabeledPoint]) : M = {
     val nfeatures: Int = input.first().features.length
-    val initialWeights = Array.fill(nfeatures)(1.0)
+    val initialWeights = new Array[Double](nfeatures)
     run(input, initialWeights)
   }
 
@@ -134,15 +134,15 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
       throw new SparkException("Input validation failed.")
     }
 
-    // Add a extra variable consisting of all 1.0's for the intercept.
+    // Prepend an extra variable consisting of all 1.0's for the intercept.
     val data = if (addIntercept) {
-      input.map(labeledPoint => (labeledPoint.label, Array(1.0, labeledPoint.features:_*)))
+      input.map(labeledPoint => (labeledPoint.label, labeledPoint.features.+:(1.0)))
     } else {
       input.map(labeledPoint => (labeledPoint.label, labeledPoint.features))
     }
 
     val initialWeightsWithIntercept = if (addIntercept) {
-      Array(1.0, initialWeights:_*)
+      initialWeights.+:(1.0)
     } else {
       initialWeights
     }