You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by me...@apache.org on 2015/02/17 07:09:08 UTC
spark git commit: [SPARK-5802][MLLIB] cache transformed data in glm
Repository: spark
Updated Branches:
refs/heads/master d380f324c -> fd84229e2
[SPARK-5802][MLLIB] cache transformed data in glm
If we need to transform the input data, we should cache the output to avoid re-computing feature vectors every iteration. cc @dbtsai
Author: Xiangrui Meng <me...@databricks.com>
Closes #4593 from mengxr/SPARK-5802 and squashes the following commits:
ae3be84 [Xiangrui Meng] cache transformed data in glm
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fd84229e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fd84229e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fd84229e
Branch: refs/heads/master
Commit: fd84229e2aeb6a03760703c9dccd2db853779400
Parents: d380f32
Author: Xiangrui Meng <me...@databricks.com>
Authored: Mon Feb 16 22:09:04 2015 -0800
Committer: Xiangrui Meng <me...@databricks.com>
Committed: Mon Feb 16 22:09:04 2015 -0800
----------------------------------------------------------------------
.../regression/GeneralizedLinearAlgorithm.scala | 29 ++++++++++----------
1 file changed, 15 insertions(+), 14 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/fd84229e/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
index 17de215..2b71453 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
@@ -205,7 +205,7 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
throw new SparkException("Input validation failed.")
}
- /**
+ /*
* Scaling columns to unit variance as a heuristic to reduce the condition number:
*
* During the optimization process, the convergence (rate) depends on the condition number of
@@ -225,26 +225,27 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
* Currently, it's only enabled in LogisticRegressionWithLBFGS
*/
val scaler = if (useFeatureScaling) {
- (new StandardScaler(withStd = true, withMean = false)).fit(input.map(x => x.features))
+ new StandardScaler(withStd = true, withMean = false).fit(input.map(_.features))
} else {
null
}
// Prepend an extra variable consisting of all 1.0's for the intercept.
- val data = if (addIntercept) {
- if (useFeatureScaling) {
- input.map(labeledPoint =>
- (labeledPoint.label, appendBias(scaler.transform(labeledPoint.features))))
- } else {
- input.map(labeledPoint => (labeledPoint.label, appendBias(labeledPoint.features)))
- }
- } else {
- if (useFeatureScaling) {
- input.map(labeledPoint => (labeledPoint.label, scaler.transform(labeledPoint.features)))
+ // TODO: Apply feature scaling to the weight vector instead of input data.
+ val data =
+ if (addIntercept) {
+ if (useFeatureScaling) {
+ input.map(lp => (lp.label, appendBias(scaler.transform(lp.features)))).cache()
+ } else {
+ input.map(lp => (lp.label, appendBias(lp.features))).cache()
+ }
} else {
- input.map(labeledPoint => (labeledPoint.label, labeledPoint.features))
+ if (useFeatureScaling) {
+ input.map(lp => (lp.label, scaler.transform(lp.features))).cache()
+ } else {
+ input.map(lp => (lp.label, lp.features))
+ }
}
- }
/**
* TODO: For better convergence, in logistic regression, the intercepts should be computed
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org