You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by db...@apache.org on 2017/03/25 17:42:40 UTC
spark git commit: [SPARK-17137][ML][WIP] Compress logistic regression
coefficients
Repository: spark
Updated Branches:
refs/heads/master e8ddb91c7 -> be85245a9
[SPARK-17137][ML][WIP] Compress logistic regression coefficients
## What changes were proposed in this pull request?
Use the new `compressed` method on matrices to store the logistic regression coefficients as sparse or dense - whichever requires less memory.
Marked as WIP so we can add some performance test results. Basically, we should see if prediction is slower because of using a sparse matrix over a dense one. This can happen since sparse matrices do not use native BLAS operations when computing the margins.
## How was this patch tested?
Unit tests added.
Author: sethah <se...@gmail.com>
Closes #17426 from sethah/SPARK-17137.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/be85245a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/be85245a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/be85245a
Branch: refs/heads/master
Commit: be85245a98d58f636ff54956cdfde15ea5cd6122
Parents: e8ddb91
Author: sethah <se...@gmail.com>
Authored: Sat Mar 25 17:41:59 2017 +0000
Committer: DB Tsai <db...@dbtsai.com>
Committed: Sat Mar 25 17:41:59 2017 +0000
----------------------------------------------------------------------
.../ml/classification/LogisticRegression.scala | 28 ++--------
.../LogisticRegressionSuite.scala | 58 +++++++++++++++-----
2 files changed, 49 insertions(+), 37 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/be85245a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 1a78187..7b56bce 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -399,14 +399,9 @@ class LogisticRegression @Since("1.2.0") (
logWarning(s"All labels are the same value and fitIntercept=true, so the coefficients " +
s"will be zeros. Training is not needed.")
val constantLabelIndex = Vectors.dense(histogram).argmax
- // TODO: use `compressed` after SPARK-17471
- val coefMatrix = if (numFeatures < numCoefficientSets) {
- new SparseMatrix(numCoefficientSets, numFeatures,
- Array.fill(numFeatures + 1)(0), Array.empty[Int], Array.empty[Double])
- } else {
- new SparseMatrix(numCoefficientSets, numFeatures, Array.fill(numCoefficientSets + 1)(0),
- Array.empty[Int], Array.empty[Double], isTransposed = true)
- }
+ val coefMatrix = new SparseMatrix(numCoefficientSets, numFeatures,
+ new Array[Int](numCoefficientSets + 1), Array.empty[Int], Array.empty[Double],
+ isTransposed = true).compressed
val interceptVec = if (isMultinomial) {
Vectors.sparse(numClasses, Seq((constantLabelIndex, Double.PositiveInfinity)))
} else {
@@ -617,26 +612,13 @@ class LogisticRegression @Since("1.2.0") (
denseCoefficientMatrix.update(_ - coefficientMean)
}
- // TODO: use `denseCoefficientMatrix.compressed` after SPARK-17471
- val compressedCoefficientMatrix = if (isMultinomial) {
- denseCoefficientMatrix
- } else {
- val compressedVector = Vectors.dense(denseCoefficientMatrix.values).compressed
- compressedVector match {
- case dv: DenseVector => denseCoefficientMatrix
- case sv: SparseVector =>
- new SparseMatrix(1, numFeatures, Array(0, sv.indices.length), sv.indices, sv.values,
- isTransposed = true)
- }
- }
-
// center the intercepts when using multinomial algorithm
if ($(fitIntercept) && isMultinomial) {
val interceptArray = interceptVec.toArray
val interceptMean = interceptArray.sum / interceptArray.length
(0 until interceptVec.size).foreach { i => interceptArray(i) -= interceptMean }
}
- (compressedCoefficientMatrix, interceptVec.compressed, arrayBuilder.result())
+ (denseCoefficientMatrix.compressed, interceptVec.compressed, arrayBuilder.result())
}
}
@@ -713,7 +695,7 @@ class LogisticRegressionModel private[spark] (
// convert to appropriate vector representation without replicating data
private lazy val _coefficients: Vector = {
require(coefficientMatrix.isTransposed,
- "LogisticRegressionModel coefficients should be row major.")
+ "LogisticRegressionModel coefficients should be row major for binomial model.")
coefficientMatrix match {
case dm: DenseMatrix => Vectors.dense(dm.values)
case sm: SparseMatrix => Vectors.sparse(coefficientMatrix.numCols, sm.rowIndices, sm.values)
http://git-wip-us.apache.org/repos/asf/spark/blob/be85245a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index affaa57..1b64480 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -26,7 +26,7 @@ import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.classification.LogisticRegressionSuite._
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
-import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, SparseMatrix, SparseVector, Vector, Vectors}
+import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, SparseMatrix, Vector, Vectors}
import org.apache.spark.ml.param.{ParamMap, ParamsSuite}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
@@ -713,8 +713,6 @@ class LogisticRegressionSuite
assert(model2.intercept ~== interceptR relTol 1E-2)
assert(model2.coefficients ~== coefficientsR absTol 1E-3)
- // TODO: move this to a standalone test of compression after SPARK-17471
- assert(model2.coefficients.isInstanceOf[SparseVector])
}
test("binary logistic regression without intercept with L1 regularization") {
@@ -2031,29 +2029,61 @@ class LogisticRegressionSuite
// TODO: check num iters is zero when it become available in the model
}
- test("compressed storage") {
+ test("compressed storage for constant label") {
+ /*
+ When the label is constant and fit intercept is true, all the coefficients will be
+ zeros, and so the model coefficients should be stored as sparse data structures, except
+ when the matrix dimensions are very small.
+ */
val moreClassesThanFeatures = Seq(
- LabeledPoint(4.0, Vectors.dense(0.0, 0.0, 0.0)),
- LabeledPoint(4.0, Vectors.dense(1.0, 1.0, 1.0)),
- LabeledPoint(4.0, Vectors.dense(2.0, 2.0, 2.0))).toDF()
- val mlr = new LogisticRegression().setFamily("multinomial")
+ LabeledPoint(4.0, Vectors.dense(Array.fill(5)(0.0))),
+ LabeledPoint(4.0, Vectors.dense(Array.fill(5)(1.0))),
+ LabeledPoint(4.0, Vectors.dense(Array.fill(5)(2.0)))).toDF()
+ val mlr = new LogisticRegression().setFamily("multinomial").setFitIntercept(true)
val model = mlr.fit(moreClassesThanFeatures)
assert(model.coefficientMatrix.isInstanceOf[SparseMatrix])
- assert(model.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 4)
+ assert(model.coefficientMatrix.isColMajor)
+
+ // in this case, it should be stored as row major
val moreFeaturesThanClasses = Seq(
- LabeledPoint(1.0, Vectors.dense(0.0, 0.0, 0.0)),
- LabeledPoint(1.0, Vectors.dense(1.0, 1.0, 1.0)),
- LabeledPoint(1.0, Vectors.dense(2.0, 2.0, 2.0))).toDF()
+ LabeledPoint(1.0, Vectors.dense(Array.fill(5)(0.0))),
+ LabeledPoint(1.0, Vectors.dense(Array.fill(5)(1.0))),
+ LabeledPoint(1.0, Vectors.dense(Array.fill(5)(2.0)))).toDF()
val model2 = mlr.fit(moreFeaturesThanClasses)
assert(model2.coefficientMatrix.isInstanceOf[SparseMatrix])
- assert(model2.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 3)
+ assert(model2.coefficientMatrix.isRowMajor)
- val blr = new LogisticRegression().setFamily("binomial")
+ val blr = new LogisticRegression().setFamily("binomial").setFitIntercept(true)
val blrModel = blr.fit(moreFeaturesThanClasses)
assert(blrModel.coefficientMatrix.isInstanceOf[SparseMatrix])
assert(blrModel.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 2)
}
+ test("compressed coefficients") {
+
+ val trainer1 = new LogisticRegression()
+ .setRegParam(0.1)
+ .setElasticNetParam(1.0)
+
+ // compressed row major is optimal
+ val model1 = trainer1.fit(multinomialDataset.limit(100))
+ assert(model1.coefficientMatrix.isInstanceOf[SparseMatrix])
+ assert(model1.coefficientMatrix.isRowMajor)
+
+ // compressed column major is optimal since there are more classes than features
+ val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(6).toMetadata()
+ val model2 = trainer1.fit(multinomialDataset
+ .withColumn("label", col("label").as("label", labelMeta)).limit(100))
+ assert(model2.coefficientMatrix.isInstanceOf[SparseMatrix])
+ assert(model2.coefficientMatrix.isColMajor)
+
+ // coefficients are dense without L1 regularization
+ val trainer2 = new LogisticRegression()
+ .setElasticNetParam(0.0)
+ val model3 = trainer2.fit(multinomialDataset.limit(100))
+ assert(model3.coefficientMatrix.isInstanceOf[DenseMatrix])
+ }
+
test("numClasses specified in metadata/inferred") {
val lr = new LogisticRegression().setMaxIter(1).setFamily("multinomial")
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org