You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by me...@apache.org on 2015/07/30 16:49:13 UTC

spark git commit: [SPARK-7368] [MLLIB] Add QR decomposition for RowMatrix

Repository: spark
Updated Branches:
  refs/heads/master 6175d6cfe -> d31c618e3


[SPARK-7368] [MLLIB] Add QR decomposition for RowMatrix

jira: https://issues.apache.org/jira/browse/SPARK-7368
Add QR decomposition for RowMatrix.

I'm not sure what's the blueprint about the distributed Matrix from community and whether this will be a desirable feature , so I sent a prototype for discussion. I'll go on polish the code and provide ut and performance statistics if it's acceptable.

The implementation refers to the [paper: https://www.cs.purdue.edu/homes/dgleich/publications/Benson%202013%20-%20direct-tsqr.pdf]
Austin R. Benson, David F. Gleich, James Demmel. "Direct QR factorizations for tall-and-skinny matrices in MapReduce architectures", 2013 IEEE International Conference on Big Data, which is a stable algorithm with good scalability.

Currently I tried it on a 400000 * 500 rowMatrix (16 partitions) and it can bring down the computation time from 8.8 mins (using breeze.linalg.qr.reduced)  to 2.6 mins on a 4 worker cluster. I think there will still be some room for performance improvement.

Any trial and suggestion is welcome.

Author: Yuhao Yang <hh...@gmail.com>

Closes #5909 from hhbyyh/qrDecomposition and squashes the following commits:

cec797b [Yuhao Yang] remove unnecessary qr
0fb1012 [Yuhao Yang] hierarchy R computing
3fbdb61 [Yuhao Yang] update qr to indirect and add ut
0d913d3 [Yuhao Yang] Merge remote-tracking branch 'upstream/master' into qrDecomposition
39213c3 [Yuhao Yang] Merge remote-tracking branch 'upstream/master' into qrDecomposition
c0fc0c7 [Yuhao Yang] Merge remote-tracking branch 'upstream/master' into qrDecomposition
39b0b22 [Yuhao Yang] initial draft for discussion


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d31c618e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d31c618e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d31c618e

Branch: refs/heads/master
Commit: d31c618e3c8838f8198556876b9dcbbbf835f7b2
Parents: 6175d6c
Author: Yuhao Yang <hh...@gmail.com>
Authored: Thu Jul 30 07:49:10 2015 -0700
Committer: Xiangrui Meng <me...@databricks.com>
Committed: Thu Jul 30 07:49:10 2015 -0700

----------------------------------------------------------------------
 .../linalg/SingularValueDecomposition.scala     |  8 ++++
 .../mllib/linalg/distributed/RowMatrix.scala    | 46 +++++++++++++++++++-
 .../linalg/distributed/RowMatrixSuite.scala     | 17 ++++++++
 3 files changed, 70 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/d31c618e/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala
index 9669c36..b416d50 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala
@@ -25,3 +25,11 @@ import org.apache.spark.annotation.Experimental
  */
 @Experimental
 case class SingularValueDecomposition[UType, VType](U: UType, s: Vector, V: VType)
+
+/**
+ * :: Experimental ::
+ * Represents QR factors.
+ */
+@Experimental
+case class QRDecomposition[UType, VType](Q: UType, R: VType)
+

http://git-wip-us.apache.org/repos/asf/spark/blob/d31c618e/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
index 1626da9..bfc90c9 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
@@ -22,7 +22,7 @@ import java.util.Arrays
 import scala.collection.mutable.ListBuffer
 
 import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, SparseVector => BSV, axpy => brzAxpy,
-  svd => brzSvd}
+  svd => brzSvd, MatrixSingularException, inv}
 import breeze.numerics.{sqrt => brzSqrt}
 import com.github.fommil.netlib.BLAS.{getInstance => blas}
 
@@ -498,6 +498,50 @@ class RowMatrix(
   }
 
   /**
+   * Compute QR decomposition for [[RowMatrix]]. The implementation is designed to optimize the QR
+   * decomposition (factorization) for the [[RowMatrix]] of a tall and skinny shape.
+   * Reference:
+   *  Paul G. Constantine, David F. Gleich. "Tall and skinny QR factorizations in MapReduce
+   *  architectures"  ([[http://dx.doi.org/10.1145/1996092.1996103]])
+   *
+   * @param computeQ whether to computeQ
+   * @return QRDecomposition(Q, R), Q = null if computeQ = false.
+   */
+  def tallSkinnyQR(computeQ: Boolean = false): QRDecomposition[RowMatrix, Matrix] = {
+    val col = numCols().toInt
+    // split rows horizontally into smaller matrices, and compute QR for each of them
+    val blockQRs = rows.glom().map { partRows =>
+      val bdm = BDM.zeros[Double](partRows.length, col)
+      var i = 0
+      partRows.foreach { row =>
+        bdm(i, ::) := row.toBreeze.t
+        i += 1
+      }
+      breeze.linalg.qr.reduced(bdm).r
+    }
+
+    // combine the R part from previous results vertically into a tall matrix
+    val combinedR = blockQRs.treeReduce{ (r1, r2) =>
+      val stackedR = BDM.vertcat(r1, r2)
+      breeze.linalg.qr.reduced(stackedR).r
+    }
+    val finalR = Matrices.fromBreeze(combinedR.toDenseMatrix)
+    val finalQ = if (computeQ) {
+      try {
+        val invR = inv(combinedR)
+        this.multiply(Matrices.fromBreeze(invR))
+      } catch {
+        case err: MatrixSingularException =>
+          logWarning("R is not invertible and return Q as null")
+          null
+      }
+    } else {
+      null
+    }
+    QRDecomposition(finalQ, finalR)
+  }
+
+  /**
    * Find all similar columns using the DIMSUM sampling algorithm, described in two papers
    *
    * http://arxiv.org/abs/1206.2082

http://git-wip-us.apache.org/repos/asf/spark/blob/d31c618e/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala
index b6cb53d..283ffec 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala
@@ -19,6 +19,7 @@ package org.apache.spark.mllib.linalg.distributed
 
 import scala.util.Random
 
+import breeze.numerics.abs
 import breeze.linalg.{DenseVector => BDV, DenseMatrix => BDM, norm => brzNorm, svd => brzSvd}
 
 import org.apache.spark.SparkFunSuite
@@ -238,6 +239,22 @@ class RowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext {
       }
     }
   }
+
+  test("QR Decomposition") {
+    for (mat <- Seq(denseMat, sparseMat)) {
+      val result = mat.tallSkinnyQR(true)
+      val expected = breeze.linalg.qr.reduced(mat.toBreeze())
+      val calcQ = result.Q
+      val calcR = result.R
+      assert(closeToZero(abs(expected.q) - abs(calcQ.toBreeze())))
+      assert(closeToZero(abs(expected.r) - abs(calcR.toBreeze.asInstanceOf[BDM[Double]])))
+      assert(closeToZero(calcQ.multiply(calcR).toBreeze - mat.toBreeze()))
+      // Decomposition without computing Q
+      val rOnly = mat.tallSkinnyQR(computeQ = false)
+      assert(rOnly.Q == null)
+      assert(closeToZero(abs(expected.r) - abs(rOnly.R.toBreeze.asInstanceOf[BDM[Double]])))
+    }
+  }
 }
 
 class RowMatrixClusterSuite extends SparkFunSuite with LocalClusterSparkContext {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org