You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by me...@apache.org on 2015/01/06 22:58:01 UTC

spark git commit: SPARK-5017 [MLlib] - Use SVD to compute determinant and inverse of covariance matrix

Repository: spark
Updated Branches:
  refs/heads/master 4cba6eb42 -> 4108e5f36


SPARK-5017 [MLlib] - Use SVD to compute determinant and inverse of covariance matrix

MultivariateGaussian was calling both pinv() and det() on the covariance matrix, effectively performing two matrix decompositions.  Both values are now computed using the singular value decompositon. Both the pseudo-inverse and the pseudo-determinant are used to guard against singular matrices.

Author: Travis Galoppo <tj...@columbia.edu>

Closes #3871 from tgaloppo/spark-5017 and squashes the following commits:

383b5b3 [Travis Galoppo] MultivariateGaussian - minor optimization in density calculation
a5b8bc5 [Travis Galoppo] Added additional points to tests in test suite. Fixed comment in MultivariateGaussian
629d9d0 [Travis Galoppo] Moved some test values from var to val.
dc3d0f7 [Travis Galoppo] Catch potential exception calculating pseudo-determinant. Style improvements.
d448137 [Travis Galoppo] Added test suite for MultivariateGaussian, including test for degenerate case.
1989be0 [Travis Galoppo] SPARK-5017 - Fixed to use SVD to compute determinant and inverse of covariance matrix.  Previous code called both pinv() and det(), effectively performing two matrix decompositions. Additionally, the pinv() implementation in Breeze is known to fail for singular matrices.
b4415ea [Travis Galoppo] Merge branch 'spark-5017' of https://github.com/tgaloppo/spark into spark-5017
6f11b6d [Travis Galoppo] SPARK-5017 - Use SVD to compute determinant and inverse of covariance matrix. Code was calling both det() and pinv(), effectively performing two matrix decompositions. Futhermore, Breeze pinv() currently fails for singular matrices.
fd9784c [Travis Galoppo] SPARK-5017 - Use SVD to compute determinant and inverse of covariance matrix


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4108e5f3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4108e5f3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4108e5f3

Branch: refs/heads/master
Commit: 4108e5f36f8553bd728fd271baa69f7dfcc68d9b
Parents: 4cba6eb
Author: Travis Galoppo <tj...@columbia.edu>
Authored: Tue Jan 6 13:57:42 2015 -0800
Committer: Xiangrui Meng <me...@databricks.com>
Committed: Tue Jan 6 13:57:42 2015 -0800

----------------------------------------------------------------------
 .../mllib/stat/impl/MultivariateGaussian.scala  | 83 +++++++++++++++++---
 .../stat/impl/MultivariateGaussianSuite.scala   | 70 +++++++++++++++++
 2 files changed, 142 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/4108e5f3/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala
index 2eab5d2..bc7f6c5 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala
@@ -17,23 +17,84 @@
 
 package org.apache.spark.mllib.stat.impl
 
-import breeze.linalg.{DenseVector => DBV, DenseMatrix => DBM, Transpose, det, pinv}
+import breeze.linalg.{DenseVector => DBV, DenseMatrix => DBM, diag, max, eigSym}
 
-/** 
-   * Utility class to implement the density function for multivariate Gaussian distribution.
-   * Breeze provides this functionality, but it requires the Apache Commons Math library,
-   * so this class is here so-as to not introduce a new dependency in Spark.
-   */
+import org.apache.spark.mllib.util.MLUtils
+
+/**
+ * This class provides basic functionality for a Multivariate Gaussian (Normal) Distribution. In
+ * the event that the covariance matrix is singular, the density will be computed in a
+ * reduced dimensional subspace under which the distribution is supported.
+ * (see [[http://en.wikipedia.org/wiki/Multivariate_normal_distribution#Degenerate_case]])
+ * 
+ * @param mu The mean vector of the distribution
+ * @param sigma The covariance matrix of the distribution
+ */
 private[mllib] class MultivariateGaussian(
     val mu: DBV[Double], 
     val sigma: DBM[Double]) extends Serializable {
-  private val sigmaInv2 = pinv(sigma) * -0.5
-  private val U = math.pow(2.0 * math.Pi, -mu.length / 2.0) * math.pow(det(sigma), -0.5)
-    
+
+  /**
+   * Compute distribution dependent constants:
+   *    rootSigmaInv = D^(-1/2) * U, where sigma = U * D * U.t
+   *    u = (2*pi)^(-k/2) * det(sigma)^(-1/2) 
+   */
+  private val (rootSigmaInv: DBM[Double], u: Double) = calculateCovarianceConstants
+  
   /** Returns density of this multivariate Gaussian at given point, x */
   def pdf(x: DBV[Double]): Double = {
     val delta = x - mu
-    val deltaTranspose = new Transpose(delta)
-    U * math.exp(deltaTranspose * sigmaInv2 * delta)
+    val v = rootSigmaInv * delta
+    u * math.exp(v.t * v * -0.5)
+  }
+  
+  /**
+   * Calculate distribution dependent components used for the density function:
+   *    pdf(x) = (2*pi)^(-k/2) * det(sigma)^(-1/2) * exp( (-1/2) * (x-mu).t * inv(sigma) * (x-mu) )
+   * where k is length of the mean vector.
+   * 
+   * We here compute distribution-fixed parts 
+   *  (2*pi)^(-k/2) * det(sigma)^(-1/2)
+   * and
+   *  D^(-1/2) * U, where sigma = U * D * U.t
+   *  
+   * Both the determinant and the inverse can be computed from the singular value decomposition
+   * of sigma.  Noting that covariance matrices are always symmetric and positive semi-definite,
+   * we can use the eigendecomposition. We also do not compute the inverse directly; noting
+   * that 
+   * 
+   *    sigma = U * D * U.t
+   *    inv(Sigma) = U * inv(D) * U.t 
+   *               = (D^{-1/2} * U).t * (D^{-1/2} * U)
+   * 
+   * and thus
+   * 
+   *    -0.5 * (x-mu).t * inv(Sigma) * (x-mu) = -0.5 * norm(D^{-1/2} * U  * (x-mu))^2
+   *  
+   * To guard against singular covariance matrices, this method computes both the 
+   * pseudo-determinant and the pseudo-inverse (Moore-Penrose).  Singular values are considered
+   * to be non-zero only if they exceed a tolerance based on machine precision, matrix size, and
+   * relation to the maximum singular value (same tolerance used by, e.g., Octave).
+   */
+  private def calculateCovarianceConstants: (DBM[Double], Double) = {
+    val eigSym.EigSym(d, u) = eigSym(sigma) // sigma = u * diag(d) * u.t
+    
+    // For numerical stability, values are considered to be non-zero only if they exceed tol.
+    // This prevents any inverted value from exceeding (eps * n * max(d))^-1
+    val tol = MLUtils.EPSILON * max(d) * d.length
+    
+    try {
+      // pseudo-determinant is product of all non-zero singular values
+      val pdetSigma = d.activeValuesIterator.filter(_ > tol).reduce(_ * _)
+      
+      // calculate the root-pseudo-inverse of the diagonal matrix of singular values 
+      // by inverting the square root of all non-zero values
+      val pinvS = diag(new DBV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray))
+    
+      (pinvS * u, math.pow(2.0 * math.Pi, -mu.length / 2.0) * math.pow(pdetSigma, -0.5))
+    } catch {
+      case uex: UnsupportedOperationException =>
+        throw new IllegalArgumentException("Covariance matrix has no non-zero singular values")
+    }
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/4108e5f3/mllib/src/test/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussianSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussianSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussianSuite.scala
new file mode 100644
index 0000000..d58f258
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussianSuite.scala
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.stat.impl
+
+import org.scalatest.FunSuite
+
+import breeze.linalg.{ DenseVector => BDV, DenseMatrix => BDM }
+
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
+
+class MultivariateGaussianSuite extends FunSuite with MLlibTestSparkContext {
+  test("univariate") {
+    val x1 = new BDV(Array(0.0))
+    val x2 = new BDV(Array(1.5))
+                     
+    val mu = new BDV(Array(0.0))
+    val sigma1 = new BDM(1, 1, Array(1.0))
+    val dist1 = new MultivariateGaussian(mu, sigma1)
+    assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5)
+    assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5)
+    
+    val sigma2 = new BDM(1, 1, Array(4.0))
+    val dist2 = new MultivariateGaussian(mu, sigma2)
+    assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5)
+    assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5)
+  }
+  
+  test("multivariate") {
+    val x1 = new BDV(Array(0.0, 0.0))
+    val x2 = new BDV(Array(1.0, 1.0))
+    
+    val mu = new BDV(Array(0.0, 0.0))
+    val sigma1 = new BDM(2, 2, Array(1.0, 0.0, 0.0, 1.0))
+    val dist1 = new MultivariateGaussian(mu, sigma1)
+    assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5)
+    assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5)
+    
+    val sigma2 = new BDM(2, 2, Array(4.0, -1.0, -1.0, 2.0))
+    val dist2 = new MultivariateGaussian(mu, sigma2)
+    assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5)
+    assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5)
+  }
+  
+  test("multivariate degenerate") {
+    val x1 = new BDV(Array(0.0, 0.0))
+    val x2 = new BDV(Array(1.0, 1.0))
+    
+    val mu = new BDV(Array(0.0, 0.0))
+    val sigma = new BDM(2, 2, Array(1.0, 1.0, 1.0, 1.0))
+    val dist = new MultivariateGaussian(mu, sigma)
+    assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5)
+    assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5)
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org