You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/16 00:01:19 UTC
mahout git commit: MAHOUT-1837: Sparse/Dense Matrix analysis for
Matrix Multiplication. closes apache/mahout#228
Repository: mahout
Updated Branches:
refs/heads/master cfe52f2fa -> d9940489d
MAHOUT-1837: Sparse/Dense Matrix analysis for Matrix Multiplication. closes apache/mahout#228
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/d9940489
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/d9940489
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/d9940489
Branch: refs/heads/master
Commit: d9940489d2f849d36af396d603f6170ab560e505
Parents: cfe52f2
Author: Andrew Palumbo <ap...@apache.org>
Authored: Wed Jun 15 20:00:26 2016 -0400
Committer: Andrew Palumbo <ap...@apache.org>
Committed: Wed Jun 15 20:00:26 2016 -0400
----------------------------------------------------------------------
.../org/apache/mahout/math/drm/package.scala | 2 +-
.../apache/mahout/math/scalabindings/MMul.scala | 8 +--
.../mahout/math/scalabindings/package.scala | 61 ++++++++++++++++++++
.../mahout/math/scalabindings/MathSuite.scala | 30 ++++++++++
.../apache/mahout/sparkbindings/blas/ABt.scala | 11 +++-
.../mahout/sparkbindings/drm/package.scala | 24 +++++---
.../mahout/sparkbindings/drm/DrmLikeSuite.scala | 4 +-
7 files changed, 123 insertions(+), 17 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mahout/blob/d9940489/math-scala/src/main/scala/org/apache/mahout/math/drm/package.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/package.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/package.scala
index 86c7054..cdec954 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/package.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/package.scala
@@ -23,6 +23,7 @@ import org.apache.mahout.math.scalabindings._
import scala.reflect.ClassTag
import org.apache.mahout.math.drm.logical.OpAewUnaryFunc
+
import collection._
package object drm {
@@ -354,7 +355,6 @@ package object drm {
keys → block
}
}
-
}
package object indexeddataset {
http://git-wip-us.apache.org/repos/asf/mahout/blob/d9940489/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MMul.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MMul.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MMul.scala
index d72d2f0..6389806 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MMul.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MMul.scala
@@ -35,7 +35,7 @@ object MMul extends MMBinaryFunc {
val (af, bf) = (a.getFlavor, b.getFlavor)
val backs = (af.getBacking, bf.getBacking)
- val sd = (af.getStructure, af.isDense, bf.getStructure, bf.isDense)
+ val sd = (af.getStructure, sparsityAnalysis(a), bf.getStructure, sparsityAnalysis(b))
val alg: MMulAlg = backs match {
@@ -124,7 +124,7 @@ object MMul extends MMBinaryFunc {
require(r.forall(mxR ⇒ mxR.nrow == a.nrow && mxR.ncol == b.ncol))
val (m, n) = (a.nrow, b.ncol)
- val mxR = r.getOrElse(if (a.getFlavor.isDense) a.like(m, n) else b.like(m, n))
+ val mxR = r.getOrElse(if (sparsityAnalysis(a)) a.like(m, n) else b.like(m, n))
for (row ← 0 until mxR.nrow; col ← 0 until mxR.ncol) {
// this vector-vector should be sort of optimized, right?
@@ -269,10 +269,10 @@ object MMul extends MMBinaryFunc {
val (m, n) = (a.nrow, b.ncol)
// Prefer col-wise result iff a is dense and b is sparse. In all other cases default to row-wise.
- val preferColWiseR = a.getFlavor.isDense && !b.getFlavor.isDense
+ val preferColWiseR = sparsityAnalysis(a) && !sparsityAnalysis(b)
val mxR = r.getOrElse {
- (a.getFlavor.isDense, preferColWiseR) match {
+ (sparsityAnalysis(a), preferColWiseR) match {
case (false, false) ⇒ b.like(m, n)
case (false, true) ⇒ b.like(n, m).t
case (true, false) ⇒ a.like(m, n)
http://git-wip-us.apache.org/repos/asf/mahout/blob/d9940489/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/package.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/package.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/package.scala
index 10c534b..71a2c11 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/package.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/package.scala
@@ -18,7 +18,9 @@
package org.apache.mahout.math
import org.apache.mahout.math.solver.EigenDecomposition
+
import collection._
+import scala.util.Random
/**
* Mahout matrices and vectors' scala syntactic sugar
@@ -29,6 +31,12 @@ package object scalabindings {
// Reserved "ALL" range
final val `::`: Range = null
+ // values for stochastic sparsityAnalysis
+ final val z95 = 1.959964
+ final val z80 = 1.281552
+ final val maxSamples = 500
+ final val minSamples = 15
+
// Some enums
object AutoBooleanEnum extends Enumeration {
type T = Value
@@ -410,4 +418,57 @@ package object scalabindings {
def dist(mxX: Matrix, mxY: Matrix): Matrix = sqDist(mxX, mxY) := sqrt _
+ /**
+ * Check the density of an in-core matrix based on supplied criteria.
+ * Returns true if we think mx is denser than threshold with at least 80% confidence.
+ *
+ * @param mx The matrix to check density of.
+ * @param threshold the threshold of non-zero elements above which we consider a Matrix Dense
+ */
+ def sparsityAnalysis(mx: Matrix, threshold: Double = 0.25): Boolean = {
+
+ require(threshold >= 0.0 && threshold <= 1.0)
+ var n = minSamples
+ var mean = 0.0
+ val rnd = new Random()
+ val dimm = mx.nrow
+ val dimn = mx.ncol
+ val pq = threshold * (1 - threshold)
+
+ for (s ← 0 until minSamples) {
+ if (mx(rnd.nextInt(dimm), rnd.nextInt(dimn)) != 0.0) mean += 1
+ }
+ mean /= minSamples
+ val iv = z80 * math.sqrt(pq / n)
+
+ if (mean < threshold - iv) return false // sparse
+ else if (mean > threshold + iv) return true // dense
+
+ while (n < maxSamples) {
+ // Determine upper bound we may need for n to likely relinquish the uncertainty. Here, we use
+ // confidence interval formula but solved for n.
+ val ivNeeded = math.abs(threshold - mean) max 1e-11
+
+ val stderr = ivNeeded / z80
+ val nNeeded = (math.ceil(pq / (stderr * stderr)).toInt max n min maxSamples) - n
+
+ var meanNext = 0.0
+ for (s ← 0 until nNeeded) {
+ if (mx(rnd.nextInt(dimm), rnd.nextInt(dimn)) != 0.0) meanNext += 1
+ }
+ mean = (n * mean + meanNext) / (n + nNeeded)
+ n += nNeeded
+
+ // Are we good now?
+ val iv = z80 * math.sqrt(pq / n)
+ if (mean < threshold - iv) return false // sparse
+ else if (mean > threshold + iv) return true // dense
+ }
+
+ return mean <= threshold
+
+ }
+
+
+
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/d9940489/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MathSuite.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MathSuite.scala b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MathSuite.scala
index 0503e49..264e375 100644
--- a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MathSuite.scala
+++ b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MathSuite.scala
@@ -17,6 +17,8 @@
package org.apache.mahout.math.scalabindings
+import org.apache.log4j.Level
+
import org.apache.mahout.logging._
import org.apache.mahout.math._
import org.apache.mahout.math.scalabindings.RLikeOps._
@@ -234,4 +236,32 @@ class MathSuite extends FunSuite with MahoutSuite {
(mxDsq - mxDsqControl).norm should be < 1e-7
}
+ test("sparsity analysis") {
+ setLogLevel(Level.DEBUG)
+
+ val m = 500
+ val n = 800
+ val mxA = new DenseMatrix(m, n)
+
+ sparsityAnalysis(mxA) shouldBe false
+ sparsityAnalysis(mxA, .5) shouldBe false
+ sparsityAnalysis(mxA + 1) shouldBe true
+ sparsityAnalysis(mxA + 1, .95) shouldBe true
+
+ for (i ← 0 until m by 5) mxA(i, ::) := 1
+ info(s"20% detected as dense?:${sparsityAnalysis(mxA)}")
+ mxA := 0
+
+ for (i ← 0 until m by 3) mxA(i, ::) := 1
+ info(s"33% detected as dense?:${sparsityAnalysis(mxA)}")
+ mxA := 0
+
+ for (i ← 0 until m by 4) mxA(i, ::) := 1
+ info(s"25% detected as dense?:${sparsityAnalysis(mxA)}")
+
+ for (i ← 0 until m by 2) mxA(i, ::) := 1
+ info(s"50% detected as dense?:${sparsityAnalysis(mxA)}")
+
+ }
+
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/d9940489/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/ABt.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/ABt.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/ABt.scala
index 676b496..b57d8ae 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/ABt.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/ABt.scala
@@ -20,13 +20,15 @@ package org.apache.mahout.sparkbindings.blas
import org.apache.mahout.math.scalabindings._
import RLikeOps._
import org.apache.spark.rdd.RDD
+
import scala.reflect.ClassTag
import org.apache.mahout.sparkbindings._
import org.apache.mahout.math.drm.BlockifiedDrmTuple
import org.apache.mahout.sparkbindings.drm._
-import org.apache.mahout.math.{SparseMatrix, Matrix, SparseRowMatrix}
+import org.apache.mahout.math.{DenseMatrix, Matrix, SparseMatrix, SparseRowMatrix}
import org.apache.mahout.math.drm.logical.OpABt
import org.apache.mahout.logging._
+import org.apache.mahout.math.flavor.MatrixFlavor
/** Contains RDD plans for ABt operator */
object ABt {
@@ -116,7 +118,12 @@ object ABt {
// Empty combiner += value
createCombiner = (t: (Array[K], Array[Int], Matrix)) => {
val (rowKeys, colKeys, block) = t
- val comb = new SparseMatrix(prodNCol, block.nrow).t
+
+ val comb = if (block.getFlavor == MatrixFlavor.SPARSELIKE) {
+ new SparseMatrix(prodNCol, block.nrow).t
+ } else {
+ new DenseMatrix(prodNCol, block.nrow).t
+ }
for ((col, i) <- colKeys.zipWithIndex) comb(::, col) := block(::, i)
rowKeys -> comb
http://git-wip-us.apache.org/repos/asf/mahout/blob/d9940489/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/package.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/package.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/package.scala
index 64065d9..aca7c3c 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/package.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/package.scala
@@ -60,19 +60,27 @@ package object drm {
val keys = data.map(t => t._1).toArray[K]
val vectors = data.map(t => t._2).toArray
- val block = if (vectors(0).isDense) {
- val block = new DenseMatrix(vectors.length, blockncol)
- var row = 0
- while (row < vectors.length) {
- block(row, ::) := vectors(row)
- row += 1
- }
+ // create the block by default as dense.
+ // would probably be better to sample a subset of these
+ // vectors first before creating the entire matrix.
+ // so that we don't have the overhead of creating a full second matrix in
+ // the case that the matrix is not dense.
+ val block = new DenseMatrix(vectors.length, blockncol)
+ var row = 0
+ while (row < vectors.length) {
+ block(row, ::) := vectors(row)
+ row += 1
+ }
+
+ // Test the density of the data. If the matrix does not meet the
+ // requirements for density, convert the Vectors to a sparse Matrix.
+ val resBlock = if (sparsityAnalysis(block)) {
block
} else {
new SparseRowMatrix(vectors.length, blockncol, vectors, true, false)
}
- Iterator(keys -> block)
+ Iterator(keys -> resBlock)
}
})
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/d9940489/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/DrmLikeSuite.scala
----------------------------------------------------------------------
diff --git a/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/DrmLikeSuite.scala b/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/DrmLikeSuite.scala
index a5cb7f8..8f9b00f 100644
--- a/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/DrmLikeSuite.scala
+++ b/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/DrmLikeSuite.scala
@@ -49,7 +49,7 @@ class DrmLikeSuite extends FunSuite with DistributedSparkSuite with DrmLikeSuite
}).norm should be < 1e-4
}
- test("DRM blockify sparse -> SRM") {
+ test("DRM blockify sparse -> DRM") {
val inCoreA = sparse(
(1, 2, 3),
@@ -59,7 +59,7 @@ class DrmLikeSuite extends FunSuite with DistributedSparkSuite with DrmLikeSuite
(inCoreA - drmA.mapBlock() {
case (keys, block) =>
- if (!block.isInstanceOf[SparseRowMatrix])
+ if (block.isInstanceOf[SparseRowMatrix])
throw new AssertionError("Block must be dense.")
keys -> block
}).norm should be < 1e-4