Posted to commits@spark.apache.org by me...@apache.org on 2015/04/29 06:50:00 UTC

spark git commit: [SPARK-6756] [MLLIB] add toSparse, toDense, numActives, numNonzeros, and compressed to Vector

Repository: spark
Updated Branches:
  refs/heads/master a8aeadb7d -> 5ef006fc4


[SPARK-6756] [MLLIB] add toSparse, toDense, numActives, numNonzeros, and compressed to Vector

Add `compressed` to `Vector` with some other methods: `numActives`, `numNonzeros`, `toSparse`, and `toDense`. jkbradley
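
A rough sketch of how the new API reads from user code (illustrative only, not part of the patch; the results noted in the comments follow the implementations in the diff below):

    import org.apache.spark.mllib.linalg.Vectors

    val dv = Vectors.dense(0.0, 2.0, 3.0, 0.0)
    dv.numActives          // 4: every stored entry counts, including explicit zeros
    dv.numNonzeros         // 2
    val sv = dv.toSparse   // sparse vector with indices (1, 2) and values (2.0, 3.0)
    sv.toDense             // back to a dense vector equal to dv
    dv.compressed                                  // stays dense: 1.5 * (2 + 1) >= 4
    Vectors.dense(0.0, 2.0, 0.0, 0.0).compressed   // becomes sparse: 1.5 * (1 + 1) < 4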

Author: Xiangrui Meng <me...@databricks.com>

Closes #5756 from mengxr/SPARK-6756 and squashes the following commits:

8d4ecbd [Xiangrui Meng] address comment and add mima excludes
da54179 [Xiangrui Meng] add toSparse, toDense, numActives, numNonzeros, and compressed to Vector


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5ef006fc
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5ef006fc
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5ef006fc

Branch: refs/heads/master
Commit: 5ef006fc4d010905e02cb905c9115b95ba55282b
Parents: a8aeadb
Author: Xiangrui Meng <me...@databricks.com>
Authored: Tue Apr 28 21:49:53 2015 -0700
Committer: Xiangrui Meng <me...@databricks.com>
Committed: Tue Apr 28 21:49:53 2015 -0700

----------------------------------------------------------------------
 .../org/apache/spark/mllib/linalg/Vectors.scala | 93 ++++++++++++++++++++
 .../spark/mllib/linalg/VectorsSuite.scala       | 44 +++++++++
 project/MimaExcludes.scala                      | 12 +++
 3 files changed, 149 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/5ef006fc/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index 34833e9..188d1e5 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -116,6 +116,40 @@ sealed trait Vector extends Serializable {
    *          with type `Double`.
    */
   private[spark] def foreachActive(f: (Int, Double) => Unit)
+
+  /**
+   * Number of active entries.  An "active entry" is an element which is explicitly stored,
+   * regardless of its value.  Note that inactive entries have value 0.
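+   * For example, `Vectors.sparse(3, Array(0, 2), Array(0.0, 4.0))` has two active entries,
+   * even though one of them is an explicit zero.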
+   */
+  def numActives: Int
+
+  /**
+   * Number of nonzero elements. This scans all active values and counts nonzeros.
+   */
+  def numNonzeros: Int
+
+  /**
+   * Converts this vector to a sparse vector with all explicit zeros removed.
+   */
+  def toSparse: SparseVector
+
+  /**
+   * Converts this vector to a dense vector.
+   */
+  def toDense: DenseVector = new DenseVector(this.toArray)
+
+  /**
+   * Returns a vector in either dense or sparse format, whichever uses less storage.
+   */
+  def compressed: Vector = {
+    val nnz = numNonzeros
+    // A dense vector needs 8 * size + 8 bytes, while a sparse vector needs 12 * nnz + 20 bytes.
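+    // Sparse is smaller iff 12 * nnz + 20 < 8 * size + 8, i.e., 1.5 * (nnz + 1.0) < size.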
+    if (1.5 * (nnz + 1.0) < size) {
+      toSparse
+    } else {
+      toDense
+    }
+  }
 }
 
 /**
@@ -525,6 +559,34 @@ class DenseVector(val values: Array[Double]) extends Vector {
     }
     result
   }
+
+  override def numActives: Int = size
+
+  override def numNonzeros: Int = {
+    // same as values.count(_ != 0.0) but faster
+    var nnz = 0
+    values.foreach { v =>
+      if (v != 0.0) {
+        nnz += 1
+      }
+    }
+    nnz
+  }
+
+  override def toSparse: SparseVector = {
+    val nnz = numNonzeros
+    val ii = new Array[Int](nnz)
+    val vv = new Array[Double](nnz)
+    var k = 0
+    foreachActive { (i, v) =>
+      if (v != 0.0) {
+        ii(k) = i
+        vv(k) = v
+        k += 1
+      }
+    }
+    new SparseVector(size, ii, vv)
+  }
 }
 
 object DenseVector {
@@ -602,6 +664,37 @@ class SparseVector(
     }
     result
   }
+
+  override def numActives: Int = values.length
+
+  override def numNonzeros: Int = {
+    var nnz = 0
+    values.foreach { v =>
+      if (v != 0.0) {
+        nnz += 1
+      }
+    }
+    nnz
+  }
+
+  override def toSparse: SparseVector = {
+    val nnz = numNonzeros
+    if (nnz == numActives) {
+      this
+    } else {
+      val ii = new Array[Int](nnz)
+      val vv = new Array[Double](nnz)
+      var k = 0
+      foreachActive { (i, v) =>
+        if (v != 0.0) {
+          ii(k) = i
+          vv(k) = v
+          k += 1
+        }
+      }
+      new SparseVector(size, ii, vv)
+    }
+  }
 }
 
 object SparseVector {

http://git-wip-us.apache.org/repos/asf/spark/blob/5ef006fc/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
index 2839c4c..24755e9 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
@@ -270,4 +270,48 @@ class VectorsSuite extends FunSuite {
     assert(Vectors.norm(sv, 3.7) ~== math.pow(sv.toArray.foldLeft(0.0)((a, v) =>
       a + math.pow(math.abs(v), 3.7)), 1.0 / 3.7) relTol 1E-8)
   }
+
+  test("Vector numActive and numNonzeros") {
+    val dv = Vectors.dense(0.0, 2.0, 3.0, 0.0)
+    assert(dv.numActives === 4)
+    assert(dv.numNonzeros === 2)
+
+    val sv = Vectors.sparse(4, Array(0, 1, 2), Array(0.0, 2.0, 3.0))
+    assert(sv.numActives === 3)
+    assert(sv.numNonzeros === 2)
+  }
+
+  test("Vector toSparse and toDense") {
+    val dv0 = Vectors.dense(0.0, 2.0, 3.0, 0.0)
+    assert(dv0.toDense === dv0)
+    val dv0s = dv0.toSparse
+    assert(dv0s.numActives === 2)
+    assert(dv0s === dv0)
+
+    val sv0 = Vectors.sparse(4, Array(0, 1, 2), Array(0.0, 2.0, 3.0))
+    assert(sv0.toDense === sv0)
+    val sv0s = sv0.toSparse
+    assert(sv0s.numActives === 2)
+    assert(sv0s === sv0)
+  }
+
+  test("Vector.compressed") {
+    val dv0 = Vectors.dense(1.0, 2.0, 3.0, 0.0)
+    val dv0c = dv0.compressed.asInstanceOf[DenseVector]
+    assert(dv0c === dv0)
+
+    val dv1 = Vectors.dense(0.0, 2.0, 0.0, 0.0)
+    val dv1c = dv1.compressed.asInstanceOf[SparseVector]
+    assert(dv1 === dv1c)
+    assert(dv1c.numActives === 1)
+
+    val sv0 = Vectors.sparse(4, Array(1, 2), Array(2.0, 0.0))
+    val sv0c = sv0.compressed.asInstanceOf[SparseVector]
+    assert(sv0 === sv0c)
+    assert(sv0c.numActives === 1)
+
+    val sv1 = Vectors.sparse(4, Array(0, 1, 2), Array(1.0, 2.0, 3.0))
+    val sv1c = sv1.compressed.asInstanceOf[DenseVector]
+    assert(sv1 === sv1c)
+  }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/5ef006fc/project/MimaExcludes.scala
----------------------------------------------------------------------
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 967961c..3beafa1 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -76,6 +76,18 @@ object MimaExcludes {
             // SPARK-7090 Introduce LDAOptimizer to LDA to further improve extensibility
             ProblemFilters.exclude[MissingClassProblem](
               "org.apache.spark.mllib.clustering.LDA$EMOptimizer")
+          ) ++ Seq(
+            // SPARK-6756 add toSparse, toDense, numActives, numNonzeros, and compressed to Vector
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.mllib.linalg.Vector.compressed"),
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.mllib.linalg.Vector.toDense"),
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.mllib.linalg.Vector.numNonzeros"),
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.mllib.linalg.Vector.toSparse"),
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.mllib.linalg.Vector.numActives")
           )
 
         case v if v.startsWith("1.3") =>

