You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by me...@apache.org on 2015/07/20 17:55:45 UTC

spark git commit: [SPARK-7422] [MLLIB] Add argmax to Vector, SparseVector

Repository: spark
Updated Branches:
  refs/heads/master 79ec07290 -> 3f7de7db4


[SPARK-7422] [MLLIB] Add argmax to Vector, SparseVector

Modify Vector, DenseVector, and SparseVector to implement argmax functionality. This work sets the stage for the changes planned in SPARK-7423.

Author: George Dittmar <ge...@gmail.com>
Author: George <di...@Georges-MacBook-Pro.local>
Author: dittmarg <ge...@webtrends.com>
Author: Xiangrui Meng <me...@databricks.com>

Closes #6112 from GeorgeDittmar/SPARK-7422 and squashes the following commits:

3e0a939 [George Dittmar] Merge pull request #1 from mengxr/SPARK-7422
127dec5 [Xiangrui Meng] update argmax impl
2ea6a55 [George Dittmar] Added MimaExcludes for Vectors.argmax
98058f4 [George Dittmar] Merge branch 'master' of github.com:apache/spark into SPARK-7422
5fd9380 [George Dittmar] fixing style check error
42341fb [George Dittmar] refactoring arg max check to better handle zero values
b22af46 [George Dittmar] Fixing spaces between commas in unit test
f2eba2f [George Dittmar] Cleaning up unit tests to be fewer lines
aa330e3 [George Dittmar] Fixing some last if else spacing issues
ac53c55 [George Dittmar] changing dense vector argmax unit test to be one line call vs 2
d5b5423 [George Dittmar] Fixing code style and updating if logic on when to check for zero values
ee1a85a [George Dittmar] Cleaning up unit tests a bit and modifying a few cases
3ee8711 [George Dittmar] Fixing corner case issue with zeros in the active values of the sparse vector. Updated unit tests
b1f059f [George Dittmar] Added comment before we start arg max calculation. Updated unit tests to cover corner cases
f21dcce [George Dittmar] commit
af17981 [dittmarg] Initial work fixing bug that was made clear in pr
eeda560 [George] Fixing SparseVector argmax function to ignore zero values while doing the calculation.
4526acc [George] Merge branch 'master' of github.com:apache/spark into SPARK-7422
df9538a [George] Added argmax to sparse vector and added unit test
3cffed4 [George] Adding unit tests for argmax functions for Dense and Sparse vectors
04677af [George] initial work on adding argmax to Vector and SparseVector


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3f7de7db
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3f7de7db
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3f7de7db

Branch: refs/heads/master
Commit: 3f7de7db4cf7c5e2824cb91087c5e9d4beb0f738
Parents: 79ec072
Author: George Dittmar <ge...@gmail.com>
Authored: Mon Jul 20 08:55:37 2015 -0700
Committer: Xiangrui Meng <me...@databricks.com>
Committed: Mon Jul 20 08:55:37 2015 -0700

----------------------------------------------------------------------
 .../org/apache/spark/mllib/linalg/Vectors.scala | 57 ++++++++++++++++++--
 .../spark/mllib/linalg/VectorsSuite.scala       | 39 ++++++++++++++
 project/MimaExcludes.scala                      |  4 ++
 3 files changed, 95 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/3f7de7db/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index e048b01..9067b3b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -150,6 +150,12 @@ sealed trait Vector extends Serializable {
       toDense
     }
   }
+
+  /**
+   * Find the index of a maximal element.  Returns the index of the first maximal element in case
+   * of a tie.  Returns -1 if vector has length 0.
+   */
+  def argmax: Int
 }
 
 /**
@@ -588,11 +594,7 @@ class DenseVector(val values: Array[Double]) extends Vector {
     new SparseVector(size, ii, vv)
   }
 
-  /**
-   * Find the index of a maximal element.  Returns the first maximal element in case of a tie.
-   * Returns -1 if vector has length 0.
-   */
-  private[spark] def argmax: Int = {
+  override def argmax: Int = {
     if (size == 0) {
       -1
     } else {
@@ -717,6 +719,51 @@ class SparseVector(
       new SparseVector(size, ii, vv)
     }
   }
+
+  /**
+   * Index of the first maximal element, treating inactive entries as implicit 0.0.
+   * Returns -1 if the vector has length 0.
+   * Returns 0 if the vector has positive length but no active entries.
+   */
+  override def argmax: Int = {
+    if (size == 0) {
+      -1
+    } else if (numActives == 0) {
+      // size > 0 but nothing is stored: all entries are implicit zeros, so the first one wins.
+      0
+    } else {
+      // Find the max active entry; the strict `>` keeps the earliest one on ties.
+      var maxIdx = indices(0)
+      var maxValue = values(0)
+      var maxJ = 0
+      var j = 1
+      val na = numActives
+      while (j < na) {
+        val v = values(j)
+        if (v > maxValue) {
+          maxValue = v
+          maxIdx = indices(j)
+          maxJ = j
+        }
+        j += 1
+      }
+
+      // An inactive entry is an implicit 0.0. It beats a negative max wherever it sits, and it
+      // wins the tie against a zero max only if it precedes maxIdx (i.e. maxJ < maxIdx).
+      if (maxValue <= 0.0 && na < size && (maxValue < 0.0 || maxJ < maxIdx)) {
+        // First position k with indices(k) != k, i.e. the first inactive index (assumes `indices`
+        // is sorted ascending). For a zero max only the actives before it need to be examined.
+        val limit = if (maxValue < 0.0) na else maxJ
+        var k = 0
+        while (k < limit && indices(k) == k) {
+          k += 1
+        }
+        maxIdx = k
+      }
+
+      maxIdx
+    }
+  }
 }
 
 object SparseVector {

http://git-wip-us.apache.org/repos/asf/spark/blob/3f7de7db/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
index 178d95a..03be411 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
@@ -62,11 +62,50 @@ class VectorsSuite extends SparkFunSuite with Logging {
     assert(vec.toArray.eq(arr))
   }
 
+  test("dense argmax") {
+    val vec = Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector]
+    assert(vec.argmax === -1)  // an empty vector has no maximal element
+
+    val vec2 = Vectors.dense(arr).asInstanceOf[DenseVector]
+    assert(vec2.argmax === 3)
+
+    val vec3 = Vectors.dense(Array(-1.0, 0.0, -2.0, 1.0)).asInstanceOf[DenseVector]
+    assert(vec3.argmax === 3)  // 1.0 at index 3 exceeds both the zero and the negatives
+  }
+
   test("sparse to array") {
     val vec = Vectors.sparse(n, indices, values).asInstanceOf[SparseVector]
     assert(vec.toArray === arr)
   }
 
+  test("sparse argmax") {
+    val vec = Vectors.sparse(0, Array.empty[Int], Array.empty[Double]).asInstanceOf[SparseVector]
+    assert(vec.argmax === -1)  // an empty vector has no maximal element
+
+    val vec2 = Vectors.sparse(n, indices, values).asInstanceOf[SparseVector]
+    assert(vec2.argmax === 3)
+
+    val vec3 = Vectors.sparse(5, Array(2, 3, 4), Array(1.0, 0.0, -.7))
+    assert(vec3.argmax === 2)  // positive max: zeros, stored or implicit, cannot win
+
+    // Only negative values are stored, so the first implicit zero is the max:
+    // {0.0, 0.0, -1.0, -0.7, 0.0}
+    val vec4 = Vectors.sparse(5, Array(2, 3), Array(-1.0, -.7))
+    assert(vec4.argmax === 0)
+
+    val vec5 = Vectors.sparse(11, Array(0, 3, 10), Array(-1.0, -.7, 0.0))
+    assert(vec5.argmax === 1)  // implicit zero at index 1 precedes the stored zero at index 10
+
+    val vec6 = Vectors.sparse(11, Array(0, 1, 2), Array(-1.0, -.7, 0.0))
+    assert(vec6.argmax === 2)  // stored zero at index 2 precedes every implicit zero
+
+    val vec7 = Vectors.sparse(5, Array(0, 1, 3), Array(-1.0, 0.0, -.7))
+    assert(vec7.argmax === 1)  // stored zero at index 1 precedes the first implicit zero (2)
+
+    val vec8 = Vectors.sparse(5, Array(1, 2), Array(0.0, -1.0))
+    assert(vec8.argmax === 0)  // implicit zero at index 0 precedes the stored zero at index 1
+  }
+
   test("vector equals") {
     val dv1 = Vectors.dense(arr.clone())
     val dv2 = Vectors.dense(arr.clone())

http://git-wip-us.apache.org/repos/asf/spark/blob/3f7de7db/project/MimaExcludes.scala
----------------------------------------------------------------------
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 36417f5..dd85254 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -98,6 +98,10 @@ object MimaExcludes {
               "org.apache.spark.api.r.StringRRDD.this"),
             ProblemFilters.exclude[MissingMethodProblem](
               "org.apache.spark.api.r.BaseRRDD.this")
+          ) ++ Seq(
+            // SPARK-7422 add argmax for sparse vectors
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.mllib.linalg.Vector.argmax")
           )
 
         case v if v.startsWith("1.4") =>


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org