You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by me...@apache.org on 2015/01/26 07:18:25 UTC

spark git commit: [SPARK-5384][mllib] Vectors.sqdist returns inconsistent results for sparse/dense vectors when the vectors have different lengths

Repository: spark
Updated Branches:
  refs/heads/master 8df943551 -> 81251682e


[SPARK-5384][mllib] Vectors.sqdist returns inconsistent results for sparse/dense vectors when the vectors have different lengths

JIRA issue: https://issues.apache.org/jira/browse/SPARK-5384
Currently, `Vectors.sqdist` returns inconsistent results for sparse/dense vectors when the vectors have different lengths; please refer to the JIRA issue for a sample.

PR scope:
Unify the sqdist logic for dense/sparse vectors and fix the inconsistency; also remove the possible sparse-to-dense conversion in the original code.

For reviewers:
Maybe we should first discuss what the correct behavior is.
1. Must the vectors passed to sqdist have the same length, as in Breeze?
2. If they can have different lengths, what is the correct result for sqdist? (Should the extra part be included in the calculation?)

I'll update the PR with more optimizations and additional unit tests afterwards. Thanks.

Author: Yuhao Yang <hh...@gmail.com>

Closes #4183 from hhbyyh/fixDouble and squashes the following commits:

1f17328 [Yuhao Yang] limit PR scope to size constraints only
54cbf97 [Yuhao Yang] fix Vectors.sqdist inconsistence


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/81251682
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/81251682
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/81251682

Branch: refs/heads/master
Commit: 81251682edfb6a0de47e9dd72b2f63ee298fca33
Parents: 8df9435
Author: Yuhao Yang <hh...@gmail.com>
Authored: Sun Jan 25 22:18:09 2015 -0800
Committer: Xiangrui Meng <me...@databricks.com>
Committed: Sun Jan 25 22:18:09 2015 -0800

----------------------------------------------------------------------
 .../scala/org/apache/spark/mllib/linalg/Vectors.scala    | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/81251682/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index 7ee0224..b3022ad 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -333,7 +333,7 @@ object Vectors {
       math.pow(sum, 1.0 / p)
     }
   }
- 
+
   /**
    * Returns the squared distance between two Vectors.
    * @param v1 first Vector.
@@ -341,8 +341,9 @@ object Vectors {
    * @return squared distance between two Vectors.
    */
   def sqdist(v1: Vector, v2: Vector): Double = {
+    require(v1.size == v2.size, "vector dimension mismatch")
     var squaredDistance = 0.0
-    (v1, v2) match { 
+    (v1, v2) match {
       case (v1: SparseVector, v2: SparseVector) =>
         val v1Values = v1.values
         val v1Indices = v1.indices
@@ -350,12 +351,12 @@ object Vectors {
         val v2Indices = v2.indices
         val nnzv1 = v1Indices.size
         val nnzv2 = v2Indices.size
-        
+
         var kv1 = 0
         var kv2 = 0
         while (kv1 < nnzv1 || kv2 < nnzv2) {
           var score = 0.0
- 
+
           if (kv2 >= nnzv2 || (kv1 < nnzv1 && v1Indices(kv1) < v2Indices(kv2))) {
             score = v1Values(kv1)
             kv1 += 1
@@ -397,7 +398,7 @@ object Vectors {
     val nnzv1 = indices.size
     val nnzv2 = v2.size
     var iv1 = if (nnzv1 > 0) indices(kv1) else -1
-   
+
     while (kv2 < nnzv2) {
       var score = 0.0
       if (kv2 != iv1) {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org