You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sr...@apache.org on 2019/07/25 12:59:13 UTC

[spark] branch branch-2.4 updated: [SPARK-28421][ML] SparseVector.apply performance optimization

This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-2.4 by this push:
     new a285c0d  [SPARK-28421][ML] SparseVector.apply performance optimization
a285c0d is described below

commit a285c0d4fe071ceed05d8ea693febfbac5962d64
Author: zhengruifeng <ru...@foxmail.com>
AuthorDate: Tue Jul 23 20:20:22 2019 -0500

    [SPARK-28421][ML] SparseVector.apply performance optimization
    
    ## What changes were proposed in this pull request?
    optimize the `SparseVector.apply` by avoiding internal conversion
    Since the speed up is significant (2.5X ~ 5X), and this method is widely used in ml, I suggest back porting.
    
    | size|  nnz | apply(old) | apply2(new impl) | apply3(new impl with extra range check)|
    |------|----------|------------|----------|----------|
    |10000000|100|75294|12208|18682|
    |10000000|10000|75616|23132|32932|
    |10000000|1000000|92949|42529|48821|
    
    ## How was this patch tested?
    existing tests
    
    using following code to test performance (here the new impl is named `apply2`, and another impl with extra range check is named `apply3`):
    ```
    import scala.util.Random
    import org.apache.spark.ml.linalg._
    
    val size = 10000000
    for (nnz <- Seq(100, 10000, 1000000)) {
    	val rng = new Random(123)
    	val indices = Array.fill(nnz + nnz)(rng.nextInt.abs % size).distinct.take(nnz).sorted
    	val values = Array.fill(nnz)(rng.nextDouble)
    	val vec = Vectors.sparse(size, indices, values).toSparse
    
    	val tic1 = System.currentTimeMillis;
    	(0 until 100).foreach{ round => var i = 0; var sum = 0.0; while(i < size) {sum+=vec(i); i+=1} };
    	val toc1 = System.currentTimeMillis;
    
    	val tic2 = System.currentTimeMillis;
    	(0 until 100).foreach{ round => var i = 0; var sum = 0.0; while(i < size) {sum+=vec.apply2(i); i+=1} };
    	val toc2 = System.currentTimeMillis;
    
    	val tic3 = System.currentTimeMillis;
    	(0 until 100).foreach{ round => var i = 0; var sum = 0.0; while(i < size) {sum+=vec.apply3(i); i+=1} };
    	val toc3 = System.currentTimeMillis;
    
    	println((size, nnz, toc1 - tic1, toc2 - tic2, toc3 - tic3))
    }
    ```
    
    Closes #25178 from zhengruifeng/sparse_vec_apply.
    
    Authored-by: zhengruifeng <ru...@foxmail.com>
    Signed-off-by: Sean Owen <se...@databricks.com>
---
 .../src/main/scala/org/apache/spark/ml/linalg/Vectors.scala      | 9 +++++++++
 mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala | 9 +++++++++
 2 files changed, 18 insertions(+)

diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala
index 5824e46..827b620 100644
--- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala
+++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala
@@ -603,6 +603,15 @@ class SparseVector @Since("2.0.0") (
 
   private[spark] override def asBreeze: BV[Double] = new BSV[Double](indices, values, size)
 
+  override def apply(i: Int): Double = {
+    if (i < 0 || i >= size) {
+      throw new IndexOutOfBoundsException(s"Index $i out of bounds [0, $size)")
+    }
+
+    val j = util.Arrays.binarySearch(indices, i)
+    if (j < 0) 0.0 else values(j)
+  }
+
   override def foreachActive(f: (Int, Double) => Unit): Unit = {
     var i = 0
     val localValuesSize = values.length
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index 6e68d96..fbc9358 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -785,6 +785,15 @@ class SparseVector @Since("1.0.0") (
 
   private[spark] override def asBreeze: BV[Double] = new BSV[Double](indices, values, size)
 
+  override def apply(i: Int): Double = {
+    if (i < 0 || i >= size) {
+      throw new IndexOutOfBoundsException(s"Index $i out of bounds [0, $size)")
+    }
+
+    val j = util.Arrays.binarySearch(indices, i)
+    if (j < 0) 0.0 else values(j)
+  }
+
   @Since("1.6.0")
   override def foreachActive(f: (Int, Double) => Unit): Unit = {
     var i = 0


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org