You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sr...@apache.org on 2019/07/25 12:59:13 UTC
[spark] branch branch-2.4 updated: [SPARK-28421][ML]
SparseVector.apply performance optimization
This is an automated email from the ASF dual-hosted git repository.
srowen pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-2.4 by this push:
new a285c0d [SPARK-28421][ML] SparseVector.apply performance optimization
a285c0d is described below
commit a285c0d4fe071ceed05d8ea693febfbac5962d64
Author: zhengruifeng <ru...@foxmail.com>
AuthorDate: Tue Jul 23 20:20:22 2019 -0500
[SPARK-28421][ML] SparseVector.apply performance optimization
## What changes were proposed in this pull request?
optimize the `SparseVector.apply` by avoiding internal conversion
Since the speed-up is significant (2.5X ~ 5X), and this method is widely used in ML, I suggest backporting.
| size| nnz | apply(old) | apply2(new impl) | apply3(new impl with extra range check)|
|------|----------|------------|----------|----------|
|10000000|100|75294|12208|18682|
|10000000|10000|75616|23132|32932|
|10000000|1000000|92949|42529|48821|
## How was this patch tested?
existing tests
using the following code to test performance (here the new impl is named `apply2`, and another impl with an extra range check is named `apply3`):
```
// Micro-benchmark for SparseVector element access. It times the existing
// `apply` against two candidate implementations added locally for this
// experiment: `apply2` (the new impl) and `apply3` (the new impl with an
// extra range check). NOTE(review): apply2/apply3 are not part of the
// published Vectors API; this script only runs against a patched build.
import scala.util.Random
import org.apache.spark.ml.linalg._
val size = 10000000
for (nnz <- Seq(100, 10000, 1000000)) {
// Fixed seed so each nnz case runs on a reproducible sparsity pattern.
val rng = new Random(123)
// Draw 2*nnz random positions, dedupe, keep the first nnz, and sort
// ascending — presumably Vectors.sparse expects increasing indices.
val indices = Array.fill(nnz + nnz)(rng.nextInt.abs % size).distinct.take(nnz).sorted
val values = Array.fill(nnz)(rng.nextDouble)
val vec = Vectors.sparse(size, indices, values).toSparse
// Old apply: 100 full sweeps over all `size` positions, wall-clock ms.
// `sum` is never read afterwards — presumably it exists only to keep the
// JIT from eliminating the access as dead code.
val tic1 = System.currentTimeMillis;
(0 until 100).foreach{ round => var i = 0; var sum = 0.0; while(i < size) {sum+=vec(i); i+=1} };
val toc1 = System.currentTimeMillis;
// Candidate implementation without the extra range check.
val tic2 = System.currentTimeMillis;
(0 until 100).foreach{ round => var i = 0; var sum = 0.0; while(i < size) {sum+=vec.apply2(i); i+=1} };
val toc2 = System.currentTimeMillis;
// Candidate implementation with the extra range check.
val tic3 = System.currentTimeMillis;
(0 until 100).foreach{ round => var i = 0; var sum = 0.0; while(i < size) {sum+=vec.apply3(i); i+=1} };
val toc3 = System.currentTimeMillis;
// (vector size, nnz, old-apply ms, apply2 ms, apply3 ms)
println((size, nnz, toc1 - tic1, toc2 - tic2, toc3 - tic3))
}
```
Closes #25178 from zhengruifeng/sparse_vec_apply.
Authored-by: zhengruifeng <ru...@foxmail.com>
Signed-off-by: Sean Owen <se...@databricks.com>
---
.../src/main/scala/org/apache/spark/ml/linalg/Vectors.scala | 9 +++++++++
mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala | 9 +++++++++
2 files changed, 18 insertions(+)
diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala
index 5824e46..827b620 100644
--- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala
+++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala
@@ -603,6 +603,15 @@ class SparseVector @Since("2.0.0") (
private[spark] override def asBreeze: BV[Double] = new BSV[Double](indices, values, size)
+ override def apply(i: Int): Double = {
+ if (i < 0 || i >= size) {
+ throw new IndexOutOfBoundsException(s"Index $i out of bounds [0, $size)")
+ }
+
+ val j = util.Arrays.binarySearch(indices, i)
+ if (j < 0) 0.0 else values(j)
+ }
+
override def foreachActive(f: (Int, Double) => Unit): Unit = {
var i = 0
val localValuesSize = values.length
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index 6e68d96..fbc9358 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -785,6 +785,15 @@ class SparseVector @Since("1.0.0") (
private[spark] override def asBreeze: BV[Double] = new BSV[Double](indices, values, size)
+ override def apply(i: Int): Double = {
+ if (i < 0 || i >= size) {
+ throw new IndexOutOfBoundsException(s"Index $i out of bounds [0, $size)")
+ }
+
+ val j = util.Arrays.binarySearch(indices, i)
+ if (j < 0) 0.0 else values(j)
+ }
+
@Since("1.6.0")
override def foreachActive(f: (Int, Double) => Unit): Unit = {
var i = 0
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org