You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2021/03/15 03:16:14 UTC
[lucene] branch main updated: LUCENE-9837: try to improve performance of VectorUtil.dotProduct (#17)

This is an automated email from the ASF dual-hosted git repository.

rmuir pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new d48193e  LUCENE-9837: try to improve performance of VectorUtil.dotProduct (#17)
d48193e is described below

commit d48193e8cff854aea6f5e00c6b6bb7b176b97c72
Author: Robert Muir <rm...@apache.org>
AuthorDate: Sun Mar 14 23:16:08 2021 -0400

    LUCENE-9837: try to improve performance of VectorUtil.dotProduct (#17)
    
    More loop unrolling for VectorUtil.dotProduct to eke out a bit more short-term performance.
---
 .../java/org/apache/lucene/util/VectorUtil.java    | 64 ++++++++++++++++------
 1 file changed, 47 insertions(+), 17 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java b/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java
index cc9cd15..546d13d 100644
--- a/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java
@@ -42,25 +42,55 @@ public final class VectorUtil {
     if (a.length < 8) {
       return res;
     }
-    float s0 = 0f;
-    float s1 = 0f;
-    float s2 = 0f;
-    float s3 = 0f;
-    float s4 = 0f;
-    float s5 = 0f;
-    float s6 = 0f;
-    float s7 = 0f;
+    for (; i + 31 < a.length; i += 32) {
+      res +=
+          b[i + 0] * a[i + 0]
+              + b[i + 1] * a[i + 1]
+              + b[i + 2] * a[i + 2]
+              + b[i + 3] * a[i + 3]
+              + b[i + 4] * a[i + 4]
+              + b[i + 5] * a[i + 5]
+              + b[i + 6] * a[i + 6]
+              + b[i + 7] * a[i + 7];
+      res +=
+          b[i + 8] * a[i + 8]
+              + b[i + 9] * a[i + 9]
+              + b[i + 10] * a[i + 10]
+              + b[i + 11] * a[i + 11]
+              + b[i + 12] * a[i + 12]
+              + b[i + 13] * a[i + 13]
+              + b[i + 14] * a[i + 14]
+              + b[i + 15] * a[i + 15];
+      res +=
+          b[i + 16] * a[i + 16]
+              + b[i + 17] * a[i + 17]
+              + b[i + 18] * a[i + 18]
+              + b[i + 19] * a[i + 19]
+              + b[i + 20] * a[i + 20]
+              + b[i + 21] * a[i + 21]
+              + b[i + 22] * a[i + 22]
+              + b[i + 23] * a[i + 23];
+      res +=
+          b[i + 24] * a[i + 24]
+              + b[i + 25] * a[i + 25]
+              + b[i + 26] * a[i + 26]
+              + b[i + 27] * a[i + 27]
+              + b[i + 28] * a[i + 28]
+              + b[i + 29] * a[i + 29]
+              + b[i + 30] * a[i + 30]
+              + b[i + 31] * a[i + 31];
+    }
     for (; i + 7 < a.length; i += 8) {
-      s0 += b[i] * a[i];
-      s1 += b[i + 1] * a[i + 1];
-      s2 += b[i + 2] * a[i + 2];
-      s3 += b[i + 3] * a[i + 3];
-      s4 += b[i + 4] * a[i + 4];
-      s5 += b[i + 5] * a[i + 5];
-      s6 += b[i + 6] * a[i + 6];
-      s7 += b[i + 7] * a[i + 7];
+      res +=
+          b[i + 0] * a[i + 0]
+              + b[i + 1] * a[i + 1]
+              + b[i + 2] * a[i + 2]
+              + b[i + 3] * a[i + 3]
+              + b[i + 4] * a[i + 4]
+              + b[i + 5] * a[i + 5]
+              + b[i + 6] * a[i + 6]
+              + b[i + 7] * a[i + 7];
     }
-    res += s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7;
     return res;
   }