You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ju...@apache.org on 2022/10/18 00:13:35 UTC

[lucene] branch branch_9x updated: Fix failure to load larger data sets in KnnGraphTest (#11849)

This is an automated email from the ASF dual-hosted git repository.

julietibs pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/branch_9x by this push:
     new ff6a2e5da66 Fix failure to load larger data sets in KnnGraphTest (#11849)
ff6a2e5da66 is described below

commit ff6a2e5da66ec6bd24b07c264c430fb064291565
Author: Benjamin Trent <be...@gmail.com>
AuthorDate: Mon Oct 17 19:39:58 2022 -0400

    Fix failure to load larger data sets in KnnGraphTest (#11849)
    
    When running the `reindex` task with KnnGraphTester, exceptionally large
    datasets can be used. The data was read through a single mmap'd buffer sized
    for the whole file (vector count * dimension * bytes per value). That size
    is limited to Integer.MAX_VALUE, which is inadequate for larger datasets.
    
    So, this commit changes the reader to read only a single vector at a time,
    into a buffer sized for one vector, instead of mapping the whole file.
---
 .../apache/lucene/util/hnsw/KnnGraphTester.java    | 58 +++++++++++-----------
 1 file changed, 28 insertions(+), 30 deletions(-)

diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/KnnGraphTester.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/KnnGraphTester.java
index c79ca6f3e4a..e4cc6cd4606 100644
--- a/lucene/core/src/test/org/apache/lucene/util/hnsw/KnnGraphTester.java
+++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/KnnGraphTester.java
@@ -25,7 +25,6 @@ import java.lang.management.ManagementFactory;
 import java.lang.management.ThreadMXBean;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
-import java.nio.FloatBuffer;
 import java.nio.IntBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.file.Files;
@@ -361,7 +360,7 @@ public class KnnGraphTester {
     TopDocs[] results = new TopDocs[numIters];
     long elapsed, totalCpuTime, totalVisited = 0;
     try (FileChannel input = FileChannel.open(queryPath)) {
-      VectorReader targetReader = VectorReader.create(input, dim, vectorEncoding, numIters);
+      VectorReader targetReader = VectorReader.create(input, dim, vectorEncoding);
       if (quiet == false) {
         System.out.println("running " + numIters + " targets; topK=" + topK + ", fanout=" + fanout);
       }
@@ -469,10 +468,10 @@ public class KnnGraphTester {
   private abstract static class VectorReader {
     final float[] target;
     final ByteBuffer bytes;
+    final FileChannel input;
 
-    static VectorReader create(FileChannel input, int dim, VectorEncoding vectorEncoding, int n)
-        throws IOException {
-      int bufferSize = n * dim * vectorEncoding.byteSize;
+    static VectorReader create(FileChannel input, int dim, VectorEncoding vectorEncoding) {
+      int bufferSize = dim * vectorEncoding.byteSize;
       switch (vectorEncoding) {
         case BYTE:
           return new VectorReaderByte(input, dim, bufferSize);
@@ -482,52 +481,50 @@ public class KnnGraphTester {
       }
     }
 
-    VectorReader(FileChannel input, int dim, int bufferSize) throws IOException {
-      bytes =
-          input.map(FileChannel.MapMode.READ_ONLY, 0, bufferSize).order(ByteOrder.LITTLE_ENDIAN);
+    VectorReader(FileChannel input, int dim, int bufferSize) {
+      this.bytes = ByteBuffer.wrap(new byte[bufferSize]).order(ByteOrder.LITTLE_ENDIAN);
+      this.input = input;
       target = new float[dim];
     }
 
-    void reset() {
+    void reset() throws IOException {
+      input.position(0);
+    }
+
+    protected final void readNext() throws IOException {
+      this.input.read(bytes);
       bytes.position(0);
     }
 
-    abstract float[] next();
+    abstract float[] next() throws IOException;
   }
 
   private static class VectorReaderFloat32 extends VectorReader {
-    private final FloatBuffer floats;
-
-    VectorReaderFloat32(FileChannel input, int dim, int bufferSize) throws IOException {
+    VectorReaderFloat32(FileChannel input, int dim, int bufferSize) {
       super(input, dim, bufferSize);
-      floats = bytes.asFloatBuffer();
-    }
-
-    @Override
-    void reset() {
-      super.reset();
-      floats.position(0);
     }
 
     @Override
-    float[] next() {
-      floats.get(target);
+    float[] next() throws IOException {
+      readNext();
+      bytes.asFloatBuffer().get(target);
       return target;
     }
   }
 
   private static class VectorReaderByte extends VectorReader {
-    private byte[] scratch;
-    private BytesRef bytesRef;
+    private final byte[] scratch;
+    private final BytesRef bytesRef;
 
-    VectorReaderByte(FileChannel input, int dim, int bufferSize) throws IOException {
+    VectorReaderByte(FileChannel input, int dim, int bufferSize) {
       super(input, dim, bufferSize);
       scratch = new byte[dim];
       bytesRef = new BytesRef(scratch);
     }
 
     @Override
-    float[] next() {
+    float[] next() throws IOException {
+      readNext();
       bytes.get(scratch);
       for (int i = 0; i < scratch.length; i++) {
         target[i] = scratch[i];
@@ -535,7 +532,8 @@ public class KnnGraphTester {
       return target;
     }
 
-    BytesRef nextBytes() {
+    BytesRef nextBytes() throws IOException {
+      readNext();
       bytes.get(scratch);
       return bytesRef;
     }
@@ -663,8 +661,8 @@ public class KnnGraphTester {
     }
     try (FileChannel in = FileChannel.open(docPath);
         FileChannel qIn = FileChannel.open(queryPath)) {
-      VectorReader docReader = VectorReader.create(in, dim, encoding, numDocs);
-      VectorReader queryReader = VectorReader.create(qIn, dim, encoding, numIters);
+      VectorReader docReader = VectorReader.create(in, dim, encoding);
+      VectorReader queryReader = VectorReader.create(qIn, dim, encoding);
       for (int i = 0; i < numIters; i++) {
         float[] query = queryReader.next();
         NeighborQueue queue = new NeighborQueue(topK, false);
@@ -714,7 +712,7 @@ public class KnnGraphTester {
     try (FSDirectory dir = FSDirectory.open(indexPath);
         IndexWriter iw = new IndexWriter(dir, iwc)) {
       try (FileChannel in = FileChannel.open(docsPath)) {
-        VectorReader vectorReader = VectorReader.create(in, dim, vectorEncoding, numDocs);
+        VectorReader vectorReader = VectorReader.create(in, dim, vectorEncoding);
         for (int i = 0; i < numDocs; i++) {
           Document doc = new Document();
           switch (vectorEncoding) {