You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ju...@apache.org on 2022/10/18 00:13:35 UTC
[lucene] branch branch_9x updated: Fix failure to load larger data sets in KnnGraphTest (#11849)
This is an automated email from the ASF dual-hosted git repository.
julietibs pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new ff6a2e5da66 Fix failure to load larger data sets in KnnGraphTest (#11849)
ff6a2e5da66 is described below
commit ff6a2e5da66ec6bd24b07c264c430fb064291565
Author: Benjamin Trent <be...@gmail.com>
AuthorDate: Mon Oct 17 19:39:58 2022 -0400
Fix failure to load larger data sets in KnnGraphTest (#11849)
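When running the `reindex` task with KnnGraphTester, exceptionally large
datasets can be used. Previously the reader mmapped the entire vector file into
a single buffer, so it had to know the buffer size up front, computed as the
int product `n * dim * vectorEncoding.byteSize`. That size is limited to
Integer.MAX_VALUE bytes, which is inadequate for larger datasets.
So, this commit adjusts the reading to only read a single vector at a time:
the reader now keeps the FileChannel and fills a buffer of
`dim * vectorEncoding.byteSize` bytes on each call to `next()`.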
---
.../apache/lucene/util/hnsw/KnnGraphTester.java | 58 +++++++++++-----------
1 file changed, 28 insertions(+), 30 deletions(-)
diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/KnnGraphTester.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/KnnGraphTester.java
index c79ca6f3e4a..e4cc6cd4606 100644
--- a/lucene/core/src/test/org/apache/lucene/util/hnsw/KnnGraphTester.java
+++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/KnnGraphTester.java
@@ -25,7 +25,6 @@ import java.lang.management.ManagementFactory;
import java.lang.management.ThreadMXBean;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
-import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
@@ -361,7 +360,7 @@ public class KnnGraphTester {
TopDocs[] results = new TopDocs[numIters];
long elapsed, totalCpuTime, totalVisited = 0;
try (FileChannel input = FileChannel.open(queryPath)) {
- VectorReader targetReader = VectorReader.create(input, dim, vectorEncoding, numIters);
+ VectorReader targetReader = VectorReader.create(input, dim, vectorEncoding);
if (quiet == false) {
System.out.println("running " + numIters + " targets; topK=" + topK + ", fanout=" + fanout);
}
@@ -469,10 +468,10 @@ public class KnnGraphTester {
private abstract static class VectorReader {
final float[] target;
final ByteBuffer bytes;
+ final FileChannel input;
- static VectorReader create(FileChannel input, int dim, VectorEncoding vectorEncoding, int n)
- throws IOException {
- int bufferSize = n * dim * vectorEncoding.byteSize;
+ static VectorReader create(FileChannel input, int dim, VectorEncoding vectorEncoding) {
+ int bufferSize = dim * vectorEncoding.byteSize;
switch (vectorEncoding) {
case BYTE:
return new VectorReaderByte(input, dim, bufferSize);
@@ -482,52 +481,50 @@ public class KnnGraphTester {
}
}
- VectorReader(FileChannel input, int dim, int bufferSize) throws IOException {
- bytes =
- input.map(FileChannel.MapMode.READ_ONLY, 0, bufferSize).order(ByteOrder.LITTLE_ENDIAN);
+ VectorReader(FileChannel input, int dim, int bufferSize) {
+ this.bytes = ByteBuffer.wrap(new byte[bufferSize]).order(ByteOrder.LITTLE_ENDIAN);
+ this.input = input;
target = new float[dim];
}
- void reset() {
+ void reset() throws IOException {
+ input.position(0);
+ }
+
+ protected final void readNext() throws IOException {
+ this.input.read(bytes);
bytes.position(0);
}
- abstract float[] next();
+ abstract float[] next() throws IOException;
}
private static class VectorReaderFloat32 extends VectorReader {
- private final FloatBuffer floats;
-
- VectorReaderFloat32(FileChannel input, int dim, int bufferSize) throws IOException {
+ VectorReaderFloat32(FileChannel input, int dim, int bufferSize) {
super(input, dim, bufferSize);
- floats = bytes.asFloatBuffer();
- }
-
- @Override
- void reset() {
- super.reset();
- floats.position(0);
}
@Override
- float[] next() {
- floats.get(target);
+ float[] next() throws IOException {
+ readNext();
+ bytes.asFloatBuffer().get(target);
return target;
}
}
private static class VectorReaderByte extends VectorReader {
- private byte[] scratch;
- private BytesRef bytesRef;
+ private final byte[] scratch;
+ private final BytesRef bytesRef;
- VectorReaderByte(FileChannel input, int dim, int bufferSize) throws IOException {
+ VectorReaderByte(FileChannel input, int dim, int bufferSize) {
super(input, dim, bufferSize);
scratch = new byte[dim];
bytesRef = new BytesRef(scratch);
}
@Override
- float[] next() {
+ float[] next() throws IOException {
+ readNext();
bytes.get(scratch);
for (int i = 0; i < scratch.length; i++) {
target[i] = scratch[i];
@@ -535,7 +532,8 @@ public class KnnGraphTester {
return target;
}
- BytesRef nextBytes() {
+ BytesRef nextBytes() throws IOException {
+ readNext();
bytes.get(scratch);
return bytesRef;
}
@@ -663,8 +661,8 @@ public class KnnGraphTester {
}
try (FileChannel in = FileChannel.open(docPath);
FileChannel qIn = FileChannel.open(queryPath)) {
- VectorReader docReader = VectorReader.create(in, dim, encoding, numDocs);
- VectorReader queryReader = VectorReader.create(qIn, dim, encoding, numIters);
+ VectorReader docReader = VectorReader.create(in, dim, encoding);
+ VectorReader queryReader = VectorReader.create(qIn, dim, encoding);
for (int i = 0; i < numIters; i++) {
float[] query = queryReader.next();
NeighborQueue queue = new NeighborQueue(topK, false);
@@ -714,7 +712,7 @@ public class KnnGraphTester {
try (FSDirectory dir = FSDirectory.open(indexPath);
IndexWriter iw = new IndexWriter(dir, iwc)) {
try (FileChannel in = FileChannel.open(docsPath)) {
- VectorReader vectorReader = VectorReader.create(in, dim, vectorEncoding, numDocs);
+ VectorReader vectorReader = VectorReader.create(in, dim, vectorEncoding);
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
switch (vectorEncoding) {