You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2022/01/17 16:54:07 UTC
[lucene] branch branch_9x updated: Small edits for KnnGraphTester (#575)
This is an automated email from the ASF dual-hosted git repository.
mayya pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new 90ce59e Small edits for KnnGraphTester (#575)
90ce59e is described below
commit 90ce59e70b7d957b12ec932a3e450e8ffc7a9b70
Author: Mayya Sharipova <ma...@elastic.co>
AuthorDate: Wed Jan 12 17:23:10 2022 -0500
Small edits for KnnGraphTester (#575)
1. Correct the remaining size for input files larger
than Integer.MAX_VALUE, as currently with every
iteration we try to map the next blockSize of bytes
even if fewer than blockSize bytes are left in the file.
2. Correct java.lang.ClassCastException when retrieving
KnnGraphValues for stats printing.
3. Add an option for euclidean metric
---
.../apache/lucene/util/hnsw/KnnGraphTester.java | 45 +++++++++++++---------
1 file changed, 26 insertions(+), 19 deletions(-)
diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/KnnGraphTester.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/KnnGraphTester.java
index fcdf0aa..339d8f8 100644
--- a/lucene/core/src/test/org/apache/lucene/util/hnsw/KnnGraphTester.java
+++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/KnnGraphTester.java
@@ -36,9 +36,11 @@ import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import org.apache.lucene.codecs.KnnVectorsFormat;
+import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.lucene90.Lucene90Codec;
import org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsReader;
+import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.KnnVectorField;
@@ -74,8 +76,6 @@ public class KnnGraphTester {
private static final String KNN_FIELD = "knn";
private static final String ID_FIELD = "id";
- private static final VectorSimilarityFunction SIMILARITY_FUNCTION =
- VectorSimilarityFunction.DOT_PRODUCT;
private int numDocs;
private int dim;
@@ -90,6 +90,7 @@ public class KnnGraphTester {
private int reindexTimeMsec;
private int beamWidth;
private int maxConn;
+ private VectorSimilarityFunction similarityFunction;
@SuppressForbidden(reason = "uses Random()")
private KnnGraphTester() {
@@ -100,6 +101,7 @@ public class KnnGraphTester {
topK = 100;
warmCount = 1000;
fanout = topK;
+ similarityFunction = VectorSimilarityFunction.DOT_PRODUCT;
}
public static void main(String... args) throws Exception {
@@ -183,6 +185,14 @@ public class KnnGraphTester {
case "-docs":
docVectorsPath = Paths.get(args[++iarg]);
break;
+ case "-metric":
+ String metric = args[++iarg];
+ if (metric.equals("euclidean")) {
+ similarityFunction = VectorSimilarityFunction.EUCLIDEAN;
+ } else if (metric.equals("angular") == false) {
+ throw new IllegalArgumentException("-metric can be 'angular' or 'euclidean' only");
+ }
+ break;
case "-forceMerge":
forceMerge = true;
break;
@@ -237,12 +247,13 @@ public class KnnGraphTester {
private void printFanoutHist(Path indexPath) throws IOException {
try (Directory dir = FSDirectory.open(indexPath);
DirectoryReader reader = DirectoryReader.open(dir)) {
- // int[] globalHist = new int[reader.maxDoc()];
for (LeafReaderContext context : reader.leaves()) {
LeafReader leafReader = context.reader();
+ KnnVectorsReader vectorsReader =
+ ((PerFieldKnnVectorsFormat.FieldsReader) ((CodecReader) leafReader).getVectorReader())
+ .getFieldReader(KNN_FIELD);
KnnGraphValues knnValues =
- ((Lucene90HnswVectorsReader) ((CodecReader) leafReader).getVectorReader())
- .getGraphValues(KNN_FIELD);
+ ((Lucene90HnswVectorsReader) vectorsReader).getGraphValues(KNN_FIELD);
System.out.printf("Leaf %d has %d documents\n", context.ord, leafReader.maxDoc());
printGraphFanout(knnValues, leafReader.maxDoc());
}
@@ -253,7 +264,7 @@ public class KnnGraphTester {
try (BinaryFileVectors vectors = new BinaryFileVectors(docsPath)) {
RandomAccessVectorValues values = vectors.randomAccess();
HnswGraphBuilder builder =
- new HnswGraphBuilder(vectors, SIMILARITY_FUNCTION, maxConn, beamWidth, 0);
+ new HnswGraphBuilder(vectors, similarityFunction, maxConn, beamWidth, 0);
// start at node 1
for (int i = 1; i < numDocs; i++) {
builder.addGraphNode(values.vectorValue(i));
@@ -533,25 +544,21 @@ public class KnnGraphTester {
for (int i = 0; i < numIters; i++) {
queries.get(query);
long totalBytes = (long) numDocs * dim * Float.BYTES;
- int
- blockSize =
- (int)
- Math.min(
- totalBytes,
- (Integer.MAX_VALUE / (dim * Float.BYTES)) * (dim * Float.BYTES)),
- offset = 0;
+ final int maxBlockSize = (Integer.MAX_VALUE / (dim * Float.BYTES)) * (dim * Float.BYTES);
+ int offset = 0;
int j = 0;
// System.out.println("totalBytes=" + totalBytes);
while (j < numDocs) {
+ int blockSize = (int) Math.min(totalBytes - offset, maxBlockSize);
FloatBuffer vectors =
in.map(FileChannel.MapMode.READ_ONLY, offset, blockSize)
.order(ByteOrder.LITTLE_ENDIAN)
.asFloatBuffer();
offset += blockSize;
- NeighborQueue queue = new NeighborQueue(topK, SIMILARITY_FUNCTION.reversed);
+ NeighborQueue queue = new NeighborQueue(topK, similarityFunction.reversed);
for (; j < numDocs && vectors.hasRemaining(); j++) {
vectors.get(vector);
- float d = SIMILARITY_FUNCTION.compare(query, vector);
+ float d = similarityFunction.compare(query, vector);
queue.insertWithOverflow(j, d);
}
result[i] = new int[topK];
@@ -583,22 +590,22 @@ public class KnnGraphTester {
iwc.setRAMBufferSizeMB(1994d);
// iwc.setMaxBufferedDocs(10000);
- FieldType fieldType = KnnVectorField.createFieldType(dim, VectorSimilarityFunction.DOT_PRODUCT);
+ FieldType fieldType = KnnVectorField.createFieldType(dim, similarityFunction);
if (quiet == false) {
iwc.setInfoStream(new PrintStreamInfoStream(System.out));
System.out.println("creating index in " + indexPath);
}
long start = System.nanoTime();
long totalBytes = (long) numDocs * dim * Float.BYTES, offset = 0;
+ final int maxBlockSize = (Integer.MAX_VALUE / (dim * Float.BYTES)) * (dim * Float.BYTES);
+
try (FSDirectory dir = FSDirectory.open(indexPath);
IndexWriter iw = new IndexWriter(dir, iwc)) {
- int blockSize =
- (int)
- Math.min(totalBytes, (Integer.MAX_VALUE / (dim * Float.BYTES)) * (dim * Float.BYTES));
float[] vector = new float[dim];
try (FileChannel in = FileChannel.open(docsPath)) {
int i = 0;
while (i < numDocs) {
+ int blockSize = (int) Math.min(totalBytes - offset, maxBlockSize);
FloatBuffer vectors =
in.map(FileChannel.MapMode.READ_ONLY, offset, blockSize)
.order(ByteOrder.LITTLE_ENDIAN)