You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2022/11/10 14:55:44 UTC
[lucene] branch branch_9x updated: Fix integer overflow when seeking the vector index for connections (#11905)
This is an automated email from the ASF dual-hosted git repository.
rmuir pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new 08ecf7a1300 Fix integer overflow when seeking the vector index for connections (#11905)
08ecf7a1300 is described below
commit 08ecf7a1300698ae40fcefcb13e4c062a942ceeb
Author: Benjamin Trent <be...@gmail.com>
AuthorDate: Thu Nov 10 08:24:32 2022 -0500
Fix integer overflow when seeking the vector index for connections (#11905)
* Fix integer overflow when seeking the vector index for connections
* Adding monster test to cause overflow failure
---
lucene/CHANGES.txt | 8 +++
.../lucene91/Lucene91HnswVectorsReader.java | 8 ++-
.../lucene92/Lucene92HnswVectorsReader.java | 15 +++--
.../codecs/lucene94/Lucene94HnswVectorsReader.java | 15 +++--
.../codecs/lucene94/Lucene94HnswVectorsWriter.java | 5 +-
.../apache/lucene/document/TestManyKnnDocs.java | 67 ++++++++++++++++++++++
6 files changed, 104 insertions(+), 14 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index bca0ff80bda..4640b5e4bfa 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -90,6 +90,14 @@ Build
* GITHUB#11886: Upgrade to gradle 7.5.1 (Dawid Weiss)
+======================== Lucene 9.4.2 =======================
+
+Bug Fixes
+---------------------
+* GITHUB#11905: Fix integer overflow when seeking the vector index for connections in a single segment.
+ This addresses a bug that was introduced in 9.2.0 where having many vectors is not handled well
+ in the vector connections reader.
+
======================== Lucene 9.4.1 =======================
Bug Fixes
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java
index 2832327082f..3eaac9e5ecb 100644
--- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java
@@ -383,13 +383,17 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
// calculate for each level the start offsets in vectorIndex file from where to read
// neighbours
graphOffsetsByLevel = new long[numLevels];
+ final long connectionsAndSizeBytes =
+ Math.multiplyExact(Math.addExact(1L, maxConn), Integer.BYTES);
for (int level = 0; level < numLevels; level++) {
if (level == 0) {
graphOffsetsByLevel[level] = 0;
} else {
int numNodesOnPrevLevel = level == 1 ? size : nodesByLevel[level - 1].length;
graphOffsetsByLevel[level] =
- graphOffsetsByLevel[level - 1] + (1 + maxConn) * Integer.BYTES * numNodesOnPrevLevel;
+ Math.addExact(
+ graphOffsetsByLevel[level - 1],
+ Math.multiplyExact(connectionsAndSizeBytes, numNodesOnPrevLevel));
}
}
}
@@ -542,7 +546,7 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
this.entryNode = numLevels > 1 ? nodesByLevel[numLevels - 1][0] : 0;
this.size = entry.size();
this.graphOffsetsByLevel = entry.graphOffsetsByLevel;
- this.bytesForConns = ((long) entry.maxConn + 1) * Integer.BYTES;
+ this.bytesForConns = Math.multiplyExact(Math.addExact(entry.maxConn, 1L), Integer.BYTES);
}
@Override
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsReader.java
index 32709c47450..fd0ccf5e63c 100644
--- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsReader.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsReader.java
@@ -366,16 +366,20 @@ public final class Lucene92HnswVectorsReader extends KnnVectorsReader {
// calculate for each level the start offsets in vectorIndex file from where to read
// neighbours
graphOffsetsByLevel = new long[numLevels];
+ final long connectionsAndSizeLevel0Bytes =
+ Math.multiplyExact(Math.addExact(1, Math.multiplyExact(M, 2L)), Integer.BYTES);
+ final long connectionsAndSizeBytes = Math.multiplyExact(Math.addExact(1L, M), Integer.BYTES);
for (int level = 0; level < numLevels; level++) {
if (level == 0) {
graphOffsetsByLevel[level] = 0;
} else if (level == 1) {
- int numNodesOnLevel0 = size;
- graphOffsetsByLevel[level] = (1 + (M * 2)) * Integer.BYTES * numNodesOnLevel0;
+ graphOffsetsByLevel[level] = Math.multiplyExact(connectionsAndSizeLevel0Bytes, size);
} else {
int numNodesOnPrevLevel = nodesByLevel[level - 1].length;
graphOffsetsByLevel[level] =
- graphOffsetsByLevel[level - 1] + (1 + M) * Integer.BYTES * numNodesOnPrevLevel;
+ Math.addExact(
+ graphOffsetsByLevel[level - 1],
+ Math.multiplyExact(connectionsAndSizeBytes, numNodesOnPrevLevel));
}
}
}
@@ -408,8 +412,9 @@ public final class Lucene92HnswVectorsReader extends KnnVectorsReader {
this.entryNode = numLevels > 1 ? nodesByLevel[numLevels - 1][0] : 0;
this.size = entry.size();
this.graphOffsetsByLevel = entry.graphOffsetsByLevel;
- this.bytesForConns = ((long) entry.M + 1) * Integer.BYTES;
- this.bytesForConns0 = ((long) (entry.M * 2) + 1) * Integer.BYTES;
+ this.bytesForConns = Math.multiplyExact(Math.addExact(entry.M, 1L), Integer.BYTES);
+ this.bytesForConns0 =
+ Math.multiplyExact(Math.addExact(Math.multiplyExact(entry.M, 2L), 1), Integer.BYTES);
}
@Override
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsReader.java
index 035be10e41a..c5a3a39a328 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsReader.java
@@ -400,16 +400,20 @@ public final class Lucene94HnswVectorsReader extends KnnVectorsReader {
// calculate for each level the start offsets in vectorIndex file from where to read
// neighbours
graphOffsetsByLevel = new long[numLevels];
+ final long connectionsAndSizeLevel0Bytes =
+ Math.multiplyExact(Math.addExact(1, Math.multiplyExact(M, 2L)), Integer.BYTES);
+ final long connectionsAndSizeBytes = Math.multiplyExact(Math.addExact(1L, M), Integer.BYTES);
for (int level = 0; level < numLevels; level++) {
if (level == 0) {
graphOffsetsByLevel[level] = 0;
} else if (level == 1) {
- int numNodesOnLevel0 = size;
- graphOffsetsByLevel[level] = (1 + (M * 2)) * Integer.BYTES * numNodesOnLevel0;
+ graphOffsetsByLevel[level] = connectionsAndSizeLevel0Bytes * size;
} else {
int numNodesOnPrevLevel = nodesByLevel[level - 1].length;
graphOffsetsByLevel[level] =
- graphOffsetsByLevel[level - 1] + (1 + M) * Integer.BYTES * numNodesOnPrevLevel;
+ Math.addExact(
+ graphOffsetsByLevel[level - 1],
+ Math.multiplyExact(connectionsAndSizeBytes, numNodesOnPrevLevel));
}
}
}
@@ -442,8 +446,9 @@ public final class Lucene94HnswVectorsReader extends KnnVectorsReader {
this.entryNode = numLevels > 1 ? nodesByLevel[numLevels - 1][0] : 0;
this.size = entry.size();
this.graphOffsetsByLevel = entry.graphOffsetsByLevel;
- this.bytesForConns = ((long) entry.M + 1) * Integer.BYTES;
- this.bytesForConns0 = ((long) (entry.M * 2) + 1) * Integer.BYTES;
+ this.bytesForConns = Math.multiplyExact(Math.addExact(entry.M, 1L), Integer.BYTES);
+ this.bytesForConns0 =
+ Math.multiplyExact(Math.addExact(Math.multiplyExact(entry.M, 2L), 1), Integer.BYTES);
}
@Override
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsWriter.java
index cdd877cc104..9ae11111bfb 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsWriter.java
@@ -671,10 +671,11 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
@Override
public long ramBytesUsed() {
if (vectors.size() == 0) return 0;
+ long vectorSize = vectors.size();
return docsWithField.ramBytesUsed()
- + vectors.size()
+ + vectorSize
* (RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER)
- + vectors.size() * fieldInfo.getVectorDimension() * fieldInfo.getVectorEncoding().byteSize
+ + vectorSize * fieldInfo.getVectorDimension() * fieldInfo.getVectorEncoding().byteSize
+ hnswGraphBuilder.getGraph().ramBytesUsed();
}
}
diff --git a/lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java b/lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java
new file mode 100644
index 00000000000..bc3a249ffa0
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.document;
+
+import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
+import org.apache.lucene.index.*;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.KnnVectorQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase.Monster;
+import org.apache.lucene.tests.util.TestUtil;
+
+@TimeoutSuite(millis = 86_400_000) // 24 hour timeout
+@Monster("takes ~2 hours and needs extra heap, disk space, file handles")
+public class TestManyKnnDocs extends LuceneTestCase {
+ // gradlew -p lucene/core test --tests TestManyKnnDocs -Ptests.heapsize=16g -Dtests.monster=true
+
+ public void testLargeSegment() throws Exception {
+ IndexWriterConfig iwc = new IndexWriterConfig();
+ iwc.setCodec(
+ TestUtil.getDefaultCodec()); // Make sure to use the default codec instead of a random one
+ iwc.setRAMBufferSizeMB(64); // Use a 64MB buffer to create larger initial segments
+ TieredMergePolicy mp = new TieredMergePolicy();
+ mp.setMaxMergeAtOnce(256); // avoid intermediate merges (waste of time with HNSW?)
+ mp.setSegmentsPerTier(256); // only merge once at the end when we ask
+ iwc.setMergePolicy(mp);
+ String fieldName = "field";
+ VectorSimilarityFunction similarityFunction = VectorSimilarityFunction.DOT_PRODUCT;
+
+ try (Directory dir = FSDirectory.open(createTempDir("ManyKnnVectorDocs"));
+ IndexWriter iw = new IndexWriter(dir, iwc)) {
+
+ int numVectors = 16268816;
+ float[] vector = new float[1];
+ Document doc = new Document();
+ doc.add(new KnnVectorField(fieldName, vector, similarityFunction));
+ for (int i = 0; i < numVectors; i++) {
+ vector[0] = (i % 256);
+ iw.addDocument(doc);
+ }
+
+ // merge to single segment and then verify
+ iw.forceMerge(1);
+ iw.commit();
+ IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir));
+ TopDocs docs = searcher.search(new KnnVectorQuery("field", new float[] {120}, 10), 5);
+ assertEquals(5, docs.scoreDocs.length);
+ }
+ }
+}