You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@lucene.apache.org by GitBox <gi...@apache.org> on 2021/08/09 12:53:19 UTC

[GitHub] [lucene] msokolov commented on a change in pull request #239: LUCENE-10040: Handle deletions in nearest vector search

msokolov commented on a change in pull request #239:
URL: https://github.com/apache/lucene/pull/239#discussion_r685157613



##########
File path: lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborQueue.java
##########
@@ -42,13 +42,6 @@
     }
   }
 
-  NeighborQueue copy(boolean reversed) {

Review comment:
       this was unused I guess?

##########
File path: lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90HnswVectorsFormat.java
##########
@@ -16,13 +16,79 @@
  */
 package org.apache.lucene.codecs.lucene90;
 
+import static com.carrotsearch.randomizedtesting.RandomizedTest.frequently;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
 import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.KnnVectorField;
+import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.BaseKnnVectorsFormatTestCase;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.TestUtil;
 
 public class TestLucene90HnswVectorsFormat extends BaseKnnVectorsFormatTestCase {
   @Override
   protected Codec getCodec() {
     return TestUtil.getDefaultCodec();
   }
+
+  public void testSearchWithDeletions() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriterConfig cfg = new IndexWriterConfig();
+    IndexWriter w = new IndexWriter(dir, cfg);

Review comment:
       I like to use try-with-resources in these tests; then the directory, reader and writer get auto-closed when exiting the scope, and in case there is a failure you don't get messy nastygrams from the test framework about unclosed resources.

##########
File path: lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90HnswVectorsFormat.java
##########
@@ -16,13 +16,79 @@
  */
 package org.apache.lucene.codecs.lucene90;
 
+import static com.carrotsearch.randomizedtesting.RandomizedTest.frequently;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
 import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.KnnVectorField;
+import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.BaseKnnVectorsFormatTestCase;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.TestUtil;
 
 public class TestLucene90HnswVectorsFormat extends BaseKnnVectorsFormatTestCase {
   @Override
   protected Codec getCodec() {
     return TestUtil.getDefaultCodec();
   }
+
+  public void testSearchWithDeletions() throws IOException {

Review comment:
       It's a good test; fine to have it here, but there is also TestKnnGraph which is not tied to any specific class.
   
   How about also having a simple test for deletions where we delete all the documents and make sure none are returned yet the TopDocs.totalHits.value > 0 ?

##########
File path: lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraph.java
##########
@@ -138,10 +145,14 @@ public static NeighborQueue search(
           continue;
         }
         visited.set(friendOrd);
+
         float score = similarityFunction.compare(query, vectors.vectorValue(friendOrd));
-        if (results.insertWithOverflow(friendOrd, score)) {
+        if (results.size() < topK || bound.check(score) == false) {

Review comment:
       yes I think so. They're generally the same now in the normal path, but unless/until we eliminate the distinction there are a few cases where they differ (tiny index, mostly) and we should use numSeed here.

##########
File path: lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraph.java
##########
@@ -109,13 +115,14 @@ public static NeighborQueue search(
       if (visited.get(entryPoint) == false) {
         visited.set(entryPoint);
         // explore the topK starting points of some random numSeed probes
-        results.add(entryPoint, similarityFunction.compare(query, vectors.vectorValue(entryPoint)));
+        float score = similarityFunction.compare(query, vectors.vectorValue(entryPoint));
+        candidates.add(entryPoint, score);
+        if (acceptOrds == null || acceptOrds.get(entryPoint)) {

Review comment:
       Maybe cleaner to require non-null. The caller can supply a `MatchAllBits` if that's what they need

##########
File path: lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java
##########
@@ -136,7 +136,7 @@ public void setInfoStream(InfoStream infoStream) {
   void addGraphNode(float[] value) throws IOException {
     NeighborQueue candidates =
         HnswGraph.search(
-            value, beamWidth, beamWidth, vectorValues, similarityFunction, hnsw, random);
+            value, beamWidth, beamWidth, vectorValues, similarityFunction, hnsw, null, random);

Review comment:
       Maybe add a comment to the effect we don't expect to see any deleted documents here?

##########
File path: lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90HnswVectorsFormat.java
##########
@@ -16,13 +16,79 @@
  */
 package org.apache.lucene.codecs.lucene90;
 
+import static com.carrotsearch.randomizedtesting.RandomizedTest.frequently;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
 import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.KnnVectorField;
+import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.BaseKnnVectorsFormatTestCase;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.TestUtil;
 
 public class TestLucene90HnswVectorsFormat extends BaseKnnVectorsFormatTestCase {
   @Override
   protected Codec getCodec() {
     return TestUtil.getDefaultCodec();
   }
+
+  public void testSearchWithDeletions() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriterConfig cfg = new IndexWriterConfig();
+    IndexWriter w = new IndexWriter(dir, cfg);
+
+    final int numDocs = atLeast(100);
+    final int dim = 30;
+    int docIndex = 0;
+    for (int i = 0; i < numDocs; ++i) {
+      Document d = new Document();
+      if (frequently()) {
+        d.add(new StringField("id", String.valueOf(docIndex), Field.Store.YES));
+        d.add(new KnnVectorField("vector", randomVector(dim)));
+        docIndex++;
+      } else {
+        d.add(new StringField("other", "value", Field.Store.NO));

Review comment:
       Maybe use "id" for these too so they sometimes get deleted?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org