You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2022/01/31 14:21:01 UTC

[lucene] branch main updated: LUCENE-9573 Add Vectors to TestBackwardsCompatibility (#616)

This is an automated email from the ASF dual-hosted git repository.

mayya pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new 8dfdb26  LUCENE-9573 Add Vectors to TestBackwardsCompatibility (#616)
8dfdb26 is described below

commit 8dfdb261e76db9db3d6e56f2e8bb0aac5b6871c8
Author: Mayya Sharipova <ma...@elastic.co>
AuthorDate: Mon Jan 31 09:20:53 2022 -0500

    LUCENE-9573 Add Vectors to TestBackwardsCompatibility (#616)
    
    This patch adds KNN vectors for testing backward compatible indices
    
    - Add a KnnVectorField to documents when creating a new backward
      compatible index
    - Add knn vectors search and check for vector values to the testing
      of search of backward compatible indices
    - Add tests for knn vector search when changing backward compatible
     indices (merging them and adding new documents to them)
---
 .../backward_index/TestBackwardsCompatibility.java | 136 ++++++++++++++++++---
 1 file changed, 120 insertions(+), 16 deletions(-)

diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBackwardsCompatibility.java
index 3623261..ceb2029 100644
--- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBackwardsCompatibility.java
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBackwardsCompatibility.java
@@ -16,6 +16,9 @@
  */
 package org.apache.lucene.backward_index;
 
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
+import static org.apache.lucene.util.Version.LUCENE_10_0_0;
+
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
@@ -50,6 +53,7 @@ import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.FloatDocValuesField;
 import org.apache.lucene.document.FloatPoint;
 import org.apache.lucene.document.IntPoint;
+import org.apache.lucene.document.KnnVectorField;
 import org.apache.lucene.document.LongPoint;
 import org.apache.lucene.document.NumericDocValuesField;
 import org.apache.lucene.document.SortedDocValuesField;
@@ -90,10 +94,13 @@ import org.apache.lucene.index.StandardDirectoryReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.index.VectorValues;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.DocValuesFieldExistsQuery;
 import org.apache.lucene.search.FieldDoc;
 import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.KnnVectorQuery;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.NormsFieldExistsQuery;
 import org.apache.lucene.search.ScoreDoc;
@@ -153,6 +160,16 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
   // Then move those 2 zip files to your trunk checkout and add them
   // to the oldNames array.
 
+  private static final int DOCS_COUNT = 35;
+  private static final int DELETED_ID = 7;
+
+  // change version to 9 when we update 9 backward indices to include KNN field
+  private static final int KNN_VECTOR_MIN_SUPPORTED_VERSION = LUCENE_10_0_0.major;
+  private static final String KNN_VECTOR_FIELD = "knn_field";
+  private static final FieldType KNN_VECTOR_FIELD_TYPE =
+      KnnVectorField.createFieldType(3, VectorSimilarityFunction.COSINE);
+  private static final float[] KNN_VECTOR = {0.2f, -0.1f, 0.1f};
+
   public void testCreateCFS() throws IOException {
     createIndex("index.cfs", true, false);
   }
@@ -1037,10 +1054,11 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
     }
   }
 
-  public void testIndexOldIndexNoAdds() throws IOException {
+  public void testIndexOldIndexNoAdds() throws Exception {
     for (String name : oldNames) {
       Directory dir = newDirectory(oldIndexDirs.get(name));
-      changeIndexNoAdds(random(), dir);
+      Version version = Version.parse(name.substring(0, name.indexOf('-')));
+      changeIndexNoAdds(random(), dir, version);
       dir.close();
     }
   }
@@ -1080,7 +1098,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
     final Bits liveDocs = MultiBits.getLiveDocs(reader);
     assertNotNull(liveDocs);
 
-    for (int i = 0; i < 35; i++) {
+    for (int i = 0; i < DOCS_COUNT; i++) {
       if (liveDocs.get(i)) {
         Document d = reader.document(i);
         List<IndexableField> fields = d.getFields();
@@ -1110,8 +1128,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
         Terms tfv = tfvFields.terms("utf8");
         assertNotNull("docID=" + i + " index=" + oldName, tfv);
       } else {
-        // Only ID 7 is deleted
-        assertEquals(7, i);
+        assertEquals(DELETED_ID, i);
       }
     }
 
@@ -1137,7 +1154,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
     SortedNumericDocValues dvSortedNumeric =
         MultiDocValues.getSortedNumericValues(reader, "dvSortedNumeric");
 
-    for (int i = 0; i < 35; i++) {
+    for (int i = 0; i < DOCS_COUNT; i++) {
       int id = Integer.parseInt(reader.document(i).get("id"));
       assertEquals(i, dvByte.nextDoc());
       assertEquals(id, dvByte.longValue());
@@ -1278,9 +1295,51 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
         34,
         searcher.getIndexReader());
 
+    // test vector values and KNN search
+    if (minIndexMajorVersion >= KNN_VECTOR_MIN_SUPPORTED_VERSION) {
+      // test vector values
+      int cnt = 0;
+      for (LeafReaderContext ctx : reader.leaves()) {
+        VectorValues values = ctx.reader().getVectorValues(KNN_VECTOR_FIELD);
+        if (values != null) {
+          assertEquals(KNN_VECTOR_FIELD_TYPE.vectorDimension(), values.dimension());
+          for (int doc = values.nextDoc(); doc != NO_MORE_DOCS; doc = values.nextDoc()) {
+            float[] expectedVector = {KNN_VECTOR[0], KNN_VECTOR[1], KNN_VECTOR[2] + 0.1f * cnt};
+            assertArrayEquals(
+                "vectors do not match for doc=" + cnt, expectedVector, values.vectorValue(), 0);
+            cnt++;
+          }
+        }
+      }
+      assertEquals(DOCS_COUNT, cnt);
+
+      // test KNN search
+      ScoreDoc[] scoreDocs = assertKNNSearch(searcher, KNN_VECTOR, 10, 10, "0");
+      for (int i = 0; i < scoreDocs.length; i++) {
+        int id = Integer.parseInt(reader.document(scoreDocs[i].doc).get("id"));
+        int expectedId = i < DELETED_ID ? i : i + 1;
+        assertEquals(expectedId, id);
+      }
+    }
+
     reader.close();
   }
 
+  private static ScoreDoc[] assertKNNSearch(
+      IndexSearcher searcher,
+      float[] queryVector,
+      int k,
+      int expectedHitsCount,
+      String expectedFirstDocId)
+      throws IOException {
+    ScoreDoc[] hits =
+        searcher.search(new KnnVectorQuery(KNN_VECTOR_FIELD, queryVector, k), k).scoreDocs;
+    assertEquals("wrong number of hits", expectedHitsCount, hits.length);
+    Document d = searcher.doc(hits[0].doc);
+    assertEquals("wrong first document", expectedFirstDocId, d.get("id"));
+    return hits;
+  }
+
   public void changeIndexWithAdds(Random random, Directory dir, Version nameVersion)
       throws IOException {
     SegmentInfos infos = SegmentInfos.readLatestCommit(dir);
@@ -1296,7 +1355,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
                 .setMergePolicy(newLogMergePolicy()));
     // add 10 docs
     for (int i = 0; i < 10; i++) {
-      addDoc(writer, 35 + i);
+      addDoc(writer, DOCS_COUNT + i);
     }
 
     // make sure writer sees right total -- writer seems not to know about deletes in .del?
@@ -1304,13 +1363,25 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
     assertEquals("wrong doc count", expected, writer.getDocStats().numDocs);
     writer.close();
 
-    // make sure searching sees right # hits
+    // make sure searching sees right # hits for term search
     IndexReader reader = DirectoryReader.open(dir);
     IndexSearcher searcher = newSearcher(reader);
     ScoreDoc[] hits = searcher.search(new TermQuery(new Term("content", "aaa")), 1000).scoreDocs;
     Document d = searcher.getIndexReader().document(hits[0].doc);
     assertEquals("wrong first document", "0", d.get("id"));
     doTestHits(hits, 44, searcher.getIndexReader());
+
+    if (nameVersion.major >= KNN_VECTOR_MIN_SUPPORTED_VERSION) {
+      // make sure KNN search sees all hits (graph may not be used if k is big)
+      assertKNNSearch(searcher, KNN_VECTOR, 1000, 44, "0");
+      // make sure KNN search using HNSW graph sees newly added docs
+      assertKNNSearch(
+          searcher,
+          new float[] {KNN_VECTOR[0], KNN_VECTOR[1], KNN_VECTOR[2] + 0.1f * 44},
+          10,
+          10,
+          "44");
+    }
     reader.close();
 
     // fully merge
@@ -1325,22 +1396,43 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
 
     reader = DirectoryReader.open(dir);
     searcher = newSearcher(reader);
+    // make sure searching sees right # hits fot term search
     hits = searcher.search(new TermQuery(new Term("content", "aaa")), 1000).scoreDocs;
     assertEquals("wrong number of hits", 44, hits.length);
     d = searcher.doc(hits[0].doc);
     doTestHits(hits, 44, searcher.getIndexReader());
     assertEquals("wrong first document", "0", d.get("id"));
+
+    if (nameVersion.major >= KNN_VECTOR_MIN_SUPPORTED_VERSION) {
+      // make sure KNN search sees all hits
+      assertKNNSearch(searcher, KNN_VECTOR, 1000, 44, "0");
+      // make sure KNN search using HNSW graph sees newly added docs
+      assertKNNSearch(
+          searcher,
+          new float[] {KNN_VECTOR[0], KNN_VECTOR[1], KNN_VECTOR[2] + 0.1f * 44},
+          10,
+          10,
+          "44");
+    }
     reader.close();
   }
 
-  public void changeIndexNoAdds(Random random, Directory dir) throws IOException {
-    // make sure searching sees right # hits
+  public void changeIndexNoAdds(Random random, Directory dir, Version nameVersion)
+      throws IOException {
+    // make sure searching sees right # hits for term search
     DirectoryReader reader = DirectoryReader.open(dir);
     IndexSearcher searcher = newSearcher(reader);
     ScoreDoc[] hits = searcher.search(new TermQuery(new Term("content", "aaa")), 1000).scoreDocs;
     assertEquals("wrong number of hits", 34, hits.length);
     Document d = searcher.doc(hits[0].doc);
     assertEquals("wrong first document", "0", d.get("id"));
+
+    if (nameVersion.major >= KNN_VECTOR_MIN_SUPPORTED_VERSION) {
+      // make sure KNN search sees all hits
+      assertKNNSearch(searcher, KNN_VECTOR, 1000, 34, "0");
+      // make sure KNN search using HNSW graph retrieves correct results
+      assertKNNSearch(searcher, KNN_VECTOR, 10, 10, "0");
+    }
     reader.close();
 
     // fully merge
@@ -1352,9 +1444,17 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
 
     reader = DirectoryReader.open(dir);
     searcher = newSearcher(reader);
+    // make sure searching sees right # hits fot term search
     hits = searcher.search(new TermQuery(new Term("content", "aaa")), 1000).scoreDocs;
     assertEquals("wrong number of hits", 34, hits.length);
     doTestHits(hits, 34, searcher.getIndexReader());
+    // make sure searching sees right # hits for KNN search
+    if (nameVersion.major >= KNN_VECTOR_MIN_SUPPORTED_VERSION) {
+      // make sure KNN search sees all hits
+      assertKNNSearch(searcher, KNN_VECTOR, 1000, 34, "0");
+      // make sure KNN search using HNSW graph retrieves correct results
+      assertKNNSearch(searcher, KNN_VECTOR, 10, 10, "0");
+    }
     reader.close();
   }
 
@@ -1372,10 +1472,10 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
             .setMergePolicy(NoMergePolicy.INSTANCE);
     IndexWriter writer = new IndexWriter(dir, conf);
 
-    for (int i = 0; i < 35; i++) {
+    for (int i = 0; i < DOCS_COUNT; i++) {
       addDoc(writer, i);
     }
-    assertEquals("wrong doc count", 35, writer.getDocStats().maxDoc);
+    assertEquals("wrong doc count", DOCS_COUNT, writer.getDocStats().maxDoc);
     if (fullyMerged) {
       writer.forceMerge(1);
     }
@@ -1399,7 +1499,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
               .setMaxBufferedDocs(10)
               .setMergePolicy(NoMergePolicy.INSTANCE);
       writer = new IndexWriter(dir, conf);
-      Term searchTerm = new Term("id", "7");
+      Term searchTerm = new Term("id", String.valueOf(DELETED_ID));
       writer.deleteDocuments(searchTerm);
       writer.close();
     }
@@ -1469,6 +1569,10 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
     customType4.setStoreTermVectorOffsets(true);
     customType4.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
     doc.add(new Field("content6", "here is more content with aaa aaa aaa", customType4));
+
+    float[] vector = {KNN_VECTOR[0], KNN_VECTOR[1], KNN_VECTOR[2] + 0.1f * id};
+    doc.add(new KnnVectorField(KNN_VECTOR_FIELD, vector, KNN_VECTOR_FIELD_TYPE));
+
     // TODO:
     //   index different norms types via similarity (we use a random one currently?!)
     //   remove any analyzer randomness, explicitly add payloads for certain fields.
@@ -1514,7 +1618,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
 
       // should be found exactly
       assertEquals(TermsEnum.SeekStatus.FOUND, terms.seekCeil(aaaTerm));
-      assertEquals(35, countDocs(TestUtil.docs(random(), terms, null, PostingsEnum.NONE)));
+      assertEquals(DOCS_COUNT, countDocs(TestUtil.docs(random(), terms, null, PostingsEnum.NONE)));
       assertNull(terms.next());
 
       // should hit end of field
@@ -1524,11 +1628,11 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
       // should seek to aaa
       assertEquals(TermsEnum.SeekStatus.NOT_FOUND, terms.seekCeil(new BytesRef("a")));
       assertTrue(terms.term().bytesEquals(aaaTerm));
-      assertEquals(35, countDocs(TestUtil.docs(random(), terms, null, PostingsEnum.NONE)));
+      assertEquals(DOCS_COUNT, countDocs(TestUtil.docs(random(), terms, null, PostingsEnum.NONE)));
       assertNull(terms.next());
 
       assertEquals(TermsEnum.SeekStatus.FOUND, terms.seekCeil(aaaTerm));
-      assertEquals(35, countDocs(TestUtil.docs(random(), terms, null, PostingsEnum.NONE)));
+      assertEquals(DOCS_COUNT, countDocs(TestUtil.docs(random(), terms, null, PostingsEnum.NONE)));
       assertNull(terms.next());
 
       r.close();