You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ju...@apache.org on 2022/01/24 00:19:34 UTC

[lucene] branch main updated: LUCENE-10375: Write vectors to file in flush (#617)

This is an automated email from the ASF dual-hosted git repository.

julietibs pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new 7ece814  LUCENE-10375: Write vectors to file in flush (#617)
7ece814 is described below

commit 7ece8145bcafc60513e196ce10b0007ac05a83c8
Author: Julie Tibshirani <ju...@apache.org>
AuthorDate: Sun Jan 23 16:19:23 2022 -0800

    LUCENE-10375: Write vectors to file in flush (#617)
    
    In a previous commit, we updated HNSW merge to first write the combined segment
    vectors to a file, then use that file to build the graph. This commit applies
    the same strategy to flush, which lets us use the same logic for flush and
    merge.
---
 build.gradle                                       |  2 +
 .../org/apache/lucene/codecs/KnnVectorsWriter.java |  4 +-
 .../codecs/lucene90/Lucene90HnswVectorsReader.java |  2 +-
 .../codecs/lucene90/Lucene90HnswVectorsWriter.java | 72 +---------------------
 .../tests/index/BaseKnnVectorsFormatTestCase.java  |  8 ---
 5 files changed, 7 insertions(+), 81 deletions(-)

diff --git a/build.gradle b/build.gradle
index 5162a72..d61f5d3 100644
--- a/build.gradle
+++ b/build.gradle
@@ -183,3 +183,5 @@ apply from: file('gradle/hacks/turbocharge-jvm-opts.gradle')
 apply from: file('gradle/hacks/dummy-outputs.gradle')
 
 apply from: file('gradle/pylucene/pylucene.gradle')
+sourceCompatibility = JavaVersion.VERSION_16
+targetCompatibility = JavaVersion.VERSION_16
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java
index 2c32fae..b2d978f 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java
@@ -117,7 +117,7 @@ public abstract class KnnVectorsWriter implements Closeable {
   }
 
   /** View over multiple VectorValues supporting iterator-style access via DocIdMerger. */
-  public static class MergedVectorValues extends VectorValues {
+  private static class MergedVectorValues extends VectorValues {
     private final List<VectorValuesSub> subs;
     private final DocIDMerger<VectorValuesSub> docIdMerger;
     private final int cost;
@@ -127,7 +127,7 @@ public abstract class KnnVectorsWriter implements Closeable {
     private VectorValuesSub current;
 
     /** Returns a merged view over all the segment's {@link VectorValues}. */
-    public static MergedVectorValues mergeVectorValues(FieldInfo fieldInfo, MergeState mergeState)
+    static MergedVectorValues mergeVectorValues(FieldInfo fieldInfo, MergeState mergeState)
         throws IOException {
       assert fieldInfo != null && fieldInfo.hasVectorValues();
 
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsReader.java
index 11fd80f..bb62ab9 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsReader.java
@@ -354,7 +354,7 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
   }
 
   /** Read the vector values from the index input. This supports both iterated and random access. */
-  public static class OffHeapVectorValues extends VectorValues
+  static class OffHeapVectorValues extends VectorValues
       implements RandomAccessVectorValues, RandomAccessVectorValuesProducer {
 
     final int dimension;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsWriter.java
index f3f468a..0fbd1be 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsWriter.java
@@ -26,7 +26,6 @@ import org.apache.lucene.codecs.KnnVectorsReader;
 import org.apache.lucene.codecs.KnnVectorsWriter;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexFileNames;
-import org.apache.lucene.index.MergeState;
 import org.apache.lucene.index.RandomAccessVectorValuesProducer;
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.index.VectorSimilarityFunction;
@@ -114,79 +113,16 @@ public final class Lucene90HnswVectorsWriter extends KnnVectorsWriter {
   public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader)
       throws IOException {
     long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
-
     VectorValues vectors = knnVectorsReader.getVectorValues(fieldInfo.name);
-    // TODO - use a better data structure; a bitset? DocsWithFieldSet is p.p. in o.a.l.index
-    int[] docIds = writeVectorData(vectorData, vectors);
-    assert vectors.size() == docIds.length;
-
-    long[] offsets = new long[docIds.length];
-    long vectorIndexOffset = vectorIndex.getFilePointer();
-    if (vectors instanceof RandomAccessVectorValuesProducer) {
-      writeGraph(
-          vectorIndex,
-          (RandomAccessVectorValuesProducer) vectors,
-          fieldInfo.getVectorSimilarityFunction(),
-          vectorIndexOffset,
-          offsets,
-          maxConn,
-          beamWidth);
-    } else {
-      throw new IllegalArgumentException(
-          "Indexing an HNSW graph requires a random access vector values, got " + vectors);
-    }
-
-    long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
-    long vectorIndexLength = vectorIndex.getFilePointer() - vectorIndexOffset;
-    writeMeta(
-        fieldInfo,
-        vectorDataOffset,
-        vectorDataLength,
-        vectorIndexOffset,
-        vectorIndexLength,
-        docIds);
-    writeGraphOffsets(meta, offsets);
-  }
-
-  @Override
-  public void merge(MergeState mergeState) throws IOException {
-    for (int i = 0; i < mergeState.fieldInfos.length; i++) {
-      KnnVectorsReader reader = mergeState.knnVectorsReaders[i];
-      assert reader != null || mergeState.fieldInfos[i].hasVectorValues() == false;
-      if (reader != null) {
-        reader.checkIntegrity();
-      }
-    }
-
-    for (FieldInfo fieldInfo : mergeState.mergeFieldInfos) {
-      if (fieldInfo.hasVectorValues()) {
-        if (mergeState.infoStream.isEnabled("VV")) {
-          mergeState.infoStream.message("VV", "merging " + mergeState.segmentInfo);
-        }
-        mergeField(fieldInfo, mergeState);
-        if (mergeState.infoStream.isEnabled("VV")) {
-          mergeState.infoStream.message("VV", "merge done " + mergeState.segmentInfo);
-        }
-      }
-    }
-    finish();
-  }
 
-  private void mergeField(FieldInfo fieldInfo, MergeState mergeState) throws IOException {
-    if (mergeState.infoStream.isEnabled("VV")) {
-      mergeState.infoStream.message("VV", "merging " + mergeState.segmentInfo);
-    }
-
-    long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
-
-    VectorValues vectors = MergedVectorValues.mergeVectorValues(fieldInfo, mergeState);
     IndexOutput tempVectorData =
         segmentWriteState.directory.createTempOutput(
             vectorData.getName(), "temp", segmentWriteState.context);
     IndexInput vectorDataInput = null;
     boolean success = false;
     try {
-      // write the merged vector data to a temporary file
+      // write the vector data to a temporary file
+      // TODO - use a better data structure; a bitset? DocsWithFieldSet is package-private in org.apache.lucene.index
       int[] docIds = writeVectorData(tempVectorData, vectors);
       CodecUtil.writeFooter(tempVectorData);
       IOUtils.close(tempVectorData);
@@ -235,10 +171,6 @@ public final class Lucene90HnswVectorsWriter extends KnnVectorsWriter {
             segmentWriteState.directory, tempVectorData.getName());
       }
     }
-
-    if (mergeState.infoStream.isEnabled("VV")) {
-      mergeState.infoStream.message("VV", "merge done " + mergeState.segmentInfo);
-    }
   }
 
   /**
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java
index 84d83f0..59be16d 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java
@@ -37,8 +37,6 @@ import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.RandomAccessVectorValues;
-import org.apache.lucene.index.RandomAccessVectorValuesProducer;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.VectorSimilarityFunction;
 import org.apache.lucene.index.VectorValues;
@@ -693,12 +691,6 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe
         assertEquals("4", leaf.document(vectorValues.nextDoc()).get("id"));
         assertEquals(0, vectorValues.vectorValue()[0], 0);
         assertEquals(NO_MORE_DOCS, vectorValues.nextDoc());
-
-        RandomAccessVectorValues ra =
-            ((RandomAccessVectorValuesProducer) vectorValues).randomAccess();
-        assertEquals(-1f, ra.vectorValue(0)[0], 0);
-        assertEquals(1f, ra.vectorValue(1)[0], 0);
-        assertEquals(0f, ra.vectorValue(2)[0], 0);
       }
     }
   }