You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ju...@apache.org on 2022/01/24 00:19:34 UTC
[lucene] branch main updated: LUCENE-10375: Write vectors to file in flush (#617)
This is an automated email from the ASF dual-hosted git repository.
julietibs pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new 7ece814 LUCENE-10375: Write vectors to file in flush (#617)
7ece814 is described below
commit 7ece8145bcafc60513e196ce10b0007ac05a83c8
Author: Julie Tibshirani <ju...@apache.org>
AuthorDate: Sun Jan 23 16:19:23 2022 -0800
LUCENE-10375: Write vectors to file in flush (#617)
In a previous commit, we updated HNSW merge to first write the combined segment
vectors to a file, then use that file to build the graph. This commit applies
the same strategy to flush, which lets us use the same logic for flush and
merge.
---
build.gradle | 2 +
.../org/apache/lucene/codecs/KnnVectorsWriter.java | 4 +-
.../codecs/lucene90/Lucene90HnswVectorsReader.java | 2 +-
.../codecs/lucene90/Lucene90HnswVectorsWriter.java | 72 +---------------------
.../tests/index/BaseKnnVectorsFormatTestCase.java | 8 ---
5 files changed, 7 insertions(+), 81 deletions(-)
diff --git a/build.gradle b/build.gradle
index 5162a72..d61f5d3 100644
--- a/build.gradle
+++ b/build.gradle
@@ -183,3 +183,5 @@ apply from: file('gradle/hacks/turbocharge-jvm-opts.gradle')
apply from: file('gradle/hacks/dummy-outputs.gradle')
apply from: file('gradle/pylucene/pylucene.gradle')
+sourceCompatibility = JavaVersion.VERSION_16
+targetCompatibility = JavaVersion.VERSION_16
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java
index 2c32fae..b2d978f 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java
@@ -117,7 +117,7 @@ public abstract class KnnVectorsWriter implements Closeable {
}
/** View over multiple VectorValues supporting iterator-style access via DocIDMerger. */
- public static class MergedVectorValues extends VectorValues {
+ private static class MergedVectorValues extends VectorValues {
private final List<VectorValuesSub> subs;
private final DocIDMerger<VectorValuesSub> docIdMerger;
private final int cost;
@@ -127,7 +127,7 @@ public abstract class KnnVectorsWriter implements Closeable {
private VectorValuesSub current;
/** Returns a merged view over all the segment's {@link VectorValues}. */
- public static MergedVectorValues mergeVectorValues(FieldInfo fieldInfo, MergeState mergeState)
+ static MergedVectorValues mergeVectorValues(FieldInfo fieldInfo, MergeState mergeState)
throws IOException {
assert fieldInfo != null && fieldInfo.hasVectorValues();
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsReader.java
index 11fd80f..bb62ab9 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsReader.java
@@ -354,7 +354,7 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
}
/** Read the vector values from the index input. This supports both iterated and random access. */
- public static class OffHeapVectorValues extends VectorValues
+ static class OffHeapVectorValues extends VectorValues
implements RandomAccessVectorValues, RandomAccessVectorValuesProducer {
final int dimension;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsWriter.java
index f3f468a..0fbd1be 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsWriter.java
@@ -26,7 +26,6 @@ import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.KnnVectorsWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
-import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.RandomAccessVectorValuesProducer;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.VectorSimilarityFunction;
@@ -114,79 +113,16 @@ public final class Lucene90HnswVectorsWriter extends KnnVectorsWriter {
public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader)
throws IOException {
long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
-
VectorValues vectors = knnVectorsReader.getVectorValues(fieldInfo.name);
- // TODO - use a better data structure; a bitset? DocsWithFieldSet is p.p. in o.a.l.index
- int[] docIds = writeVectorData(vectorData, vectors);
- assert vectors.size() == docIds.length;
-
- long[] offsets = new long[docIds.length];
- long vectorIndexOffset = vectorIndex.getFilePointer();
- if (vectors instanceof RandomAccessVectorValuesProducer) {
- writeGraph(
- vectorIndex,
- (RandomAccessVectorValuesProducer) vectors,
- fieldInfo.getVectorSimilarityFunction(),
- vectorIndexOffset,
- offsets,
- maxConn,
- beamWidth);
- } else {
- throw new IllegalArgumentException(
- "Indexing an HNSW graph requires a random access vector values, got " + vectors);
- }
-
- long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
- long vectorIndexLength = vectorIndex.getFilePointer() - vectorIndexOffset;
- writeMeta(
- fieldInfo,
- vectorDataOffset,
- vectorDataLength,
- vectorIndexOffset,
- vectorIndexLength,
- docIds);
- writeGraphOffsets(meta, offsets);
- }
-
- @Override
- public void merge(MergeState mergeState) throws IOException {
- for (int i = 0; i < mergeState.fieldInfos.length; i++) {
- KnnVectorsReader reader = mergeState.knnVectorsReaders[i];
- assert reader != null || mergeState.fieldInfos[i].hasVectorValues() == false;
- if (reader != null) {
- reader.checkIntegrity();
- }
- }
-
- for (FieldInfo fieldInfo : mergeState.mergeFieldInfos) {
- if (fieldInfo.hasVectorValues()) {
- if (mergeState.infoStream.isEnabled("VV")) {
- mergeState.infoStream.message("VV", "merging " + mergeState.segmentInfo);
- }
- mergeField(fieldInfo, mergeState);
- if (mergeState.infoStream.isEnabled("VV")) {
- mergeState.infoStream.message("VV", "merge done " + mergeState.segmentInfo);
- }
- }
- }
- finish();
- }
- private void mergeField(FieldInfo fieldInfo, MergeState mergeState) throws IOException {
- if (mergeState.infoStream.isEnabled("VV")) {
- mergeState.infoStream.message("VV", "merging " + mergeState.segmentInfo);
- }
-
- long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
-
- VectorValues vectors = MergedVectorValues.mergeVectorValues(fieldInfo, mergeState);
IndexOutput tempVectorData =
segmentWriteState.directory.createTempOutput(
vectorData.getName(), "temp", segmentWriteState.context);
IndexInput vectorDataInput = null;
boolean success = false;
try {
- // write the merged vector data to a temporary file
+ // write the vector data to a temporary file
+ // TODO - use a better data structure; a bitset? DocsWithFieldSet is p.p. in o.a.l.index
int[] docIds = writeVectorData(tempVectorData, vectors);
CodecUtil.writeFooter(tempVectorData);
IOUtils.close(tempVectorData);
@@ -235,10 +171,6 @@ public final class Lucene90HnswVectorsWriter extends KnnVectorsWriter {
segmentWriteState.directory, tempVectorData.getName());
}
}
-
- if (mergeState.infoStream.isEnabled("VV")) {
- mergeState.infoStream.message("VV", "merge done " + mergeState.segmentInfo);
- }
}
/**
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java
index 84d83f0..59be16d 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java
@@ -37,8 +37,6 @@ import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.RandomAccessVectorValues;
-import org.apache.lucene.index.RandomAccessVectorValuesProducer;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.index.VectorValues;
@@ -693,12 +691,6 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe
assertEquals("4", leaf.document(vectorValues.nextDoc()).get("id"));
assertEquals(0, vectorValues.vectorValue()[0], 0);
assertEquals(NO_MORE_DOCS, vectorValues.nextDoc());
-
- RandomAccessVectorValues ra =
- ((RandomAccessVectorValuesProducer) vectorValues).randomAccess();
- assertEquals(-1f, ra.vectorValue(0)[0], 0);
- assertEquals(1f, ra.vectorValue(1)[0], 0);
- assertEquals(0f, ra.vectorValue(2)[0], 0);
}
}
}