You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by ab...@apache.org on 2022/07/06 13:45:48 UTC
[solr] branch main updated: SOLR-16245: make DenseVectorField codec agnostic (#910)
This is an automated email from the ASF dual-hosted git repository.
abenedetti pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/main by this push:
new fd9921be9e3 SOLR-16245: make DenseVectorField codec agnostic (#910)
fd9921be9e3 is described below
commit fd9921be9e36d45da029c60e484b43188944e919
Author: Elia Porciani <e....@sease.io>
AuthorDate: Wed Jul 6 15:45:42 2022 +0200
SOLR-16245: make DenseVectorField codec agnostic (#910)
* make DenseVectorField codec agnostic
* introduce new parameter 'knnAlgorithm' and related documentation
---
solr/CHANGES.txt | 2 ++
.../org/apache/solr/core/SchemaCodecFactory.java | 17 ++++------
.../org/apache/solr/schema/DenseVectorField.java | 36 +++++++++-------------
.../schema-densevector-codec-hyperparameter.xml | 6 ++--
.../apache/solr/schema/DenseVectorFieldTest.java | 8 ++---
.../query-guide/pages/dense-vector-search.adoc | 23 ++++++--------
6 files changed, 38 insertions(+), 54 deletions(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 3335128f7ea..e41d1360f8e 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -97,6 +97,8 @@ Bug Fixes
Other Changes
---------------------
+* SOLR-16245: Make DenseVectorField codec agnostic (Elia Porciani via Alessandro Benedetti)
+
* SOLR-15897: Remove <jmx/> from all unit test solrconfig.xml files. (Eric Pugh)
* SOLR-15776: Admin UI is now aware of logged-in user's permissions and can adapt accordingly (janhoy)
diff --git a/solr/core/src/java/org/apache/solr/core/SchemaCodecFactory.java b/solr/core/src/java/org/apache/solr/core/SchemaCodecFactory.java
index 2e896693d93..6c1b9eb4fa9 100644
--- a/solr/core/src/java/org/apache/solr/core/SchemaCodecFactory.java
+++ b/solr/core/src/java/org/apache/solr/core/SchemaCodecFactory.java
@@ -19,7 +19,6 @@ package org.apache.solr.core;
import java.lang.invoke.MethodHandles;
import java.util.Arrays;
import java.util.Locale;
-import org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.KnnVectorsFormat;
@@ -124,20 +123,16 @@ public class SchemaCodecFactory extends CodecFactory implements SolrCoreAware {
FieldType fieldType = (schemaField == null ? null : schemaField.getType());
if (fieldType instanceof DenseVectorField) {
DenseVectorField vectorType = (DenseVectorField) fieldType;
- String knnVectorFormatName = vectorType.getCodecFormat();
- if (knnVectorFormatName != null) {
- if (knnVectorFormatName.equals(Lucene91HnswVectorsFormat.class.getSimpleName())) {
+ String knnAlgorithm = vectorType.getKnnAlgorithm();
+ if (knnAlgorithm != null) {
+ if (knnAlgorithm.equals(DenseVectorField.HNSW_ALGORITHM)) {
int maxConn = vectorType.getHnswMaxConn();
int beamWidth = vectorType.getHnswBeamWidth();
return new Lucene91HnswVectorsFormat(maxConn, beamWidth);
- } else if (knnVectorFormatName.equals(
- Lucene90HnswVectorsFormat.class.getSimpleName())) {
- int maxConn = vectorType.getHnswMaxConn();
- int beamWidth = vectorType.getHnswBeamWidth();
- return new Lucene90HnswVectorsFormat(maxConn, beamWidth);
- } else {
- return KnnVectorsFormat.forName(knnVectorFormatName);
}
+ } else {
+ throw new SolrException(
+ ErrorCode.SERVER_ERROR, knnAlgorithm + " KNN algorithm is not supported");
}
}
return super.getKnnVectorsFormatForField(field);
diff --git a/solr/core/src/java/org/apache/solr/schema/DenseVectorField.java b/solr/core/src/java/org/apache/solr/schema/DenseVectorField.java
index 59cf68713f0..8b44f006f33 100644
--- a/solr/core/src/java/org/apache/solr/schema/DenseVectorField.java
+++ b/solr/core/src/java/org/apache/solr/schema/DenseVectorField.java
@@ -24,8 +24,6 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
-import org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat;
-import org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsFormat;
import org.apache.lucene.document.KnnVectorField;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.VectorSimilarityFunction;
@@ -42,20 +40,17 @@ import org.apache.solr.uninverting.UninvertingReader;
* Provides a field type to support Lucene's {@link org.apache.lucene.document.KnnVectorField}. See
* {@link org.apache.lucene.search.KnnVectorQuery} for more details. It supports a fixed cardinality
* dimension for the vector and a fixed similarity function. The default similarity is
- * EUCLIDEAN_HNSW (L2). The default index codec format is specified in the Lucene Codec constructor.
- * For Lucene 9.1 e.g. See {@link org.apache.lucene.codecs.lucene91.Lucene91Codec} Currently {@link
- * org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat} and {@link
- * org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsFormat} are supported for advanced
- * hyper-parameter customisation. See {@link org.apache.lucene.util.hnsw.HnswGraph} for more details
- * about the implementation. <br>
+ * EUCLIDEAN (L2). The default algorithm is HNSW. See {@link
+ * org.apache.lucene.util.hnsw.HnswGraph} for more details about the implementation. <br>
* Only {@code Indexed} and {@code Stored} attributes are supported.
*/
public class DenseVectorField extends FloatPointField {
+ public static final String HNSW_ALGORITHM = "hnsw";
static final String KNN_VECTOR_DIMENSION = "vectorDimension";
static final String KNN_SIMILARITY_FUNCTION = "similarityFunction";
- static final String CODEC_FORMAT = "codecFormat";
+ static final String KNN_ALGORITHM = "knnAlgorithm";
static final String HNSW_MAX_CONNECTIONS = "hnswMaxConnections";
static final String HNSW_BEAM_WIDTH = "hnswBeamWidth";
@@ -63,19 +58,15 @@ public class DenseVectorField extends FloatPointField {
private VectorSimilarityFunction similarityFunction;
private VectorSimilarityFunction DEFAULT_SIMILARITY = VectorSimilarityFunction.EUCLIDEAN;
- private String codecFormat;
+ private String knnAlgorithm;
/**
- * This parameter is coupled with the {@link Lucene90HnswVectorsFormat} and {@link
- * Lucene91HnswVectorsFormat} format implementations. Controls how many of the nearest neighbor
- * candidates are connected to the new node. Defaults to {@link
- * Lucene91HnswVectorsFormat#DEFAULT_MAX_CONN}. See {@link HnswGraph} for more details.
+ * This parameter is coupled with the hnsw algorithm. Controls how many of the nearest neighbor
+ * candidates are connected to the new node. See {@link HnswGraph} for more details.
*/
private int hnswMaxConn;
/**
- * This parameter is coupled with the {@link Lucene90HnswVectorsFormat} and {@link
- * Lucene91HnswVectorsFormat} format implementations. The number of candidate neighbors to track
- * while searching the graph for each newly inserted node. Defaults to to {@link
- * Lucene91HnswVectorsFormat#DEFAULT_BEAM_WIDTH}. See {@link HnswGraph} for details.
+ * This parameter is coupled with the hnsw algorithm. The number of candidate neighbors to track
+ * while searching the graph for each newly inserted node. See {@link HnswGraph} for details.
*/
private int hnswBeamWidth;
@@ -113,8 +104,9 @@ public class DenseVectorField extends FloatPointField {
.orElse(DEFAULT_SIMILARITY);
args.remove(KNN_SIMILARITY_FUNCTION);
- this.codecFormat = args.get(CODEC_FORMAT);
- args.remove(CODEC_FORMAT);
+ this.knnAlgorithm = args.get(KNN_ALGORITHM);
+
+ args.remove(KNN_ALGORITHM);
this.hnswMaxConn =
ofNullable(args.get(HNSW_MAX_CONNECTIONS))
@@ -142,8 +134,8 @@ public class DenseVectorField extends FloatPointField {
return similarityFunction;
}
- public String getCodecFormat() {
- return codecFormat;
+ public String getKnnAlgorithm() {
+ return knnAlgorithm;
}
public Integer getHnswMaxConn() {
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-densevector-codec-hyperparameter.xml b/solr/core/src/test-files/solr/collection1/conf/schema-densevector-codec-hyperparameter.xml
index 8d81e431a57..74034c93d17 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema-densevector-codec-hyperparameter.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-densevector-codec-hyperparameter.xml
@@ -21,9 +21,9 @@
<schema name="schema-densevector-codec-hyperparameter" version="1.0">
<fieldType name="string" class="solr.StrField" multiValued="true"/>
<fieldType name="knn_vector_default" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine"/>
- <fieldType name="knn_vector" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" codecFormat="Lucene91HnswVectorsFormat" hnswMaxConnections="10" hnswBeamWidth="40"/>
- <fieldType name="knn_vector2" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" codecFormat="Lucene91HnswVectorsFormat" hnswMaxConnections="6" hnswBeamWidth="60"/>
- <fieldType name="knn_vector3" class="solr.DenseVectorField" vectorDimension="5" similarityFunction="cosine" codecFormat="Lucene90HnswVectorsFormat" hnswMaxConnections="8" hnswBeamWidth="46"/>
+ <fieldType name="knn_vector" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" knnAlgorithm="hnsw" hnswMaxConnections="10" hnswBeamWidth="40"/>
+ <fieldType name="knn_vector2" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" knnAlgorithm="hnsw" hnswMaxConnections="6" hnswBeamWidth="60"/>
+ <fieldType name="knn_vector3" class="solr.DenseVectorField" vectorDimension="5" similarityFunction="cosine" knnAlgorithm="hnsw" hnswMaxConnections="8" hnswBeamWidth="46"/>
<field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
diff --git a/solr/core/src/test/org/apache/solr/schema/DenseVectorFieldTest.java b/solr/core/src/test/org/apache/solr/schema/DenseVectorFieldTest.java
index 61980b4ded6..eb84ee237d1 100644
--- a/solr/core/src/test/org/apache/solr/schema/DenseVectorFieldTest.java
+++ b/solr/core/src/test/org/apache/solr/schema/DenseVectorFieldTest.java
@@ -125,7 +125,7 @@ public class DenseVectorFieldTest extends AbstractBadConfigTestBase {
DenseVectorField type1 = (DenseVectorField) vector.getType();
MatcherAssert.assertThat(type1.getSimilarityFunction(), is(VectorSimilarityFunction.COSINE));
MatcherAssert.assertThat(type1.getDimension(), is(4));
- MatcherAssert.assertThat(type1.getCodecFormat(), is("Lucene91HnswVectorsFormat"));
+ MatcherAssert.assertThat(type1.getKnnAlgorithm(), is("hnsw"));
MatcherAssert.assertThat(type1.getHnswMaxConn(), is(10));
MatcherAssert.assertThat(type1.getHnswBeamWidth(), is(40));
@@ -135,7 +135,7 @@ public class DenseVectorFieldTest extends AbstractBadConfigTestBase {
DenseVectorField type2 = (DenseVectorField) vector2.getType();
MatcherAssert.assertThat(type2.getSimilarityFunction(), is(VectorSimilarityFunction.COSINE));
MatcherAssert.assertThat(type2.getDimension(), is(4));
- MatcherAssert.assertThat(type2.getCodecFormat(), is("Lucene91HnswVectorsFormat"));
+ MatcherAssert.assertThat(type2.getKnnAlgorithm(), is("hnsw"));
MatcherAssert.assertThat(type2.getHnswMaxConn(), is(6));
MatcherAssert.assertThat(type2.getHnswBeamWidth(), is(60));
@@ -145,7 +145,7 @@ public class DenseVectorFieldTest extends AbstractBadConfigTestBase {
DenseVectorField type3 = (DenseVectorField) vector3.getType();
MatcherAssert.assertThat(type3.getSimilarityFunction(), is(VectorSimilarityFunction.COSINE));
MatcherAssert.assertThat(type3.getDimension(), is(5));
- MatcherAssert.assertThat(type3.getCodecFormat(), is("Lucene90HnswVectorsFormat"));
+ MatcherAssert.assertThat(type3.getKnnAlgorithm(), is("hnsw"));
MatcherAssert.assertThat(type3.getHnswMaxConn(), is(8));
MatcherAssert.assertThat(type3.getHnswBeamWidth(), is(46));
@@ -156,7 +156,7 @@ public class DenseVectorFieldTest extends AbstractBadConfigTestBase {
MatcherAssert.assertThat(
typeDefault.getSimilarityFunction(), is(VectorSimilarityFunction.COSINE));
MatcherAssert.assertThat(typeDefault.getDimension(), is(4));
- assertNull(typeDefault.getCodecFormat());
+ assertNull(typeDefault.getKnnAlgorithm());
MatcherAssert.assertThat(typeDefault.getHnswMaxConn(), is(16));
MatcherAssert.assertThat(typeDefault.getHnswBeamWidth(), is(100));
} finally {
diff --git a/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc b/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc
index af4a8ceb8ad..d868ea21381 100644
--- a/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc
+++ b/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc
@@ -106,32 +106,27 @@ and the hyper-parameter of the HNSW algorithm make sure you set this configurati
<codecFactory class="solr.SchemaCodecFactory"/>
...
-Here's how `DenseVectorField` can be configured with the advanced codec hyper-parameters:
+Here's how `DenseVectorField` can be configured with the advanced hyper-parameters:
[source,xml]
-<fieldType name="knn_vector" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" codecFormat="Lucene91HnswVectorsFormat" hnswMaxConnections="10" hnswBeamWidth="40"/>
+<fieldType name="knn_vector" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" knnAlgorithm="hnsw" hnswMaxConnections="10" hnswBeamWidth="40"/>
<field name="vector" type="knn_vector" indexed="true" stored="true"/>
-`codecFormat`::
+`knnAlgorithm`::
+
[%autowidth,frame=none]
|===
-|Optional |Default: `Lucene91HnswVectorsFormat`
+|Optional |Default: `hnsw`
|===
+
-(advanced) Specifies the knn codec implementation to use
+(advanced) Specifies the underlying knn algorithm to use
+
-Accepted values: `Lucene90HnswVectorsFormat`, `Lucene91HnswVectorsFormat` .
+Accepted values: `hnsw`.
-Please note that the `codecFormat` accepted values may change in future releases.
+Please note that the `knnAlgorithm` accepted values may change in future releases.
-
-[NOTE]
-Lucene index back-compatibility is only supported for the default codec.
-If you choose to customize the `codecFormat` in your schema, upgrading to a future version of Solr may require you to either switch back to the default codec and optimize your index to rewrite it into the default codec before upgrading, or re-build your entire index from scratch after upgrading.
-
`hnswMaxConnections`::
+
[%autowidth,frame=none]
@@ -139,7 +134,7 @@ If you choose to customize the `codecFormat` in your schema, upgrading to a futu
|Optional |Default: `16`
|===
+
-(advanced) This parameter is specific for the `Lucene90HnswVectorsFormat` and `Lucene91HnswVectorsFormat` codec formats:
+(advanced) This parameter is specific to the `hnsw` knn algorithm:
+
Controls how many of the nearest neighbor candidates are connected to the new node.
+
@@ -155,7 +150,7 @@ Any integer.
|Optional |Default: `100`
|===
+
-(advanced) This parameter is specific for the `Lucene90HnswVectorsFormat` and `Lucene91HnswVectorsFormat` codec formats:
+(advanced) This parameter is specific to the `hnsw` knn algorithm:
+
It is the number of nearest neighbor candidates to track while searching the graph for each newly inserted node.
+