You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by ab...@apache.org on 2022/07/06 13:45:48 UTC

[solr] branch main updated: SOLR-16245: make DenseVectorField codec agnostic (#910)

This is an automated email from the ASF dual-hosted git repository.

abenedetti pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/main by this push:
     new fd9921be9e3 SOLR-16245: make DenseVectorField codec agnostic (#910)
fd9921be9e3 is described below

commit fd9921be9e36d45da029c60e484b43188944e919
Author: Elia Porciani <e....@sease.io>
AuthorDate: Wed Jul 6 15:45:42 2022 +0200

    SOLR-16245: make DenseVectorField codec agnostic (#910)
    
    * make DenseVectorField codec agnostic
    * introduce new parameter 'knnAlgorithm' and related documentation
---
 solr/CHANGES.txt                                   |  2 ++
 .../org/apache/solr/core/SchemaCodecFactory.java   | 17 ++++------
 .../org/apache/solr/schema/DenseVectorField.java   | 36 +++++++++-------------
 .../schema-densevector-codec-hyperparameter.xml    |  6 ++--
 .../apache/solr/schema/DenseVectorFieldTest.java   |  8 ++---
 .../query-guide/pages/dense-vector-search.adoc     | 23 ++++++--------
 6 files changed, 38 insertions(+), 54 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 3335128f7ea..e41d1360f8e 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -97,6 +97,8 @@ Bug Fixes
 
 Other Changes
 ---------------------
+* SOLR-16245: Make DenseVectorField codec agnostic (Elia Porciani via Alessandro Benedetti)
+
 * SOLR-15897: Remove <jmx/> from all unit test solrconfig.xml files. (Eric Pugh)
 
 * SOLR-15776: Admin UI is now aware of logged-in user's permissions and can adapt accordingly (janhoy)
diff --git a/solr/core/src/java/org/apache/solr/core/SchemaCodecFactory.java b/solr/core/src/java/org/apache/solr/core/SchemaCodecFactory.java
index 2e896693d93..6c1b9eb4fa9 100644
--- a/solr/core/src/java/org/apache/solr/core/SchemaCodecFactory.java
+++ b/solr/core/src/java/org/apache/solr/core/SchemaCodecFactory.java
@@ -19,7 +19,6 @@ package org.apache.solr.core;
 import java.lang.invoke.MethodHandles;
 import java.util.Arrays;
 import java.util.Locale;
-import org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.KnnVectorsFormat;
@@ -124,20 +123,16 @@ public class SchemaCodecFactory extends CodecFactory implements SolrCoreAware {
             FieldType fieldType = (schemaField == null ? null : schemaField.getType());
             if (fieldType instanceof DenseVectorField) {
               DenseVectorField vectorType = (DenseVectorField) fieldType;
-              String knnVectorFormatName = vectorType.getCodecFormat();
-              if (knnVectorFormatName != null) {
-                if (knnVectorFormatName.equals(Lucene91HnswVectorsFormat.class.getSimpleName())) {
+              String knnAlgorithm = vectorType.getKnnAlgorithm();
+              if (knnAlgorithm != null) {
+                if (knnAlgorithm.equals(DenseVectorField.HNSW_ALGORITHM)) {
                   int maxConn = vectorType.getHnswMaxConn();
                   int beamWidth = vectorType.getHnswBeamWidth();
                   return new Lucene91HnswVectorsFormat(maxConn, beamWidth);
-                } else if (knnVectorFormatName.equals(
-                    Lucene90HnswVectorsFormat.class.getSimpleName())) {
-                  int maxConn = vectorType.getHnswMaxConn();
-                  int beamWidth = vectorType.getHnswBeamWidth();
-                  return new Lucene90HnswVectorsFormat(maxConn, beamWidth);
-                } else {
-                  return KnnVectorsFormat.forName(knnVectorFormatName);
                 }
+              } else {
+                throw new SolrException(
+                    ErrorCode.SERVER_ERROR, knnAlgorithm + " KNN algorithm is not supported");
               }
             }
             return super.getKnnVectorsFormatForField(field);
diff --git a/solr/core/src/java/org/apache/solr/schema/DenseVectorField.java b/solr/core/src/java/org/apache/solr/schema/DenseVectorField.java
index 59cf68713f0..8b44f006f33 100644
--- a/solr/core/src/java/org/apache/solr/schema/DenseVectorField.java
+++ b/solr/core/src/java/org/apache/solr/schema/DenseVectorField.java
@@ -24,8 +24,6 @@ import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
-import org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat;
-import org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsFormat;
 import org.apache.lucene.document.KnnVectorField;
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.VectorSimilarityFunction;
@@ -42,20 +40,17 @@ import org.apache.solr.uninverting.UninvertingReader;
  * Provides a field type to support Lucene's {@link org.apache.lucene.document.KnnVectorField}. See
  * {@link org.apache.lucene.search.KnnVectorQuery} for more details. It supports a fixed cardinality
  * dimension for the vector and a fixed similarity function. The default similarity is
- * EUCLIDEAN_HNSW (L2). The default index codec format is specified in the Lucene Codec constructor.
- * For Lucene 9.1 e.g. See {@link org.apache.lucene.codecs.lucene91.Lucene91Codec} Currently {@link
- * org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat} and {@link
- * org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsFormat} are supported for advanced
- * hyper-parameter customisation. See {@link org.apache.lucene.util.hnsw.HnswGraph} for more details
- * about the implementation. <br>
+ * EUCLIDEAN (L2). The default algorithm is HNSW. See {@link
+ * org.apache.lucene.util.hnsw.HnswGraph} for more details about the implementation. <br>
  * Only {@code Indexed} and {@code Stored} attributes are supported.
  */
 public class DenseVectorField extends FloatPointField {
+  public static final String HNSW_ALGORITHM = "hnsw";
 
   static final String KNN_VECTOR_DIMENSION = "vectorDimension";
   static final String KNN_SIMILARITY_FUNCTION = "similarityFunction";
 
-  static final String CODEC_FORMAT = "codecFormat";
+  static final String KNN_ALGORITHM = "knnAlgorithm";
   static final String HNSW_MAX_CONNECTIONS = "hnswMaxConnections";
   static final String HNSW_BEAM_WIDTH = "hnswBeamWidth";
 
@@ -63,19 +58,15 @@ public class DenseVectorField extends FloatPointField {
   private VectorSimilarityFunction similarityFunction;
   private VectorSimilarityFunction DEFAULT_SIMILARITY = VectorSimilarityFunction.EUCLIDEAN;
 
-  private String codecFormat;
+  private String knnAlgorithm;
   /**
-   * This parameter is coupled with the {@link Lucene90HnswVectorsFormat} and {@link
-   * Lucene91HnswVectorsFormat} format implementations. Controls how many of the nearest neighbor
-   * candidates are connected to the new node. Defaults to {@link
-   * Lucene91HnswVectorsFormat#DEFAULT_MAX_CONN}. See {@link HnswGraph} for more details.
+   * This parameter is coupled with the hnsw algorithm. Controls how many of the nearest neighbor
+   * candidates are connected to the new node. See {@link HnswGraph} for more details.
    */
   private int hnswMaxConn;
   /**
-   * This parameter is coupled with the {@link Lucene90HnswVectorsFormat} and {@link
-   * Lucene91HnswVectorsFormat} format implementations. The number of candidate neighbors to track
-   * while searching the graph for each newly inserted node. Defaults to to {@link
-   * Lucene91HnswVectorsFormat#DEFAULT_BEAM_WIDTH}. See {@link HnswGraph} for details.
+   * This parameter is coupled with the hnsw algorithm. The number of candidate neighbors to track
+   * while searching the graph for each newly inserted node. See {@link HnswGraph} for details.
    */
   private int hnswBeamWidth;
 
@@ -113,8 +104,9 @@ public class DenseVectorField extends FloatPointField {
             .orElse(DEFAULT_SIMILARITY);
     args.remove(KNN_SIMILARITY_FUNCTION);
 
-    this.codecFormat = args.get(CODEC_FORMAT);
-    args.remove(CODEC_FORMAT);
+    this.knnAlgorithm = args.get(KNN_ALGORITHM);
+
+    args.remove(KNN_ALGORITHM);
 
     this.hnswMaxConn =
         ofNullable(args.get(HNSW_MAX_CONNECTIONS))
@@ -142,8 +134,8 @@ public class DenseVectorField extends FloatPointField {
     return similarityFunction;
   }
 
-  public String getCodecFormat() {
-    return codecFormat;
+  public String getKnnAlgorithm() {
+    return knnAlgorithm;
   }
 
   public Integer getHnswMaxConn() {
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-densevector-codec-hyperparameter.xml b/solr/core/src/test-files/solr/collection1/conf/schema-densevector-codec-hyperparameter.xml
index 8d81e431a57..74034c93d17 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema-densevector-codec-hyperparameter.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-densevector-codec-hyperparameter.xml
@@ -21,9 +21,9 @@
 <schema name="schema-densevector-codec-hyperparameter" version="1.0">
   <fieldType name="string" class="solr.StrField" multiValued="true"/>
   <fieldType name="knn_vector_default" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine"/>
-  <fieldType name="knn_vector" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" codecFormat="Lucene91HnswVectorsFormat" hnswMaxConnections="10" hnswBeamWidth="40"/>
-  <fieldType name="knn_vector2" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" codecFormat="Lucene91HnswVectorsFormat" hnswMaxConnections="6" hnswBeamWidth="60"/>
-  <fieldType name="knn_vector3" class="solr.DenseVectorField" vectorDimension="5" similarityFunction="cosine" codecFormat="Lucene90HnswVectorsFormat" hnswMaxConnections="8" hnswBeamWidth="46"/>
+  <fieldType name="knn_vector" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" knnAlgorithm="hnsw" hnswMaxConnections="10" hnswBeamWidth="40"/>
+  <fieldType name="knn_vector2" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" knnAlgorithm="hnsw" hnswMaxConnections="6" hnswBeamWidth="60"/>
+  <fieldType name="knn_vector3" class="solr.DenseVectorField" vectorDimension="5" similarityFunction="cosine" knnAlgorithm="hnsw" hnswMaxConnections="8" hnswBeamWidth="46"/>
 
 
   <field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
diff --git a/solr/core/src/test/org/apache/solr/schema/DenseVectorFieldTest.java b/solr/core/src/test/org/apache/solr/schema/DenseVectorFieldTest.java
index 61980b4ded6..eb84ee237d1 100644
--- a/solr/core/src/test/org/apache/solr/schema/DenseVectorFieldTest.java
+++ b/solr/core/src/test/org/apache/solr/schema/DenseVectorFieldTest.java
@@ -125,7 +125,7 @@ public class DenseVectorFieldTest extends AbstractBadConfigTestBase {
       DenseVectorField type1 = (DenseVectorField) vector.getType();
       MatcherAssert.assertThat(type1.getSimilarityFunction(), is(VectorSimilarityFunction.COSINE));
       MatcherAssert.assertThat(type1.getDimension(), is(4));
-      MatcherAssert.assertThat(type1.getCodecFormat(), is("Lucene91HnswVectorsFormat"));
+      MatcherAssert.assertThat(type1.getKnnAlgorithm(), is("hnsw"));
       MatcherAssert.assertThat(type1.getHnswMaxConn(), is(10));
       MatcherAssert.assertThat(type1.getHnswBeamWidth(), is(40));
 
@@ -135,7 +135,7 @@ public class DenseVectorFieldTest extends AbstractBadConfigTestBase {
       DenseVectorField type2 = (DenseVectorField) vector2.getType();
       MatcherAssert.assertThat(type2.getSimilarityFunction(), is(VectorSimilarityFunction.COSINE));
       MatcherAssert.assertThat(type2.getDimension(), is(4));
-      MatcherAssert.assertThat(type2.getCodecFormat(), is("Lucene91HnswVectorsFormat"));
+      MatcherAssert.assertThat(type2.getKnnAlgorithm(), is("hnsw"));
       MatcherAssert.assertThat(type2.getHnswMaxConn(), is(6));
       MatcherAssert.assertThat(type2.getHnswBeamWidth(), is(60));
 
@@ -145,7 +145,7 @@ public class DenseVectorFieldTest extends AbstractBadConfigTestBase {
       DenseVectorField type3 = (DenseVectorField) vector3.getType();
       MatcherAssert.assertThat(type3.getSimilarityFunction(), is(VectorSimilarityFunction.COSINE));
       MatcherAssert.assertThat(type3.getDimension(), is(5));
-      MatcherAssert.assertThat(type3.getCodecFormat(), is("Lucene90HnswVectorsFormat"));
+      MatcherAssert.assertThat(type3.getKnnAlgorithm(), is("hnsw"));
       MatcherAssert.assertThat(type3.getHnswMaxConn(), is(8));
       MatcherAssert.assertThat(type3.getHnswBeamWidth(), is(46));
 
@@ -156,7 +156,7 @@ public class DenseVectorFieldTest extends AbstractBadConfigTestBase {
       MatcherAssert.assertThat(
           typeDefault.getSimilarityFunction(), is(VectorSimilarityFunction.COSINE));
       MatcherAssert.assertThat(typeDefault.getDimension(), is(4));
-      assertNull(typeDefault.getCodecFormat());
+      assertNull(typeDefault.getKnnAlgorithm());
       MatcherAssert.assertThat(typeDefault.getHnswMaxConn(), is(16));
       MatcherAssert.assertThat(typeDefault.getHnswBeamWidth(), is(100));
     } finally {
diff --git a/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc b/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc
index af4a8ceb8ad..d868ea21381 100644
--- a/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc
+++ b/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc
@@ -106,32 +106,27 @@ and the hyper-parameter of the HNSW algorithm make sure you set this configurati
 <codecFactory class="solr.SchemaCodecFactory"/>
 ...
 
-Here's how `DenseVectorField` can be configured with the advanced codec hyper-parameters:
+Here's how `DenseVectorField` can be configured with the advanced hyper-parameters:
 
 [source,xml]
-<fieldType name="knn_vector" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" codecFormat="Lucene91HnswVectorsFormat" hnswMaxConnections="10" hnswBeamWidth="40"/>
+<fieldType name="knn_vector" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" knnAlgorithm="hnsw" hnswMaxConnections="10" hnswBeamWidth="40"/>
 <field name="vector" type="knn_vector" indexed="true" stored="true"/>
 
-`codecFormat`::
+`knnAlgorithm`::
 +
 [%autowidth,frame=none]
 |===
-|Optional |Default: `Lucene91HnswVectorsFormat`
+|Optional |Default: `hnsw`
 |===
 +
-(advanced) Specifies the knn codec implementation to use
+(advanced) Specifies the underlying knn algorithm to use
 +
 
-Accepted values: `Lucene90HnswVectorsFormat`, `Lucene91HnswVectorsFormat` .
+Accepted values: `hnsw`.
 
-Please note that the `codecFormat` accepted values may change in future releases.
+Please note that the `knnAlgorithm` accepted values may change in future releases.
 
 
-
-[NOTE]
-Lucene index back-compatibility is only supported for the default codec.
-If you choose to customize the `codecFormat` in your schema, upgrading to a future version of Solr may require you to either switch back to the default codec and optimize your index to rewrite it into the default codec before upgrading, or re-build your entire index from scratch after upgrading.
-
 `hnswMaxConnections`::
 +
 [%autowidth,frame=none]
@@ -139,7 +134,7 @@ If you choose to customize the `codecFormat` in your schema, upgrading to a futu
 |Optional |Default: `16`
 |===
 +
-(advanced) This parameter is specific for the `Lucene90HnswVectorsFormat` and `Lucene91HnswVectorsFormat` codec formats:
+(advanced) This parameter is specific to the `hnsw` knn algorithm:
 +
 Controls how many of the nearest neighbor candidates are connected to the new node.
 +
@@ -155,7 +150,7 @@ Any integer.
 |Optional |Default: `100`
 |===
 +
-(advanced) This parameter is specific for the `Lucene90HnswVectorsFormat` and `Lucene91HnswVectorsFormat` codec formats:
+(advanced) This parameter is specific to the `hnsw` knn algorithm:
 +
 It is the number of nearest neighbor candidates to track while searching the graph for each newly inserted node.
 +