You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by ab...@apache.org on 2023/06/14 14:53:55 UTC

[solr] branch branch_9x updated: SOLR-16836: introduce support for high dimensional vectors (#1680)

This is an automated email from the ASF dual-hosted git repository.

abenedetti pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/branch_9x by this push:
     new f0dac9c48e4 SOLR-16836: introduce support for high dimensional vectors (#1680)
f0dac9c48e4 is described below

commit f0dac9c48e4c4affeb754aecece6867fc861e9da
Author: Alessandro Benedetti <a....@sease.io>
AuthorDate: Wed Jun 14 15:37:17 2023 +0100

    SOLR-16836: introduce support for high dimensional vectors (#1680)
---
 solr/CHANGES.txt                                   |  2 +
 .../org/apache/solr/schema/DenseVectorField.java   | 67 ++++++++++++++++-
 ... => schema-densevector-high-dimensionality.xml} |  8 +-
 .../solr/collection1/conf/schema-densevector.xml   |  9 ++-
 .../apache/solr/schema/DenseVectorFieldTest.java   | 21 ++++++
 .../apache/solr/search/neural/KnnQParserTest.java  | 87 ++++++++++++++++++++++
 .../query-guide/pages/dense-vector-search.adoc     |  2 +-
 7 files changed, 184 insertions(+), 12 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 5d1fa87757b..ff5e53c2237 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -20,6 +20,8 @@ New Features
 * SOLR-16719: AffinityPlacementFactory now supports spreading replicas across domains within the availablity zone and
   optionally fail the request if more than a configurable number of replicas need to be placed in a single domain. (Houston Putman, Tomás Fernández Löbbe)
 
+* SOLR-16836: Introduced support for high dimensional vectors (Alessandro Benedetti).
+
 Improvements
 ---------------------
 
diff --git a/solr/core/src/java/org/apache/solr/schema/DenseVectorField.java b/solr/core/src/java/org/apache/solr/schema/DenseVectorField.java
index b57dc39d430..4ab30fd72a8 100644
--- a/solr/core/src/java/org/apache/solr/schema/DenseVectorField.java
+++ b/solr/core/src/java/org/apache/solr/schema/DenseVectorField.java
@@ -20,13 +20,17 @@ import static java.util.Optional.ofNullable;
 import static org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat.DEFAULT_BEAM_WIDTH;
 import static org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat.DEFAULT_MAX_CONN;
 
+import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.KnnByteVectorField;
 import org.apache.lucene.document.KnnFloatVectorField;
 import org.apache.lucene.document.StoredField;
+import org.apache.lucene.index.ByteVectorValues;
+import org.apache.lucene.index.FloatVectorValues;
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.VectorEncoding;
 import org.apache.lucene.index.VectorSimilarityFunction;
@@ -43,6 +47,8 @@ import org.apache.solr.uninverting.UninvertingReader;
 import org.apache.solr.util.vector.ByteDenseVectorParser;
 import org.apache.solr.util.vector.DenseVectorParser;
 import org.apache.solr.util.vector.FloatDenseVectorParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Provides a field type to support Lucene's {@link org.apache.lucene.document.KnnVectorField}. See
@@ -53,6 +59,7 @@ import org.apache.solr.util.vector.FloatDenseVectorParser;
  * Only {@code Indexed} and {@code Stored} attributes are supported.
  */
 public class DenseVectorField extends FloatPointField {
+  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
   public static final String HNSW_ALGORITHM = "hnsw";
   public static final String DEFAULT_KNN_ALGORITHM = HNSW_ALGORITHM;
   static final String KNN_VECTOR_DIMENSION = "vectorDimension";
@@ -182,6 +189,31 @@ public class DenseVectorField extends FloatPointField {
           SolrException.ErrorCode.SERVER_ERROR,
           getClass().getSimpleName() + " fields can not have docValues: " + field.getName());
     }
+
+    switch (vectorEncoding) {
+      case FLOAT32:
+        if (dimension > FloatVectorValues.MAX_DIMENSIONS) {
+          if (log.isWarnEnabled()) {
+            log.warn(
+                "The vector dimension {} specified for field {} exceeds the current Lucene default max dimension of {}. It's un-tested territory, extra caution and benchmarks are recommended for production systems.",
+                dimension,
+                field.getName(),
+                FloatVectorValues.MAX_DIMENSIONS);
+          }
+        }
+        break;
+      case BYTE:
+        if (dimension > ByteVectorValues.MAX_DIMENSIONS) {
+          if (log.isWarnEnabled()) {
+            log.warn(
+                "The vector dimension {} specified for field {} exceeds the current Lucene default max dimension of {}. It's un-tested territory, extra caution and benchmarks are recommended for production systems.",
+                dimension,
+                field.getName(),
+                ByteVectorValues.MAX_DIMENSIONS);
+          }
+        }
+        break;
+    }
   }
 
   @Override
@@ -218,15 +250,17 @@ public class DenseVectorField extends FloatPointField {
 
   @Override
   public IndexableField createField(SchemaField field, Object vectorValue) {
+    FieldType denseVectorFieldType = getDenseVectorFieldType();
+
     if (vectorValue == null) return null;
     DenseVectorParser vectorBuilder = (DenseVectorParser) vectorValue;
     switch (vectorEncoding) {
       case BYTE:
         return new KnnByteVectorField(
-            field.getName(), vectorBuilder.getByteVector(), similarityFunction);
+            field.getName(), vectorBuilder.getByteVector(), denseVectorFieldType);
       case FLOAT32:
         return new KnnFloatVectorField(
-            field.getName(), vectorBuilder.getFloatVector(), similarityFunction);
+            field.getName(), vectorBuilder.getFloatVector(), denseVectorFieldType);
       default:
         throw new SolrException(
             SolrException.ErrorCode.SERVER_ERROR,
@@ -234,6 +268,35 @@ public class DenseVectorField extends FloatPointField {
     }
   }
 
+  /**
+   * Creates the Lucene {@link FieldType} handed to the KNN vector field constructors. Its vector
+   * getters are overridden to report this field type's configured dimension, encoding and
+   * similarity function, which is what currently allows dimensions above Lucene's hard-coded
+   * maximum. N.B. this may stop working and need changes when adopting future Lucene releases.
+   *
+   * @return a FieldType compatible with Dense vectors
+   */
+  private FieldType getDenseVectorFieldType() {
+    // The getters are overridden on an anonymous subclass; a new instance is built on every
+    // call, nothing is cached here.
+    return new FieldType() {
+      @Override
+      public VectorSimilarityFunction vectorSimilarityFunction() {
+        return similarityFunction;
+      }
+
+      @Override
+      public VectorEncoding vectorEncoding() {
+        return vectorEncoding;
+      }
+
+      @Override
+      public int vectorDimension() {
+        return dimension;
+      }
+    };
+  }
+
   @Override
   public Object toObject(IndexableField f) {
     if (vectorEncoding.equals(VectorEncoding.BYTE)) {
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-densevector.xml b/solr/core/src/test-files/solr/collection1/conf/schema-densevector-high-dimensionality.xml
similarity index 86%
copy from solr/core/src/test-files/solr/collection1/conf/schema-densevector.xml
copy to solr/core/src/test-files/solr/collection1/conf/schema-densevector-high-dimensionality.xml
index 7e6cfdbf328..aff9f94f20e 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema-densevector.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-densevector-high-dimensionality.xml
@@ -20,15 +20,11 @@
 
 <schema name="schema-densevector" version="1.0">
   <fieldType name="string" class="solr.StrField" multiValued="true"/>  
-  <fieldType name="knn_vector" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" />
+  <fieldType name="knn_vector" class="solr.DenseVectorField" vectorDimension="2048" similarityFunction="cosine" />
   <fieldType name="plong" class="solr.LongPointField" useDocValuesAsStored="false"/>
-
-  <fieldType name="knn_vector_byte_encoding" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" vectorEncoding="BYTE"/>
-
+  
   <field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
   <field name="vector" type="knn_vector" indexed="true" stored="true"/>
-  <field name="vector2" type="knn_vector" indexed="true" stored="true"/>
-  <field name="vector_byte_encoding" type="knn_vector_byte_encoding" indexed="true" stored="true" />
   <field name="string_field" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
 
   <field name="_version_" type="plong" indexed="true" stored="true" multiValued="false" />
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-densevector.xml b/solr/core/src/test-files/solr/collection1/conf/schema-densevector.xml
index 7e6cfdbf328..42db078a6e2 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema-densevector.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-densevector.xml
@@ -21,14 +21,17 @@
 <schema name="schema-densevector" version="1.0">
   <fieldType name="string" class="solr.StrField" multiValued="true"/>  
   <fieldType name="knn_vector" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" />
-  <fieldType name="plong" class="solr.LongPointField" useDocValuesAsStored="false"/>
-
   <fieldType name="knn_vector_byte_encoding" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" vectorEncoding="BYTE"/>
-
+  <fieldType name="high_dimensional_float_knn_vector" class="solr.DenseVectorField" vectorDimension="2048" similarityFunction="cosine" vectorEncoding="FLOAT32"/>
+  <fieldType name="high_dimensional_byte_knn_vector" class="solr.DenseVectorField" vectorDimension="2048" similarityFunction="cosine" vectorEncoding="BYTE"/>
+  <fieldType name="plong" class="solr.LongPointField" useDocValuesAsStored="false"/>
+  
   <field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
   <field name="vector" type="knn_vector" indexed="true" stored="true"/>
   <field name="vector2" type="knn_vector" indexed="true" stored="true"/>
   <field name="vector_byte_encoding" type="knn_vector_byte_encoding" indexed="true" stored="true" />
+  <field name="2048_byte_vector" type="high_dimensional_byte_knn_vector" indexed="true" stored="true" />
+  <field name="2048_float_vector" type="high_dimensional_float_knn_vector" indexed="true" stored="true" />
   <field name="string_field" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
 
   <field name="_version_" type="plong" indexed="true" stored="true" multiValued="false" />
diff --git a/solr/core/src/test/org/apache/solr/schema/DenseVectorFieldTest.java b/solr/core/src/test/org/apache/solr/schema/DenseVectorFieldTest.java
index 55f05efb093..ed501a65f7c 100644
--- a/solr/core/src/test/org/apache/solr/schema/DenseVectorFieldTest.java
+++ b/solr/core/src/test/org/apache/solr/schema/DenseVectorFieldTest.java
@@ -18,7 +18,9 @@ package org.apache.solr.schema;
 
 import static org.hamcrest.core.Is.is;
 
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.List;
 import java.util.Map;
 import org.apache.lucene.index.VectorEncoding;
 import org.apache.lucene.index.VectorSimilarityFunction;
@@ -456,6 +458,25 @@ public class DenseVectorFieldTest extends AbstractBadConfigTestBase {
     }
   }
 
+  /** Indexes one document whose "vector" field holds the 2048 floats 0.0, 1.0, ..., 2047.0. */
+  @Test
+  public void indexing_highDimensionalityVectorDocument_shouldBeIndexed() throws Exception {
+    try {
+      initCore("solrconfig-basic.xml", "schema-densevector-high-dimensionality.xml");
+
+      List<Float> vectorValues = new ArrayList<>();
+      for (int component = 0; component < 2048; component++) {
+        vectorValues.add((float) component);
+      }
+      SolrInputDocument document = new SolrInputDocument();
+      document.addField("id", "0");
+      document.addField("vector", vectorValues);
+      assertU(adoc(document));
+    } finally {
+      deleteCore();
+    }
+  }
+
   @Test
   public void query_vectorFloatEncoded_storedField_shouldBeReturnedInResults() throws Exception {
     try {
diff --git a/solr/core/src/test/org/apache/solr/search/neural/KnnQParserTest.java b/solr/core/src/test/org/apache/solr/search/neural/KnnQParserTest.java
index 13c694080c9..92a05ce5593 100644
--- a/solr/core/src/test/org/apache/solr/search/neural/KnnQParserTest.java
+++ b/solr/core/src/test/org/apache/solr/search/neural/KnnQParserTest.java
@@ -20,6 +20,7 @@ import static org.apache.solr.search.neural.KnnQParser.DEFAULT_TOP_K;
 
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.common.SolrException;
@@ -214,6 +215,92 @@ public class KnnQParserTest extends SolrTestCaseJ4 {
         "//result/doc[3]/str[@name='id'][.='12']");
   }
 
+  /** Indexes the 2048-dimension float vector docs and expects the top-1 KNN hit to be id 1. */
+  @Test
+  public void highDimensionFloatVectorField_shouldSearchOnThatField() {
+    final int dimension = 2048;
+    for (SolrInputDocument doc : this.prepareHighDimensionFloatVectorsDocs(dimension)) {
+      assertU(adoc(doc));
+    }
+    assertU(commit());
+
+    float[] queryVector = new float[dimension];
+    for (int position = 0; position < queryVector.length; position++) {
+      queryVector[position] = position;
+    }
+    String knnQuery = "{!knn f=2048_float_vector topK=1}" + Arrays.toString(queryVector);
+
+    assertQ(
+        req(CommonParams.Q, knnQuery, "fl", "id"),
+        "//result[@numFound='1']",
+        "//result/doc[1]/str[@name='id'][.='1']");
+  }
+
+  /** Indexes the 2048-dimension byte vector docs and expects the top-1 KNN hit to be id 1. */
+  @Test
+  public void highDimensionByteVectorField_shouldSearchOnThatField() {
+    final int dimension = 2048;
+    for (SolrInputDocument doc : this.prepareHighDimensionByteVectorsDocs(dimension)) {
+      assertU(adoc(doc));
+    }
+    assertU(commit());
+
+    byte[] queryVector = new byte[dimension];
+    for (int position = 0; position < queryVector.length; position++) {
+      queryVector[position] = (byte) (position % 127);
+    }
+    String knnQuery = "{!knn f=2048_byte_vector topK=1}" + Arrays.toString(queryVector);
+
+    assertQ(
+        req(CommonParams.Q, knnQuery, "fl", "id"),
+        "//result[@numFound='1']",
+        "//result/doc[1]/str[@name='id'][.='1']");
+  }
+
+  /**
+   * Builds 13 documents (ids 1..13, returned in reverse order) for "2048_float_vector". Document
+   * i (0-based) holds the highDimension consecutive integers starting at i * highDimension.
+   */
+  private List<SolrInputDocument> prepareHighDimensionFloatVectorsDocs(int highDimension) {
+    int docsCount = 13;
+    List<SolrInputDocument> docs = new ArrayList<>(docsCount);
+    for (int i = 0; i < docsCount; i++) {
+      SolrInputDocument doc = new SolrInputDocument();
+      doc.addField(IDField, i + 1);
+      List<Integer> highDimensionalityVector = new ArrayList<>(highDimension);
+      // The bound must move with the start offset, otherwise only document 0 gets any values.
+      for (int j = i * highDimension; j < (i + 1) * highDimension; j++) {
+        highDimensionalityVector.add(j);
+      }
+      doc.addField("2048_float_vector", highDimensionalityVector);
+      docs.add(doc);
+    }
+    Collections.reverse(docs);
+    return docs;
+  }
+
+  /**
+   * Builds 13 documents (ids 1..13, returned in reverse order) for "2048_byte_vector". Document
+   * i (0-based) holds (j % 127) for the highDimension integers j starting at i * highDimension.
+   */
+  private List<SolrInputDocument> prepareHighDimensionByteVectorsDocs(int highDimension) {
+    int docsCount = 13;
+    List<SolrInputDocument> docs = new ArrayList<>(docsCount);
+    for (int i = 0; i < docsCount; i++) {
+      SolrInputDocument doc = new SolrInputDocument();
+      doc.addField(IDField, i + 1);
+      List<Integer> highDimensionalityVector = new ArrayList<>(highDimension);
+      // The bound must move with the start offset, otherwise only document 0 gets any values.
+      for (int j = i * highDimension; j < (i + 1) * highDimension; j++) {
+        highDimensionalityVector.add(j % 127);
+      }
+      doc.addField("2048_byte_vector", highDimensionalityVector);
+      docs.add(doc);
+    }
+    Collections.reverse(docs);
+    return docs;
+  }
+
   @Test
   public void vectorByteEncodingField_shouldSearchOnThatField() {
     String vectorToSearch = "[2, 2, 1, 3]";
diff --git a/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc b/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc
index 8cc75bed709..10b49ccc07c 100644
--- a/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc
+++ b/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc
@@ -75,7 +75,7 @@ s|Required |Default: none
 The dimension of the dense vector to pass in.
 +
 Accepted values:
-Any integer < = `1024`.
+Any integer.
 
 `similarityFunction`::
 +