You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by ab...@apache.org on 2023/06/14 14:37:24 UTC
[solr] branch main updated: SOLR-16836: introduce support for high dimensional vectors (#1680)
This is an automated email from the ASF dual-hosted git repository.
abenedetti pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/main by this push:
new cedb246ee40 SOLR-16836: introduce support for high dimensional vectors (#1680)
cedb246ee40 is described below
commit cedb246ee407d0ca38b72634de6c43de53780011
Author: Alessandro Benedetti <a....@sease.io>
AuthorDate: Wed Jun 14 15:37:17 2023 +0100
SOLR-16836: introduce support for high dimensional vectors (#1680)
---
solr/CHANGES.txt | 2 +
.../org/apache/solr/schema/DenseVectorField.java | 67 ++++++++++++++++-
... => schema-densevector-high-dimensionality.xml} | 8 +-
.../solr/collection1/conf/schema-densevector.xml | 9 ++-
.../apache/solr/schema/DenseVectorFieldTest.java | 21 ++++++
.../apache/solr/search/neural/KnnQParserTest.java | 87 ++++++++++++++++++++++
.../query-guide/pages/dense-vector-search.adoc | 2 +-
7 files changed, 184 insertions(+), 12 deletions(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 0d61771348b..ba8ee395994 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -63,6 +63,8 @@ New Features
* SOLR-16719: AffinityPlacementFactory now supports spreading replicas across domains within the availablity zone and
optionally fail the request if more than a configurable number of replicas need to be placed in a single domain. (Houston Putman, Tomás Fernández Löbbe)
+* SOLR-16836: Introduced support for high dimensional vectors (Alessandro Benedetti).
+
Improvements
---------------------
diff --git a/solr/core/src/java/org/apache/solr/schema/DenseVectorField.java b/solr/core/src/java/org/apache/solr/schema/DenseVectorField.java
index b57dc39d430..4ab30fd72a8 100644
--- a/solr/core/src/java/org/apache/solr/schema/DenseVectorField.java
+++ b/solr/core/src/java/org/apache/solr/schema/DenseVectorField.java
@@ -20,13 +20,17 @@ import static java.util.Optional.ofNullable;
import static org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat.DEFAULT_BEAM_WIDTH;
import static org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat.DEFAULT_MAX_CONN;
+import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
+import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.KnnByteVectorField;
import org.apache.lucene.document.KnnFloatVectorField;
import org.apache.lucene.document.StoredField;
+import org.apache.lucene.index.ByteVectorValues;
+import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.VectorEncoding;
import org.apache.lucene.index.VectorSimilarityFunction;
@@ -43,6 +47,8 @@ import org.apache.solr.uninverting.UninvertingReader;
import org.apache.solr.util.vector.ByteDenseVectorParser;
import org.apache.solr.util.vector.DenseVectorParser;
import org.apache.solr.util.vector.FloatDenseVectorParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* Provides a field type to support Lucene's {@link org.apache.lucene.document.KnnVectorField}. See
@@ -53,6 +59,7 @@ import org.apache.solr.util.vector.FloatDenseVectorParser;
* Only {@code Indexed} and {@code Stored} attributes are supported.
*/
public class DenseVectorField extends FloatPointField {
+ private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
public static final String HNSW_ALGORITHM = "hnsw";
public static final String DEFAULT_KNN_ALGORITHM = HNSW_ALGORITHM;
static final String KNN_VECTOR_DIMENSION = "vectorDimension";
@@ -182,6 +189,31 @@ public class DenseVectorField extends FloatPointField {
SolrException.ErrorCode.SERVER_ERROR,
getClass().getSimpleName() + " fields can not have docValues: " + field.getName());
}
+
+ switch (vectorEncoding) {
+ case FLOAT32:
+ if (dimension > FloatVectorValues.MAX_DIMENSIONS) {
+ if (log.isWarnEnabled()) {
+ log.warn(
+ "The vector dimension {} specified for field {} exceeds the current Lucene default max dimension of {}. It's un-tested territory, extra caution and benchmarks are recommended for production systems.",
+ dimension,
+ field.getName(),
+ FloatVectorValues.MAX_DIMENSIONS);
+ }
+ }
+ break;
+ case BYTE:
+ if (dimension > ByteVectorValues.MAX_DIMENSIONS) {
+ if (log.isWarnEnabled()) {
+ log.warn(
+ "The vector dimension {} specified for field {} exceeds the current Lucene default max dimension of {}. It's un-tested territory, extra caution and benchmarks are recommended for production systems.",
+ dimension,
+ field.getName(),
+ ByteVectorValues.MAX_DIMENSIONS);
+ }
+ }
+ break;
+ }
}
@Override
@@ -218,15 +250,17 @@ public class DenseVectorField extends FloatPointField {
@Override
public IndexableField createField(SchemaField field, Object vectorValue) {
+ FieldType denseVectorFieldType = getDenseVectorFieldType();
+
if (vectorValue == null) return null;
DenseVectorParser vectorBuilder = (DenseVectorParser) vectorValue;
switch (vectorEncoding) {
case BYTE:
return new KnnByteVectorField(
- field.getName(), vectorBuilder.getByteVector(), similarityFunction);
+ field.getName(), vectorBuilder.getByteVector(), denseVectorFieldType);
case FLOAT32:
return new KnnFloatVectorField(
- field.getName(), vectorBuilder.getFloatVector(), similarityFunction);
+ field.getName(), vectorBuilder.getFloatVector(), denseVectorFieldType);
default:
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR,
@@ -234,6 +268,35 @@ public class DenseVectorField extends FloatPointField {
}
}
+ /**
+  * Creates the Lucene {@link FieldType} that {@code createField} passes to the KNN vector field
+  * constructors.
+  *
+  * <p>The returned type reports this field's configured {@code dimension}, {@code vectorEncoding}
+  * and {@code similarityFunction}. It exists so that dimensions above Lucene's hard-coded maximum
+  * can be used; the accessors are overridden rather than configured on the instance, which is
+  * presumably what sidesteps Lucene's own dimension check (confirm against the Lucene version in
+  * use). N.B. this workaround may stop working, and need changes, when adopting future Lucene
+  * releases.
+  *
+  * @return a FieldType compatible with dense vectors
+  */
+ private FieldType getDenseVectorFieldType() {
+   return new FieldType() {
+     @Override
+     public VectorSimilarityFunction vectorSimilarityFunction() {
+       return similarityFunction;
+     }
+
+     @Override
+     public VectorEncoding vectorEncoding() {
+       return vectorEncoding;
+     }
+
+     @Override
+     public int vectorDimension() {
+       return dimension;
+     }
+   };
+ }
+
@Override
public Object toObject(IndexableField f) {
if (vectorEncoding.equals(VectorEncoding.BYTE)) {
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-densevector.xml b/solr/core/src/test-files/solr/collection1/conf/schema-densevector-high-dimensionality.xml
similarity index 86%
copy from solr/core/src/test-files/solr/collection1/conf/schema-densevector.xml
copy to solr/core/src/test-files/solr/collection1/conf/schema-densevector-high-dimensionality.xml
index 7e6cfdbf328..aff9f94f20e 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema-densevector.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-densevector-high-dimensionality.xml
@@ -20,15 +20,11 @@
<schema name="schema-densevector" version="1.0">
<fieldType name="string" class="solr.StrField" multiValued="true"/>
- <fieldType name="knn_vector" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" />
+ <fieldType name="knn_vector" class="solr.DenseVectorField" vectorDimension="2048" similarityFunction="cosine" />
<fieldType name="plong" class="solr.LongPointField" useDocValuesAsStored="false"/>
-
- <fieldType name="knn_vector_byte_encoding" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" vectorEncoding="BYTE"/>
-
+
<field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="vector" type="knn_vector" indexed="true" stored="true"/>
- <field name="vector2" type="knn_vector" indexed="true" stored="true"/>
- <field name="vector_byte_encoding" type="knn_vector_byte_encoding" indexed="true" stored="true" />
<field name="string_field" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="_version_" type="plong" indexed="true" stored="true" multiValued="false" />
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-densevector.xml b/solr/core/src/test-files/solr/collection1/conf/schema-densevector.xml
index 7e6cfdbf328..42db078a6e2 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema-densevector.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-densevector.xml
@@ -21,14 +21,17 @@
<schema name="schema-densevector" version="1.0">
<fieldType name="string" class="solr.StrField" multiValued="true"/>
<fieldType name="knn_vector" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" />
- <fieldType name="plong" class="solr.LongPointField" useDocValuesAsStored="false"/>
-
<fieldType name="knn_vector_byte_encoding" class="solr.DenseVectorField" vectorDimension="4" similarityFunction="cosine" vectorEncoding="BYTE"/>
-
+ <fieldType name="high_dimensional_float_knn_vector" class="solr.DenseVectorField" vectorDimension="2048" similarityFunction="cosine" vectorEncoding="FLOAT32"/>
+ <fieldType name="high_dimensional_byte_knn_vector" class="solr.DenseVectorField" vectorDimension="2048" similarityFunction="cosine" vectorEncoding="BYTE"/>
+ <fieldType name="plong" class="solr.LongPointField" useDocValuesAsStored="false"/>
+
<field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="vector" type="knn_vector" indexed="true" stored="true"/>
<field name="vector2" type="knn_vector" indexed="true" stored="true"/>
<field name="vector_byte_encoding" type="knn_vector_byte_encoding" indexed="true" stored="true" />
+ <field name="2048_byte_vector" type="high_dimensional_byte_knn_vector" indexed="true" stored="true" />
+ <field name="2048_float_vector" type="high_dimensional_float_knn_vector" indexed="true" stored="true" />
<field name="string_field" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="_version_" type="plong" indexed="true" stored="true" multiValued="false" />
diff --git a/solr/core/src/test/org/apache/solr/schema/DenseVectorFieldTest.java b/solr/core/src/test/org/apache/solr/schema/DenseVectorFieldTest.java
index 55f05efb093..ed501a65f7c 100644
--- a/solr/core/src/test/org/apache/solr/schema/DenseVectorFieldTest.java
+++ b/solr/core/src/test/org/apache/solr/schema/DenseVectorFieldTest.java
@@ -18,7 +18,9 @@ package org.apache.solr.schema;
import static org.hamcrest.core.Is.is;
+import java.util.ArrayList;
import java.util.Arrays;
+import java.util.List;
import java.util.Map;
import org.apache.lucene.index.VectorEncoding;
import org.apache.lucene.index.VectorSimilarityFunction;
@@ -456,6 +458,25 @@ public class DenseVectorFieldTest extends AbstractBadConfigTestBase {
}
}
+ @Test
+ public void indexing_highDimensionalityVectorDocument_shouldBeIndexed() throws Exception {
+   try {
+     initCore("solrconfig-basic.xml", "schema-densevector-high-dimensionality.xml");
+
+     // 2048 components, matching the vectorDimension declared for "knn_vector" in the
+     // high-dimensionality schema; component k holds the value k.
+     List<Float> vectorValues = new ArrayList<>();
+     int component = 0;
+     while (component < 2048) {
+       vectorValues.add((float) component);
+       component++;
+     }
+
+     SolrInputDocument highDimensionalDoc = new SolrInputDocument();
+     highDimensionalDoc.addField("id", "0");
+     highDimensionalDoc.addField("vector", vectorValues);
+
+     // The test passes when the add request is accepted.
+     assertU(adoc(highDimensionalDoc));
+   } finally {
+     deleteCore();
+   }
+ }
+
@Test
public void query_vectorFloatEncoded_storedField_shouldBeReturnedInResults() throws Exception {
try {
diff --git a/solr/core/src/test/org/apache/solr/search/neural/KnnQParserTest.java b/solr/core/src/test/org/apache/solr/search/neural/KnnQParserTest.java
index 13c694080c9..92a05ce5593 100644
--- a/solr/core/src/test/org/apache/solr/search/neural/KnnQParserTest.java
+++ b/solr/core/src/test/org/apache/solr/search/neural/KnnQParserTest.java
@@ -20,6 +20,7 @@ import static org.apache.solr.search.neural.KnnQParser.DEFAULT_TOP_K;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
import java.util.List;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrException;
@@ -214,6 +215,92 @@ public class KnnQParserTest extends SolrTestCaseJ4 {
"//result/doc[3]/str[@name='id'][.='12']");
}
+ @Test
+ public void highDimensionFloatVectorField_shouldSearchOnThatField() {
+   final int dimension = 2048;
+
+   // Index the prepared high-dimensional documents and make them searchable.
+   for (SolrInputDocument doc : prepareHighDimensionFloatVectorsDocs(dimension)) {
+     assertU(adoc(doc));
+   }
+   assertU(commit());
+
+   // Query vector: component k holds the value k.
+   float[] queryVector = new float[dimension];
+   int component = 0;
+   while (component < dimension) {
+     queryVector[component] = component;
+     component++;
+   }
+   String knnQuery = "{!knn f=2048_float_vector topK=1}" + Arrays.toString(queryVector);
+
+   // topK=1: exactly one hit is expected, and it must be the document with id 1.
+   assertQ(
+       req(CommonParams.Q, knnQuery, "fl", "id"),
+       "//result[@numFound='1']",
+       "//result/doc[1]/str[@name='id'][.='1']");
+ }
+
+ @Test
+ public void highDimensionByteVectorField_shouldSearchOnThatField() {
+   final int dimension = 2048;
+
+   // Index the prepared high-dimensional documents and make them searchable.
+   for (SolrInputDocument doc : prepareHighDimensionByteVectorsDocs(dimension)) {
+     assertU(adoc(doc));
+   }
+   assertU(commit());
+
+   // Query vector: component k holds k % 127.
+   byte[] queryVector = new byte[dimension];
+   int component = 0;
+   while (component < dimension) {
+     queryVector[component] = (byte) (component % 127);
+     component++;
+   }
+   String knnQuery = "{!knn f=2048_byte_vector topK=1}" + Arrays.toString(queryVector);
+
+   // topK=1: exactly one hit is expected, and it must be the document with id 1.
+   assertQ(
+       req(CommonParams.Q, knnQuery, "fl", "id"),
+       "//result[@numFound='1']",
+       "//result/doc[1]/str[@name='id'][.='1']");
+ }
+
+ /**
+  * Builds 13 documents with ids 1..13, each carrying a {@code highDimension}-component vector in
+  * the {@code 2048_float_vector} field.
+  *
+  * <p>The document with id {@code n} holds the {@code highDimension} consecutive integers
+  * starting at {@code (n - 1) * highDimension}; id 1 therefore holds
+  * {@code 0..highDimension - 1}.
+  *
+  * @param highDimension number of components in every vector
+  * @return the documents in descending id order (the list is reversed before returning)
+  */
+ private List<SolrInputDocument> prepareHighDimensionFloatVectorsDocs(int highDimension) {
+   int docsCount = 13;
+   String field = "2048_float_vector";
+   List<SolrInputDocument> docs = new ArrayList<>(docsCount);
+
+   for (int i = 0; i < docsCount; i++) {
+     int firstValue = i * highDimension;
+     List<Integer> highDimensionalityVector = new ArrayList<>(highDimension);
+     // The upper bound is relative to firstValue. Bounding by highDimension alone made this
+     // loop a no-op for every document after the first, leaving their vectors empty.
+     for (int j = firstValue; j < firstValue + highDimension; j++) {
+       highDimensionalityVector.add(j);
+     }
+
+     SolrInputDocument doc = new SolrInputDocument();
+     doc.addField(IDField, i + 1);
+     doc.addField(field, highDimensionalityVector);
+     docs.add(doc);
+   }
+   Collections.reverse(docs);
+   return docs;
+ }
+
+ /**
+  * Builds 13 documents with ids 1..13, each carrying a {@code highDimension}-component vector in
+  * the {@code 2048_byte_vector} field.
+  *
+  * <p>Component {@code k} of the document with id {@code n} is
+  * {@code ((n - 1) * highDimension + k) % 127}, so every value lies in {@code 0..126}.
+  *
+  * @param highDimension number of components in every vector
+  * @return the documents in descending id order (the list is reversed before returning)
+  */
+ private List<SolrInputDocument> prepareHighDimensionByteVectorsDocs(int highDimension) {
+   int docsCount = 13;
+   String field = "2048_byte_vector";
+   List<SolrInputDocument> docs = new ArrayList<>(docsCount);
+
+   for (int i = 0; i < docsCount; i++) {
+     int firstValue = i * highDimension;
+     List<Integer> highDimensionalityVector = new ArrayList<>(highDimension);
+     // The upper bound is relative to firstValue. Bounding by highDimension alone made this
+     // loop a no-op for every document after the first, leaving their vectors empty.
+     for (int j = firstValue; j < firstValue + highDimension; j++) {
+       highDimensionalityVector.add(j % 127);
+     }
+
+     SolrInputDocument doc = new SolrInputDocument();
+     doc.addField(IDField, i + 1);
+     doc.addField(field, highDimensionalityVector);
+     docs.add(doc);
+   }
+   Collections.reverse(docs);
+   return docs;
+ }
+
@Test
public void vectorByteEncodingField_shouldSearchOnThatField() {
String vectorToSearch = "[2, 2, 1, 3]";
diff --git a/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc b/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc
index 8cc75bed709..10b49ccc07c 100644
--- a/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc
+++ b/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc
@@ -75,7 +75,7 @@ s|Required |Default: none
The dimension of the dense vector to pass in.
+
Accepted values:
-Any integer < = `1024`.
+Any integer.
`similarityFunction`::
+