You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by ab...@apache.org on 2022/07/20 15:47:21 UTC

[solr] branch branch_9x updated: SOLR-16246: Introduced pre-filtering in KnnQParser (#904)

This is an automated email from the ASF dual-hosted git repository.

abenedetti pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/branch_9x by this push:
     new ba58f521657 SOLR-16246: Introduced pre-filtering in KnnQParser (#904)
ba58f521657 is described below

commit ba58f5216578e714e309e78b53b7551cd92fb7d7
Author: Elia Porciani <e....@sease.io>
AuthorDate: Wed Jul 20 17:44:13 2022 +0200

    SOLR-16246: Introduced pre-filtering in KnnQParser (#904)
    
    * Introduced pre-filtering in Knn Query Parser
---
 solr/CHANGES.txt                                   |  2 +
 .../apache/solr/handler/MoreLikeThisHandler.java   | 12 +----
 .../solr/handler/component/QueryComponent.java     | 10 ++--
 .../handler/component/RealTimeGetComponent.java    | 10 +---
 .../org/apache/solr/schema/DenseVectorField.java   |  5 +-
 .../java/org/apache/solr/search/QueryUtils.java    | 40 ++++++++++++++++
 .../org/apache/solr/search/neural/KnnQParser.java  | 38 ++++++++++++++-
 .../apache/solr/search/neural/KnnQParserTest.java  | 54 +++++++++++++---------
 .../org/apache/solr/handler/AnalyticsHandler.java  | 11 +----
 .../query-guide/pages/dense-vector-search.adoc     | 14 ++++--
 10 files changed, 133 insertions(+), 63 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 3729dd66c51..3bbe18bf35d 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -8,6 +8,8 @@ https://github.com/apache/solr/blob/main/solr/solr-ref-guide/modules/upgrade-not
 
 New Features
 ---------------------
+* SOLR-16246: Introduced pre-filtering in KnnQParser (Elia Porciani via Alessandro Benedetti).
+
 * SOLR-16111: Add hl.queryFieldPattern support, an advanced alternative to the hl.requireFieldMatch boolean flag.
   (Christine Poerschke, David Smiley)
 
diff --git a/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java b/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java
index 9a9e4fa7e05..51271492afe 100644
--- a/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java
+++ b/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java
@@ -59,6 +59,7 @@ import org.apache.solr.search.DocListAndSet;
 import org.apache.solr.search.QParser;
 import org.apache.solr.search.QParserPlugin;
 import org.apache.solr.search.QueryParsing;
+import org.apache.solr.search.QueryUtils;
 import org.apache.solr.search.ReturnFields;
 import org.apache.solr.search.SolrIndexSearcher;
 import org.apache.solr.search.SolrQueryTimeoutImpl;
@@ -117,16 +118,7 @@ public class MoreLikeThisHandler extends RequestHandlerBase {
           sortSpec = parser.getSortSpec(true);
         }
 
-        String[] fqs = req.getParams().getParams(CommonParams.FQ);
-        if (fqs != null && fqs.length != 0) {
-          filters = new ArrayList<>();
-          for (String fq : fqs) {
-            if (fq != null && fq.trim().length() != 0) {
-              QParser fqp = QParser.getParser(fq, req);
-              filters.add(fqp.getQuery());
-            }
-          }
-        }
+        filters = QueryUtils.parseFilterQueries(req, false);
       } catch (SyntaxError e) {
         throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
       }
diff --git a/solr/core/src/java/org/apache/solr/handler/component/QueryComponent.java b/solr/core/src/java/org/apache/solr/handler/component/QueryComponent.java
index 7e210c6e611..3624423c003 100644
--- a/solr/core/src/java/org/apache/solr/handler/component/QueryComponent.java
+++ b/solr/core/src/java/org/apache/solr/handler/component/QueryComponent.java
@@ -85,6 +85,7 @@ import org.apache.solr.search.QParserPlugin;
 import org.apache.solr.search.QueryCommand;
 import org.apache.solr.search.QueryParsing;
 import org.apache.solr.search.QueryResult;
+import org.apache.solr.search.QueryUtils;
 import org.apache.solr.search.RankQuery;
 import org.apache.solr.search.ReturnFields;
 import org.apache.solr.search.SolrIndexSearcher;
@@ -201,13 +202,8 @@ public class QueryComponent extends SearchComponent {
         List<Query> filters = rb.getFilters();
         // if filters already exists, make a copy instead of modifying the original
         filters = filters == null ? new ArrayList<>(fqs.length) : new ArrayList<>(filters);
-        for (String fq : fqs) {
-          if (fq != null && fq.trim().length() != 0) {
-            QParser fqp = QParser.getParser(fq, req);
-            fqp.setIsFilter(true);
-            filters.add(fqp.getQuery());
-          }
-        }
+        filters.addAll(QueryUtils.parseFilterQueries(req, false));
+
         // only set the filters if they are not empty otherwise
         // fq=&someotherParam= will trigger all docs filter for every request
         // if filter cache is disabled
diff --git a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java
index c913a35b340..e53b1b06985 100644
--- a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java
+++ b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java
@@ -81,7 +81,6 @@ import org.apache.solr.schema.FieldType;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.schema.SchemaField;
 import org.apache.solr.search.DocList;
-import org.apache.solr.search.QParser;
 import org.apache.solr.search.QueryUtils;
 import org.apache.solr.search.ReturnFields;
 import org.apache.solr.search.SolrDocumentFetcher;
@@ -205,13 +204,8 @@ public class RealTimeGetComponent extends SearchComponent {
       if (fqs != null && fqs.length != 0) {
         List<Query> filters = rb.getFilters();
         // if filters already exists, make a copy instead of modifying the original
-        filters = filters == null ? new ArrayList<Query>(fqs.length) : new ArrayList<>(filters);
-        for (String fq : fqs) {
-          if (fq != null && fq.trim().length() != 0) {
-            QParser fqp = QParser.getParser(fq, req);
-            filters.add(QueryUtils.makeQueryable(fqp.getQuery()));
-          }
-        }
+        filters = filters == null ? new ArrayList<>(fqs.length) : new ArrayList<>(filters);
+        filters.addAll(QueryUtils.parseFilterQueries(req, true));
         if (!filters.isEmpty()) {
           rb.setFilters(filters);
         }
diff --git a/solr/core/src/java/org/apache/solr/schema/DenseVectorField.java b/solr/core/src/java/org/apache/solr/schema/DenseVectorField.java
index 8b44f006f33..87aa70e7e4b 100644
--- a/solr/core/src/java/org/apache/solr/schema/DenseVectorField.java
+++ b/solr/core/src/java/org/apache/solr/schema/DenseVectorField.java
@@ -264,8 +264,9 @@ public class DenseVectorField extends FloatPointField {
         "Function queries are not supported for Dense Vector fields.");
   }
 
-  public Query getKnnVectorQuery(String fieldName, float[] vectorToSearch, int topK) {
-    return new KnnVectorQuery(fieldName, vectorToSearch, topK);
+  public Query getKnnVectorQuery(
+      String fieldName, float[] vectorToSearch, int topK, Query filterQuery) {
+    return new KnnVectorQuery(fieldName, vectorToSearch, topK, filterQuery);
   }
 
   /**
diff --git a/solr/core/src/java/org/apache/solr/search/QueryUtils.java b/solr/core/src/java/org/apache/solr/search/QueryUtils.java
index ff03b64fcce..2709d9d6b7a 100644
--- a/solr/core/src/java/org/apache/solr/search/QueryUtils.java
+++ b/solr/core/src/java/org/apache/solr/search/QueryUtils.java
@@ -16,7 +16,10 @@
  */
 package org.apache.solr.search;
 
+import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
@@ -27,6 +30,8 @@ import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.Query;
 import org.apache.solr.common.SolrException;
+import org.apache.solr.common.params.CommonParams;
+import org.apache.solr.request.SolrQueryRequest;
 
 /** */
 public class QueryUtils {
@@ -166,6 +171,7 @@ public class QueryUtils {
    * @lucene.experimental throw exception if max boolean clauses are exceeded
    */
   public static BooleanQuery build(BooleanQuery.Builder builder, QParser parser) {
+
     int configuredMax =
         parser != null
             ? parser.getReq().getCore().getSolrConfig().booleanQueryMaxClauseCount
@@ -215,4 +221,38 @@ public class QueryUtils {
       }
     }
   }
+
+  /**
+   * Parse the filter queries in Solr request
+   *
+   * @param req Solr request
+   * @param fixNegativeQueries if true, negative queries are rewritten by adding a MatchAllDocs
+   *     query clause
+   * @return and array of Query. If the request does not contain filter queries, returns an empty
+   *     list.
+   * @throws SyntaxError if an error occurs during parsing
+   */
+  public static List<Query> parseFilterQueries(SolrQueryRequest req, boolean fixNegativeQueries)
+      throws SyntaxError {
+
+    String[] filterQueriesStr = req.getParams().getParams(CommonParams.FQ);
+
+    if (filterQueriesStr != null) {
+      List<Query> filters = new ArrayList<>(filterQueriesStr.length);
+      for (String fq : filterQueriesStr) {
+        if (fq != null && fq.trim().length() != 0) {
+          QParser fqp = QParser.getParser(fq, req);
+          fqp.setIsFilter(true);
+          Query query = fqp.getQuery();
+          if (fixNegativeQueries) {
+            query = makeQueryable(query);
+          }
+          filters.add(query);
+        }
+      }
+      return filters;
+    }
+
+    return Collections.emptyList();
+  }
 }
diff --git a/solr/core/src/java/org/apache/solr/search/neural/KnnQParser.java b/solr/core/src/java/org/apache/solr/search/neural/KnnQParser.java
index 76b15a0bfb4..7255562d8fe 100644
--- a/solr/core/src/java/org/apache/solr/search/neural/KnnQParser.java
+++ b/solr/core/src/java/org/apache/solr/search/neural/KnnQParser.java
@@ -16,9 +16,13 @@
  */
 package org.apache.solr.search.neural;
 
+import java.util.List;
 import org.apache.commons.lang3.StringUtils;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.Query;
 import org.apache.solr.common.SolrException;
+import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.schema.DenseVectorField;
@@ -26,6 +30,8 @@ import org.apache.solr.schema.FieldType;
 import org.apache.solr.schema.SchemaField;
 import org.apache.solr.search.QParser;
 import org.apache.solr.search.QueryParsing;
+import org.apache.solr.search.QueryUtils;
+import org.apache.solr.search.SyntaxError;
 
 public class KnnQParser extends QParser {
 
@@ -72,7 +78,37 @@ public class KnnQParser extends QParser {
 
     DenseVectorField denseVectorType = (DenseVectorField) fieldType;
     float[] parsedVectorToSearch = parseVector(vectorToSearch, denseVectorType.getDimension());
-    return denseVectorType.getKnnVectorQuery(schemaField.getName(), parsedVectorToSearch, topK);
+
+    return denseVectorType.getKnnVectorQuery(
+        schemaField.getName(), parsedVectorToSearch, topK, getFilterQuery());
+  }
+
+  private Query getFilterQuery() throws SolrException {
+    if (!isFilter()) {
+      String[] filterQueries = req.getParams().getParams(CommonParams.FQ);
+      if (filterQueries != null && filterQueries.length != 0) {
+        List<Query> filters;
+
+        try {
+          filters = QueryUtils.parseFilterQueries(req, true);
+        } catch (SyntaxError e) {
+          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+        }
+
+        if (filters.size() == 1) {
+          return filters.get(0);
+        }
+
+        BooleanQuery.Builder builder = new BooleanQuery.Builder();
+        for (Query query : filters) {
+          builder.add(query, BooleanClause.Occur.FILTER);
+        }
+
+        return builder.build();
+      }
+    }
+
+    return null;
   }
 
   /**
diff --git a/solr/core/src/test/org/apache/solr/search/neural/KnnQParserTest.java b/solr/core/src/test/org/apache/solr/search/neural/KnnQParserTest.java
index e4f31903064..ef27333832d 100644
--- a/solr/core/src/test/org/apache/solr/search/neural/KnnQParserTest.java
+++ b/solr/core/src/test/org/apache/solr/search/neural/KnnQParserTest.java
@@ -288,8 +288,23 @@ public class KnnQParserTest extends SolrTestCaseJ4 {
         "//result/doc[10]/str[@name='id'][.='8']");
   }
 
+  public void knnQueryUsedInFilter_shouldFilterResultsBeforeTheQueryExecution() {
+    String vectorToSearch = "[1.0, 2.0, 3.0, 4.0]";
+    assertQ(
+        req(
+            CommonParams.Q,
+            "id:(3 4 9 2)",
+            "fq",
+            "{!knn f=vector topK=4}" + vectorToSearch,
+            "fl",
+            "id"),
+        "//result[@numFound='2']",
+        "//result/doc[1]/str[@name='id'][.='2']",
+        "//result/doc[2]/str[@name='id'][.='4']");
+  }
+
   @Test
-  public void knnQueryWithFilterQuery_shouldIntersectResults() {
+  public void knnQueryWithFilterQuery_shouldPerformKnnSearchInPreFilteredResults() {
     String vectorToSearch = "[1.0, 2.0, 3.0, 4.0]";
 
     assertQ(
@@ -304,14 +319,7 @@ public class KnnQParserTest extends SolrTestCaseJ4 {
         "//result/doc[1]/str[@name='id'][.='1']",
         "//result/doc[2]/str[@name='id'][.='2']",
         "//result/doc[3]/str[@name='id'][.='7']");
-    /*
-     * This behavior is counter-intuitive.
-     * You would expect the query to apply to only the results filtered by the filter query.
-     * So you ideally would like to see the following ranked list as a result: [4,2,3,9].
-     * To get this please use the knn query parser as a reranker, example in a following test.
-     * This is how filter queries work(it's just intersection of the bitsets coming from each query and filter query):
-     * Ranked List from q=[1,4,2,10] <intersects> Set from fq={3,4,9,2} = [4,2]
-     * */
+
     assertQ(
         req(
             CommonParams.Q,
@@ -320,21 +328,23 @@ public class KnnQParserTest extends SolrTestCaseJ4 {
             "id:(3 4 9 2)",
             "fl",
             "id"),
-        "//result[@numFound='2']",
+        "//result[@numFound='4']",
         "//result/doc[1]/str[@name='id'][.='4']",
-        "//result/doc[2]/str[@name='id'][.='2']");
-    /* The ranking is now different as default solr score is used for the main query */
+        "//result/doc[2]/str[@name='id'][.='2']",
+        "//result/doc[3]/str[@name='id'][.='3']",
+        "//result/doc[4]/str[@name='id'][.='9']");
+  }
+
+  @Test
+  public void knnQueryWithNegativeFilterQuery_shouldPerformKnnSearchInPreFilteredResults() {
+    String vectorToSearch = "[1.0, 2.0, 3.0, 4.0]";
     assertQ(
-        req(
-            CommonParams.Q,
-            "id:(3 4 9 2)",
-            "fq",
-            "{!knn f=vector topK=4}" + vectorToSearch,
-            "fl",
-            "id"),
-        "//result[@numFound='2']",
-        "//result/doc[1]/str[@name='id'][.='2']",
-        "//result/doc[2]/str[@name='id'][.='4']");
+        req(CommonParams.Q, "{!knn f=vector topK=4}" + vectorToSearch, "fq", "-id:4", "fl", "id"),
+        "//result[@numFound='4']",
+        "//result/doc[1]/str[@name='id'][.='1']",
+        "//result/doc[2]/str[@name='id'][.='2']",
+        "//result/doc[3]/str[@name='id'][.='10']",
+        "//result/doc[4]/str[@name='id'][.='3']");
   }
 
   /*
diff --git a/solr/modules/analytics/src/java/org/apache/solr/handler/AnalyticsHandler.java b/solr/modules/analytics/src/java/org/apache/solr/handler/AnalyticsHandler.java
index 3c6a5434a18..6da7ca79ddf 100644
--- a/solr/modules/analytics/src/java/org/apache/solr/handler/AnalyticsHandler.java
+++ b/solr/modules/analytics/src/java/org/apache/solr/handler/AnalyticsHandler.java
@@ -41,6 +41,7 @@ import org.apache.solr.search.DocSet;
 import org.apache.solr.search.QParser;
 import org.apache.solr.search.QParserPlugin;
 import org.apache.solr.search.QueryParsing;
+import org.apache.solr.search.QueryUtils;
 import org.apache.solr.search.SolrIndexSearcher;
 import org.apache.solr.search.SolrQueryTimeoutImpl;
 import org.apache.solr.search.SyntaxError;
@@ -129,15 +130,7 @@ public class AnalyticsHandler extends RequestHandlerBase
     queries.add(query);
 
     // Filter Params
-    String[] fqs = req.getParams().getParams(CommonParams.FQ);
-    if (fqs != null) {
-      for (String fq : fqs) {
-        if (fq != null && fq.trim().length() != 0) {
-          QParser fqp = QParser.getParser(fq, req);
-          queries.add(fqp.getQuery());
-        }
-      }
-    }
+    queries.addAll(QueryUtils.parseFilterQueries(req, false));
     return req.getSearcher().getDocSet(queries);
   }
 
diff --git a/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc b/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc
index d868ea21381..1484ab84ed7 100644
--- a/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc
+++ b/solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc
@@ -274,13 +274,19 @@ The `knn` query parser can be used with filter queries:
 
 [IMPORTANT]
 ====
-When using `knn` in these scenarios make sure you have a clear understanding of how filter queries work in Apache Solr:
+Filter queries are executed as pre-filters: the main query refines the sub-set of search results derived from the application of all the filter queries combined as 'MUST' clauses(boolean AND).
 
-The Ranked List of document IDs resulting from the main query `q` is intersected with the set of document IDs deriving from each filter query `fq`.
+This means that in
+[source,text]
+&q=id:(1 2 3)&fq={!knn f=vector topK=10}[1.0, 2.0, 3.0, 4.0]
+
+The results are prefiltered by the topK knn retrieval and then only the documents from this subset, matching the query 'q=id:(1 2 3)' are returned.
 
-e.g.
+In
+[source,text]
+&q={!knn f=vector topK=10}[1.0, 2.0, 3.0, 4.0]&fq=id:(1 2 3)
 
-Ranked List from `q`=`[ID1, ID4, ID2, ID10]` <intersects> Set from `fq`=`{ID3, ID2, ID9, ID4}` = `[ID4,ID2]`
+The results are prefiltered by the fq=id:(1 2 3) and then only the documents from this subset are considered as candidates for the topK knn retrieval.
 ====