You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by ho...@apache.org on 2022/10/11 00:35:48 UTC

[solr] branch main updated: SOLR-16436: Fix "false positive" suggestions from DirectSolrSpellChecker when using maxQueryFrequency > 1 in multi-shard collections

This is an automated email from the ASF dual-hosted git repository.

hossman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/main by this push:
     new a41c4cc2662 SOLR-16436: Fix "false positive" suggestions from DirectSolrSpellChecker when using maxQueryFrequency > 1 in multi-shard collections
a41c4cc2662 is described below

commit a41c4cc2662ed07b5cfe454140c5616ac36647ec
Author: Chris Hostetter <ho...@apache.org>
AuthorDate: Mon Oct 10 17:35:27 2022 -0700

    SOLR-16436: Fix "false positive" suggestions from DirectSolrSpellChecker when using maxQueryFrequency > 1 in multi-shard collections
---
 solr/CHANGES.txt                                   |  4 +-
 .../handler/component/SpellCheckComponent.java     |  2 +
 .../handler/component/SpellCheckMergeData.java     | 21 +++++++++
 .../solr/spelling/ConjunctionSolrSpellChecker.java |  7 +++
 .../solr/spelling/DirectSolrSpellChecker.java      | 50 +++++++++++++++++++++
 .../org/apache/solr/spelling/SolrSpellChecker.java |  8 ++++
 .../solr/collection1/conf/solrconfig.xml           |  7 +++
 .../DistributedSpellCheckComponentTest.java        | 52 +++++++++++++++++++++-
 .../modules/query-guide/pages/spell-checking.adoc  | 11 +++--
 9 files changed, 157 insertions(+), 5 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index bc74d294ea9..ad2cb3e7976 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -52,7 +52,9 @@ Optimizations
 
 Bug Fixes
 ---------------------
-(No changes)
+
+* SOLR-16436: Fix "false positive" suggestions from DirectSolrSpellChecker when using maxQueryFrequency > 1 in
+  multi-shard collections (hossman)
 
 Build
 ---------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/component/SpellCheckComponent.java b/solr/core/src/java/org/apache/solr/handler/component/SpellCheckComponent.java
index 03311aebaa2..ce841535dd5 100644
--- a/solr/core/src/java/org/apache/solr/handler/component/SpellCheckComponent.java
+++ b/solr/core/src/java/org/apache/solr/handler/component/SpellCheckComponent.java
@@ -365,6 +365,8 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
     int purpose =
         rb.grouping() ? ShardRequest.PURPOSE_GET_TOP_GROUPS : ShardRequest.PURPOSE_GET_TOP_IDS;
     if ((sreq.purpose & purpose) != 0) {
+      getSpellChecker(params).modifyRequest(rb, sreq);
+
       // fetch at least 5 suggestions from each shard
       int count = sreq.params.getInt(SPELLCHECK_COUNT, 1);
       if (count < 5) count = 5;
diff --git a/solr/core/src/java/org/apache/solr/handler/component/SpellCheckMergeData.java b/solr/core/src/java/org/apache/solr/handler/component/SpellCheckMergeData.java
index 8b30bde5046..d7f550609b9 100644
--- a/solr/core/src/java/org/apache/solr/handler/component/SpellCheckMergeData.java
+++ b/solr/core/src/java/org/apache/solr/handler/component/SpellCheckMergeData.java
@@ -18,11 +18,13 @@ package org.apache.solr.handler.component;
 
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.Map;
 import java.util.Set;
 import org.apache.lucene.search.spell.SuggestWord;
 import org.apache.solr.client.solrj.response.SpellCheckResponse;
+import org.apache.solr.common.util.NamedList;
 import org.apache.solr.spelling.SpellCheckCollation;
 
 public class SpellCheckMergeData {
@@ -42,6 +44,25 @@ public class SpellCheckMergeData {
   public Set<String> originalTerms = null;
   public int totalNumberShardResponses = 0;
 
+  /**
+   * Removes the original word from all maps where original words are used as keys, as well as any
+   * collations that contain this original word as a misspelling
+   */
+  public void removeOriginal(final String original) {
+    origVsFreq.remove(original);
+    origVsShards.remove(original);
+    origVsSuggested.remove(original);
+
+    final Iterator<Map.Entry<String, SpellCheckCollation>> iter = collations.entrySet().iterator();
+    iter.forEachRemaining(
+        e -> {
+          final NamedList<String> misspellings = e.getValue().getMisspellingsAndCorrections();
+          if (null != misspellings && null != misspellings.get(original)) {
+            iter.remove();
+          }
+        });
+  }
+
   public boolean isOriginalToQuery(String term) {
     if (originalTerms == null) {
       return true;
diff --git a/solr/core/src/java/org/apache/solr/spelling/ConjunctionSolrSpellChecker.java b/solr/core/src/java/org/apache/solr/spelling/ConjunctionSolrSpellChecker.java
index e004626c823..8e72558bfa0 100644
--- a/solr/core/src/java/org/apache/solr/spelling/ConjunctionSolrSpellChecker.java
+++ b/solr/core/src/java/org/apache/solr/spelling/ConjunctionSolrSpellChecker.java
@@ -28,6 +28,8 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.search.spell.StringDistance;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.core.SolrCore;
+import org.apache.solr.handler.component.ResponseBuilder;
+import org.apache.solr.handler.component.ShardRequest;
 import org.apache.solr.handler.component.SpellCheckMergeData;
 import org.apache.solr.search.SolrIndexSearcher;
 
@@ -111,6 +113,11 @@ public class ConjunctionSolrSpellChecker extends SolrSpellChecker {
     return mergeCheckers(results, options.count);
   }
 
+  @Override
+  public void modifyRequest(ResponseBuilder rb, ShardRequest sreq) {
+    checkers.forEach(c -> c.modifyRequest(rb, sreq));
+  }
+
   @Override
   public SpellingResult mergeSuggestions(
       SpellCheckMergeData mergeData, int numSug, int count, boolean extendedResults) {
diff --git a/solr/core/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java b/solr/core/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java
index 04b8bec49e4..9c157729ad1 100644
--- a/solr/core/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java
+++ b/solr/core/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java
@@ -18,6 +18,7 @@ package org.apache.solr.spelling;
 
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
@@ -28,8 +29,12 @@ import org.apache.lucene.search.spell.SuggestWord;
 import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
 import org.apache.lucene.search.spell.SuggestWordQueue;
 import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.params.SpellingParams;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.core.SolrCore;
+import org.apache.solr.handler.component.ResponseBuilder;
+import org.apache.solr.handler.component.ShardRequest;
+import org.apache.solr.handler.component.SpellCheckMergeData;
 import org.apache.solr.search.SolrIndexSearcher;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -233,6 +238,51 @@ public class DirectSolrSpellChecker extends SolrSpellChecker {
     return result;
   }
 
+  @Override
+  public void modifyRequest(ResponseBuilder rb, ShardRequest sreq) {
+    if (1.0F <= checker.getMaxQueryFrequency()) {
+      // we need extended results in order to prune things individual shards might think are below
+      // max freq
+      sreq.params.set(SpellingParams.SPELLCHECK_EXTENDED_RESULTS, true);
+      sreq.params.set(SpellingParams.SPELLCHECK_COLLATE_EXTENDED_RESULTS, true);
+    }
+  }
+
+  @Override
+  public SpellingResult mergeSuggestions(
+      SpellCheckMergeData mergeData, int numSug, int count, boolean extendedResults) {
+
+    for (String original : new ArrayList<>(mergeData.origVsSuggested.keySet())) {
+      if (mergeData.origVsFreq.containsKey(original)) {
+        if (1.0F <= checker.getMaxQueryFrequency()) {
+          // absolute maxQueryFreq threshold
+          if (checker.getMaxQueryFrequency() < mergeData.origVsFreq.get(original)) {
+            // one or more shards thought the word needed suggestions because it's
+            // (per-shard) origFreq was too low, but the aggregate sum of origFreq
+            // is above our threshold, so ignore those suggestions.
+            mergeData.removeOriginal(original);
+          }
+        } else {
+          // percentage maxQueryFreq threshold
+
+          // This situation is also problematic, but in the reverse situation of
+          // the absolute maxQueryFreq threshold.
+          //
+          // An individual shard may have found that it's (per-shard) origFreq
+          // was higher then the computed max for that shard (relative to the
+          // per-shard maxDoc) and said it's "correctlySpelled" even though
+          // it's cumulative origFreq may not meet the computed max across the
+          // entire collection.
+          //
+          // But we don't have a straightforward way to determine that, so for
+          // now it's just a documented deficiency in the ref-guide.
+
+        }
+      }
+    }
+    return super.mergeSuggestions(mergeData, numSug, count, extendedResults);
+  }
+
   @Override
   public float getAccuracy() {
     return checker.getAccuracy();
diff --git a/solr/core/src/java/org/apache/solr/spelling/SolrSpellChecker.java b/solr/core/src/java/org/apache/solr/spelling/SolrSpellChecker.java
index 6dbd87bdab4..f326a6f21c7 100644
--- a/solr/core/src/java/org/apache/solr/spelling/SolrSpellChecker.java
+++ b/solr/core/src/java/org/apache/solr/spelling/SolrSpellChecker.java
@@ -31,6 +31,8 @@ import org.apache.solr.client.solrj.response.SpellCheckResponse;
 import org.apache.solr.common.params.SpellingParams;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.core.SolrCore;
+import org.apache.solr.handler.component.ResponseBuilder;
+import org.apache.solr.handler.component.ShardRequest;
 import org.apache.solr.handler.component.SpellCheckMergeData;
 import org.apache.solr.schema.FieldType;
 import org.apache.solr.schema.IndexSchema;
@@ -75,6 +77,12 @@ public abstract class SolrSpellChecker {
     }
     return name;
   }
+
+  /** modify the shard request to be used in a distributed environment. */
+  public void modifyRequest(ResponseBuilder rb, ShardRequest sreq) {
+    /* No-Op */
+  }
+
   /** Integrate spelling suggestions from the various shards in a distributed environment. */
   public SpellingResult mergeSuggestions(
       SpellCheckMergeData mergeData, int numSug, int count, boolean extendedResults) {
diff --git a/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml b/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml
index 02d79645eba..44d64e1e119 100644
--- a/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml
@@ -236,6 +236,13 @@
       <str name="field">lowerfilt</str>
       <int name="minQueryLength">3</int>
     </lst>
+    <lst name="spellchecker">
+      <str name="name">directMQF2</str>
+      <str name="classname">DirectSolrSpellChecker</str>
+      <str name="field">lowerfilt</str>
+      <int name="minQueryLength">3</int>
+      <int name="maxQueryFrequency">2</int>
+    </lst>
     <lst name="spellchecker">
       <str name="name">wordbreak</str>
       <str name="classname">solr.WordBreakSolrSpellChecker</str>
diff --git a/solr/core/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java b/solr/core/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java
index 72dc30a4c79..07114e15627 100644
--- a/solr/core/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java
+++ b/solr/core/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java
@@ -64,11 +64,21 @@ public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTes
   @Override
   public void validateControlData(QueryResponse control) {
     NamedList<Object> nl = control.getResponse();
+
+    Object explicitNumSuggestExpected =
+        nl.findRecursive("responseHeader", "params", "test.expected.suggestions");
+
     @SuppressWarnings("unchecked")
     NamedList<Object> sc = (NamedList<Object>) nl.get("spellcheck");
     @SuppressWarnings("unchecked")
     NamedList<Object> sug = (NamedList<Object>) sc.get("suggestions");
-    if (sug.size() == 0) {
+
+    if (null != explicitNumSuggestExpected) {
+      assertEquals(
+          "Control data did not return test specified num suggestions",
+          explicitNumSuggestExpected,
+          Integer.toString(sug.size()));
+    } else if (sug.size() == 0) {
       Assert.fail("Control data did not return any suggestions.");
     }
   }
@@ -107,6 +117,7 @@ public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTes
     handle.put("timestamp", SKIPVAL);
     handle.put("maxScore", SKIPVAL);
     // we care only about the spellcheck results
+    handle.put("responseHeader", SKIP);
     handle.put("response", SKIP);
     handle.put("grouped", SKIP);
 
@@ -341,6 +352,45 @@ public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTes
             "1",
             collateExtended,
             "true"));
+
+    // term will exist in a total of 4 docs, but only 2 per shard (which is less them directMQF2
+    // requires)
+    // (This of course assumes we have more then 1 shard)
+    index_specific(0, id, "30", "lowerfilt", "fax");
+    index_specific(0, id, "31", "lowerfilt", "fax");
+    index_specific(getShardCount() - 1, id, "40", "lowerfilt", "fax");
+    index_specific(getShardCount() - 1, id, "41", "lowerfilt", "fax");
+    commit();
+
+    query(
+        true,
+        params(
+            "qt",
+            "/spellCheckCompRH_Direct",
+            "shards.qt",
+            "/spellCheckCompRH_Direct",
+            "rows",
+            "0",
+            "q",
+            "lowerfilt:fax",
+            "spellcheck",
+            "true",
+            "spellcheck.dictionary",
+            "directMQF2",
+            "spellcheck.maxResultsForSuggest",
+            "9999",
+            "spellcheck.onlyMorePopular",
+            "true",
+            extended,
+            Boolean.toString(random().nextBoolean()),
+            collate,
+            Boolean.toString(random().nextBoolean()),
+            collateExtended,
+            Boolean.toString(random().nextBoolean()),
+            "test.expected.suggestions",
+            "0", // this word is correctly spelled, in more docs then configured maxQueryFrequency
+            "echoParams",
+            "all")); // needed so validateControlData can see our test.expected.suggestions
   }
 
   private Object[] buildRequest(
diff --git a/solr/solr-ref-guide/modules/query-guide/pages/spell-checking.adoc b/solr/solr-ref-guide/modules/query-guide/pages/spell-checking.adoc
index 84b0194b5e2..0adfd4c10fc 100644
--- a/solr/solr-ref-guide/modules/query-guide/pages/spell-checking.adoc
+++ b/solr/solr-ref-guide/modules/query-guide/pages/spell-checking.adoc
@@ -105,10 +105,15 @@ The `maxInspections` parameter defines the maximum number of possible matches to
 There is no limit to term length by default.
 
 At first, spellchecker analyses incoming query words by looking up them in the index.
-Only query words, which are absent in index or too rare ones (below `maxQueryFrequency`) are considered as misspelled and used for finding suggestions.
-Words which are frequent than `maxQueryFrequency` bypass spellchecker unchanged.
+Only query words which are absent from the index, or too rare (equal to or below `maxQueryFrequency`) are considered as misspelled and used for finding suggestions.
+Words which are more frequent than `maxQueryFrequency` bypass spellchecker unchanged.
 After suggestions for every misspelled word are found they are filtered for enough frequency with `thresholdTokenFrequency` as boundary value.
-These parameters (`maxQueryFrequency` and `thresholdTokenFrequency`) can be a percentage (such as .01, or 1%) or an absolute value (such as 4).
+These parameters (`maxQueryFrequency` and `thresholdTokenFrequency`) can be a percentage represented as a decimal value below 1 (such as `0.01` for or `1%`) or an absolute value (such as `4`).
+
+[NOTE]
+====
+When `maxQueryFrequency` is specified as a percentage, it is evaluated independently on each shard (relative to that shard's `maxDoc`) to determine if it should be considered misspelled.  Some infrequent terms may be considered correctly spelled, and not generate suggestions when expected, if the term distributions are uneven between shards.
+====
 
 ==== FileBasedSpellChecker