You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ho...@apache.org on 2023/01/22 23:00:26 UTC

[lucene] branch branch_9x updated: WordBreakSpellChecker now correctly respects maxEvaluations (#12077)

This is an automated email from the ASF dual-hosted git repository.

hossman pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/branch_9x by this push:
     new ef8c18b43c7 WordBreakSpellChecker now correctly respects maxEvaluations (#12077)
ef8c18b43c7 is described below

commit ef8c18b43c726ea770ef9353ecebe1e49db0b24a
Author: Chris Hostetter <ho...@apache.org>
AuthorDate: Sun Jan 22 15:44:29 2023 -0700

    WordBreakSpellChecker now correctly respects maxEvaluations (#12077)
    
    (cherry picked from commit 9007f746a395aebeb0f563d260469ae61ca64a71)
---
 lucene/CHANGES.txt                                 |  2 ++
 .../lucene/search/spell/WordBreakSpellChecker.java | 22 +++++++--------
 .../search/spell/TestWordBreakSpellChecker.java    | 32 ++++++++++++++++++++++
 3 files changed, 44 insertions(+), 12 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 3b68c77f702..bc9c3e94bf8 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -161,6 +161,8 @@ Bug Fixes
 
 * GITHUB#12084: Same bound with fallbackQuery. (Lu Xugang)
 
+* GITHUB#12077: WordBreakSpellChecker now correctly respects maxEvaluations (hossman)
+
 Optimizations
 ---------------------
 * GITHUB#11738: Optimize MultiTermQueryConstantScoreWrapper when a term is present that matches all
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java
index 56bc2a9388d..b5572f717d6 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java
@@ -252,17 +252,21 @@ public class WordBreakSpellChecker {
     if (useMinBreakWordLength < 1) {
       useMinBreakWordLength = 1;
     }
+
     if (termLength < (useMinBreakWordLength * 2)) {
-      return 0;
+      return totalEvaluations;
     }
 
-    int thisTimeEvaluations = 0;
     for (int i = useMinBreakWordLength; i <= (termLength - useMinBreakWordLength); i++) {
+      if (totalEvaluations >= maxEvaluations) {
+        break;
+      }
+      totalEvaluations++;
+
       int end = termText.offsetByCodePoints(0, i);
       String leftText = termText.substring(0, end);
       String rightText = termText.substring(end);
       SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText);
-
       if (leftWord.freq >= useMinSuggestionFrequency) {
         SuggestWord rightWord = generateSuggestWord(ir, term.field(), rightText);
         if (rightWord.freq >= useMinSuggestionFrequency) {
@@ -275,7 +279,7 @@ public class WordBreakSpellChecker {
         }
         int newNumberBreaks = numberBreaks + 1;
         if (newNumberBreaks <= maxChanges) {
-          int evaluations =
+          totalEvaluations =
               generateBreakUpSuggestions(
                   new Term(term.field(), rightWord.string),
                   ir,
@@ -286,17 +290,11 @@ public class WordBreakSpellChecker {
                   suggestions,
                   totalEvaluations,
                   sortMethod);
-          totalEvaluations += evaluations;
         }
       }
-
-      thisTimeEvaluations++;
-      totalEvaluations++;
-      if (totalEvaluations >= maxEvaluations) {
-        break;
-      }
     }
-    return thisTimeEvaluations;
+
+    return totalEvaluations;
   }
 
   private SuggestWord[] newPrefix(SuggestWord[] oldPrefix, SuggestWord append) {
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java
index 513f2bb5e28..afb24b3c99a 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java
@@ -54,6 +54,11 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
       writer.addDocument(doc);
     }
 
+    {
+      Document doc = new Document();
+      doc.add(newTextField("abba", "A B AB ABA BAB", Field.Store.NO));
+      writer.addDocument(doc);
+    }
     {
       Document doc = new Document();
       doc.add(newTextField("numbers", "thou hast sand betwixt thy toes", Field.Store.NO));
@@ -80,6 +85,33 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
     super.tearDown();
   }
 
+  public void testMaxEvaluations() throws Exception {
+    final int maxEvals = 100;
+
+    try (IndexReader ir = DirectoryReader.open(dir)) {
+      WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
+      wbsp.setMaxChanges(10);
+      wbsp.setMinBreakWordLength(1);
+      wbsp.setMinSuggestionFrequency(1);
+      wbsp.setMaxEvaluations(100);
+
+      Term term = new Term("abba", "ab".repeat(5));
+      SuggestWord[][] sw =
+          wbsp.suggestWordBreaks(
+              term,
+              500,
+              ir,
+              SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX,
+              BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
+
+      // sanity check that our suggester isn't completely broken
+      assertThat(sw.length, org.hamcrest.Matchers.greaterThan(0));
+
+      // if maxEvaluations is respected, we can't possibly have more suggestions then that.
+      assertThat(sw.length, org.hamcrest.Matchers.lessThan(maxEvals));
+    }
+  }
+
   public void testCombiningWords() throws Exception {
     IndexReader ir = DirectoryReader.open(dir);
     WordBreakSpellChecker wbsp = new WordBreakSpellChecker();