You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ho...@apache.org on 2023/01/22 22:44:42 UTC
[lucene] branch main updated: WordBreakSpellChecker now correctly respects maxEvaluations (#12077)
This is an automated email from the ASF dual-hosted git repository.
hossman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new 9007f746a39 WordBreakSpellChecker now correctly respects maxEvaluations (#12077)
9007f746a39 is described below
commit 9007f746a395aebeb0f563d260469ae61ca64a71
Author: Chris Hostetter <ho...@apache.org>
AuthorDate: Sun Jan 22 15:44:29 2023 -0700
WordBreakSpellChecker now correctly respects maxEvaluations (#12077)
---
lucene/CHANGES.txt | 2 ++
.../lucene/search/spell/WordBreakSpellChecker.java | 22 +++++++--------
.../search/spell/TestWordBreakSpellChecker.java | 32 ++++++++++++++++++++++
3 files changed, 44 insertions(+), 12 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 7a1211bd61f..d9e7340f55a 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -252,6 +252,8 @@ Bug Fixes
* GITHUB#12084: Same bound with fallbackQuery. (Lu Xugang)
+* GITHUB#12077: WordBreakSpellChecker now correctly respects maxEvaluations (hossman)
+
Optimizations
---------------------
* GITHUB#11738: Optimize MultiTermQueryConstantScoreWrapper when a term is present that matches all
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java
index 56bc2a9388d..b5572f717d6 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java
@@ -252,17 +252,21 @@ public class WordBreakSpellChecker {
if (useMinBreakWordLength < 1) {
useMinBreakWordLength = 1;
}
+
if (termLength < (useMinBreakWordLength * 2)) {
- return 0;
+ return totalEvaluations;
}
- int thisTimeEvaluations = 0;
for (int i = useMinBreakWordLength; i <= (termLength - useMinBreakWordLength); i++) {
+ if (totalEvaluations >= maxEvaluations) {
+ break;
+ }
+ totalEvaluations++;
+
int end = termText.offsetByCodePoints(0, i);
String leftText = termText.substring(0, end);
String rightText = termText.substring(end);
SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText);
-
if (leftWord.freq >= useMinSuggestionFrequency) {
SuggestWord rightWord = generateSuggestWord(ir, term.field(), rightText);
if (rightWord.freq >= useMinSuggestionFrequency) {
@@ -275,7 +279,7 @@ public class WordBreakSpellChecker {
}
int newNumberBreaks = numberBreaks + 1;
if (newNumberBreaks <= maxChanges) {
- int evaluations =
+ totalEvaluations =
generateBreakUpSuggestions(
new Term(term.field(), rightWord.string),
ir,
@@ -286,17 +290,11 @@ public class WordBreakSpellChecker {
suggestions,
totalEvaluations,
sortMethod);
- totalEvaluations += evaluations;
}
}
-
- thisTimeEvaluations++;
- totalEvaluations++;
- if (totalEvaluations >= maxEvaluations) {
- break;
- }
}
- return thisTimeEvaluations;
+
+ return totalEvaluations;
}
private SuggestWord[] newPrefix(SuggestWord[] oldPrefix, SuggestWord append) {
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java
index 513f2bb5e28..afb24b3c99a 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java
@@ -54,6 +54,11 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
writer.addDocument(doc);
}
+ {
+ Document doc = new Document();
+ doc.add(newTextField("abba", "A B AB ABA BAB", Field.Store.NO));
+ writer.addDocument(doc);
+ }
{
Document doc = new Document();
doc.add(newTextField("numbers", "thou hast sand betwixt thy toes", Field.Store.NO));
@@ -80,6 +85,33 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
super.tearDown();
}
+ public void testMaxEvaluations() throws Exception {
+ final int maxEvals = 100;
+
+ try (IndexReader ir = DirectoryReader.open(dir)) {
+ WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
+ wbsp.setMaxChanges(10);
+ wbsp.setMinBreakWordLength(1);
+ wbsp.setMinSuggestionFrequency(1);
+ wbsp.setMaxEvaluations(100);
+
+ Term term = new Term("abba", "ab".repeat(5));
+ SuggestWord[][] sw =
+ wbsp.suggestWordBreaks(
+ term,
+ 500,
+ ir,
+ SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX,
+ BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
+
+ // sanity check that our suggester isn't completely broken
+ assertThat(sw.length, org.hamcrest.Matchers.greaterThan(0));
+
+ // if maxEvaluations is respected, we can't possibly have more suggestions then that.
+ assertThat(sw.length, org.hamcrest.Matchers.lessThan(maxEvals));
+ }
+ }
+
public void testCombiningWords() throws Exception {
IndexReader ir = DirectoryReader.open(dir);
WordBreakSpellChecker wbsp = new WordBreakSpellChecker();