You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain-text copy.
Posted to commits@lucene.apache.org by rm...@apache.org on 2021/02/23 11:58:33 UTC

[lucene-solr] branch master updated: LUCENE-9804: Hunspell: fix most similar dictionary entry search by reversing the comparator (#2419)

This is an automated email from the ASF dual-hosted git repository.

rmuir pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new c61b458  LUCENE-9804: Hunspell: fix most similar dictionary entry search by reversing the comparator (#2419)
c61b458 is described below

commit c61b458719909ab9a1e395d3985f7f9cd50f3390
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Tue Feb 23 12:58:22 2021 +0100

    LUCENE-9804: Hunspell: fix most similar dictionary entry search by reversing the comparator (#2419)
---
 .../analysis/hunspell/GeneratingSuggester.java     |   4 +-
 .../analysis/hunspell/TestSpellChecking.java       |   4 +
 .../org/apache/lucene/analysis/hunspell/ngram.aff  |   0
 .../org/apache/lucene/analysis/hunspell/ngram.dic  | 202 +++++++++++++++++++++
 .../org/apache/lucene/analysis/hunspell/ngram.sug  |   1 +
 .../apache/lucene/analysis/hunspell/ngram.wrong    |   1 +
 6 files changed, 211 insertions(+), 1 deletion(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
index 500ae15..c1bbdf9 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@@ -22,6 +22,7 @@ import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_STRIP_ORD;
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Comparator;
 import java.util.EnumSet;
 import java.util.LinkedHashSet;
 import java.util.List;
@@ -60,7 +61,8 @@ class GeneratingSuggester {
 
   private List<Weighted<Root<String>>> findSimilarDictionaryEntries(
       String word, WordCase originalCase) {
-    PriorityQueue<Weighted<Root<String>>> roots = new PriorityQueue<>();
+    Comparator<Weighted<Root<String>>> natural = Comparator.naturalOrder();
+    PriorityQueue<Weighted<Root<String>>> roots = new PriorityQueue<>(natural.reversed());
     processFST(
         dictionary.words,
         (key, forms) -> {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
index 7737531..c131237 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
@@ -201,6 +201,10 @@ public class TestSpellChecking extends LuceneTestCase {
     doTest("sug2");
   }
 
+  public void testGeneratedSuggestions() throws Exception {
+    doTest("ngram");
+  }
+
   public void testMaxNGramSugsDefaultIsNotUnlimited() throws Exception {
     doTest("maxNGramSugsDefault");
   }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.aff
new file mode 100644
index 0000000..e69de29
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.dic
new file mode 100644
index 0000000..69f15a8
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.dic
@@ -0,0 +1,202 @@
+100
+A
+AA
+AAA
+AB
+ABA
+ABC
+ABM
+ABS
+AC
+ACLU
+ACT
+ACTH
+AD
+ADC
+ADD
+ADM
+ADP
+AF
+AFAIK
+AFB
+AFC
+AFDC
+AFN
+AFT
+AI
+AIDS
+AK
+AL
+AM
+AMA
+AMD
+ANSI
+ANZUS
+AOL
+AP
+APB
+APC
+API
+APO
+APR
+AR
+ARC
+ASAP
+ASCII
+ASL
+ASPCA
+ATM
+ATP
+ATV
+AV
+AVI
+AWACS
+AWOL
+AWS
+AZ
+AZT
+Aachen
+Aaliyah
+Aaron
+Abbas
+Abbasid
+Abbott
+Abby
+Abdul
+Abe
+Abel
+Abelard
+Abelson
+Aberdeen
+Abernathy
+Abidjan
+Abigail
+Abilene
+Abner
+Aborigine
+Abraham
+Abram
+Abrams
+Absalom
+Abuja
+Abyssinia
+Abyssinian
+Ac
+Acadia
+Acapulco
+Accenture
+Accra
+Acevedo
+Achaean
+Achebe
+Achernar
+Acheson
+Achilles
+Aconcagua
+Acosta
+Acropolis
+Acrux
+Actaeon
+Acton
+Acts
+Acuff
+thermostat
+squeaker
+Theron
+heather
+taker
+Thermos
+thinker
+theorist
+theorize
+theatrics
+therapeutic
+lawbreaker
+Northeast
+weather
+tiebreaker
+their
+therm
+there
+therefor
+theta
+theoretic
+thereunder
+Theiler
+therapist
+thematic
+therewith
+icebreaker
+Thespian
+sneaker
+theater
+breaker
+speaker
+Heather
+Whitaker
+toolmaker
+Dorothea
+Thermopylae
+thereto
+theocracy
+thereby
+ethereal
+theremin
+caretaker
+thereat
+Theravada
+threadlike
+therein
+thereafter
+thereupon
+streaker
+thereof
+they're
+thereon
+jawbreaker
+shoemaker
+shaker
+threader
+Thackeray
+thermionic
+heartbreak
+therapy
+thesauri
+feathery
+theatricals
+takeover
+leather
+thespian
+thunderhead
+Katheryn
+thereunto
+thereabout
+feather
+Shaker
+therefrom
+Thea
+leathery
+beaker
+therefore
+thesaurus
+homemaker
+theory
+theorem
+theocratic
+Therese
+Theresa
+Theodore
+Theodora
+Theodoric
+teakettle
+Thatcher
+theatergoer
+Katherine
+watchmaker
+theatrical
+haymaker
+breather
+thither
+thwacker
+thermal
+thermos
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.sug
new file mode 100644
index 0000000..4e7b10c
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.sug
@@ -0,0 +1 @@
+Theater, Heather, Thereat
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.wrong
new file mode 100644
index 0000000..2ce830e
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.wrong
@@ -0,0 +1 @@
+Theaker
\ No newline at end of file