You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2016/09/15 19:45:54 UTC

lucene-solr:branch_6x: LUCENE-7439: FuzzyQuery now matches all terms within the specified edit distance, even if they are short

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_6x bd9962aba -> 471f90cf8


LUCENE-7439: FuzzyQuery now matches all terms within the specified edit distance, even if they are short


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/471f90cf
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/471f90cf
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/471f90cf

Branch: refs/heads/branch_6x
Commit: 471f90cf825ee3106fef1fa4c1094d0ca461e7fb
Parents: bd9962a
Author: Mike McCandless <mi...@apache.org>
Authored: Thu Sep 15 15:45:41 2016 -0400
Committer: Mike McCandless <mi...@apache.org>
Committed: Thu Sep 15 15:45:41 2016 -0400

----------------------------------------------------------------------
 lucene/CHANGES.txt                               |  3 +++
 .../org/apache/lucene/search/FuzzyTermsEnum.java |  2 +-
 .../apache/lucene/search/TopTermsRewrite.java    |  4 +++-
 .../lucene/search/FuzzyTermOnShortTermsTest.java | 15 ++++++++-------
 .../org/apache/lucene/search/TestFuzzyQuery.java | 10 ++++------
 .../sandbox/queries/TestSlowFuzzyQuery.java      | 19 +++++++++++++++----
 6 files changed, 34 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/471f90cf/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 964719a..522da2f 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -32,6 +32,9 @@ Bug Fixes
 
 Improvements
 
+* LUCENE-7439: FuzzyQuery now matches all terms within the specified
+  edit distance, even if they are short terms (Mike McCandless)
+
 Optimizations
 
 Other

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/471f90cf/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
index 66a64e1..37f16b4 100644
--- a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
@@ -350,7 +350,7 @@ public class FuzzyTermsEnum extends TermsEnum {
         final int codePointCount = UnicodeUtil.codePointCount(term);
         final float similarity = 1.0f - ((float) ed / (float) 
             (Math.min(codePointCount, termLength)));
-        if (similarity > minSimilarity) {
+        if (minSimilarity == 0 || similarity > minSimilarity) {
           boostAtt.setBoost((similarity - minSimilarity) * scale_factor);
           //System.out.println("  yes");
           return AcceptStatus.YES;

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/471f90cf/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java b/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java
index 013171d..b75836e 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java
@@ -160,7 +160,9 @@ public abstract class TopTermsRewrite<B> extends TermCollectingRewrite<B> {
 
     for (final ScoreTerm st : scoreTerms) {
       final Term term = new Term(query.field, st.bytes.toBytesRef());
-      addClause(b, term, st.termState.docFreq(), st.boost, st.termState); // add to query
+      // We allow negative term scores (fuzzy query does this, for example) while collecting the terms,
+      // but truncate such boosts to 0.0f when building the query:
+      addClause(b, term, st.termState.docFreq(), Math.max(0.0f, st.boost), st.termState); // add to query
     }
     return build(b);
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/471f90cf/lucene/core/src/test/org/apache/lucene/search/FuzzyTermOnShortTermsTest.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/FuzzyTermOnShortTermsTest.java b/lucene/core/src/test/org/apache/lucene/search/FuzzyTermOnShortTermsTest.java
index 427888b..faf4552 100644
--- a/lucene/core/src/test/org/apache/lucene/search/FuzzyTermOnShortTermsTest.java
+++ b/lucene/core/src/test/org/apache/lucene/search/FuzzyTermOnShortTermsTest.java
@@ -49,15 +49,16 @@ public class FuzzyTermOnShortTermsTest extends LuceneTestCase {
       countHits(a, new String[]{"abcde"}, new FuzzyQuery(new Term(FIELD, "abc"), 2), 1);
       countHits(a, new String[]{"abc"}, new FuzzyQuery(new Term(FIELD, "abcde"), 2), 1);
       
-      //these don't      
-      countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "a"), 1), 0);
-      countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "ab"), 1), 0);
+      // LUCENE-7439: these now work as well:
       
-      countHits(a, new String[]{"abc"}, new FuzzyQuery(new Term(FIELD, "a"), 2), 0);
-      countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "abc"), 2), 0);
+      countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "a"), 1), 1);
+      countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "ab"), 1), 1);
+      
+      countHits(a, new String[]{"abc"}, new FuzzyQuery(new Term(FIELD, "a"), 2), 1);
+      countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "abc"), 2), 1);
 
-      countHits(a, new String[]{"abcd"}, new FuzzyQuery(new Term(FIELD, "ab"), 2), 0);
-      countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "abcd"), 2), 0);
+      countHits(a, new String[]{"abcd"}, new FuzzyQuery(new Term(FIELD, "ab"), 2), 1);
+      countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "abcd"), 2), 1);
    }
    
    private void countHits(Analyzer analyzer, String[] docs, Query q, int expected) throws Exception {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/471f90cf/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
index 1e90525..62e63ea 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
@@ -543,12 +543,10 @@ public class TestFuzzyQuery extends LuceneTestCase {
           continue;
         }
         int ed = getDistance(term, queryTerm);
-        if (Math.min(queryTerm.length(), term.length()) > ed) {        
-          float score = 1f - (float) ed / (float) Math.min(queryTerm.length(), term.length());
-          while (ed < 3) {
-            expected[ed].add(new TermAndScore(term, score));
-            ed++;
-          }
+        float score = 1f - (float) ed / (float) Math.min(queryTerm.length(), term.length());
+        while (ed < 3) {
+          expected[ed].add(new TermAndScore(term, score));
+          ed++;
         }
       }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/471f90cf/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery.java
----------------------------------------------------------------------
diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery.java
index 922213f..3ff1f3b 100644
--- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery.java
@@ -16,9 +16,11 @@
  */
 package org.apache.lucene.sandbox.queries;
 
-import java.util.List;
-import java.util.Arrays;
 import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
 
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -472,8 +474,17 @@ public class TestSlowFuzzyQuery extends LuceneTestCase {
     q = new SlowFuzzyQuery(new Term("field", "a"), 6f, 0, 50);
     hits = searcher.search(q, 10).scoreDocs;
     assertEquals(2, hits.length);
-    assertEquals("test", searcher.doc(hits[0].doc).get("field"));
-    assertEquals("foobar", searcher.doc(hits[1].doc).get("field"));
+
+    // We cannot expect a particular order since both hits have a 0.0 score:
+    Set<String> actual = new HashSet<>();
+    actual.add(searcher.doc(hits[0].doc).get("field"));
+    actual.add(searcher.doc(hits[1].doc).get("field"));
+
+    Set<String> expected = new HashSet<>();
+    expected.add("test");
+    expected.add("foobar");
+    
+    assertEquals(expected, actual);
     
     reader.close();
     index.close();