You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2016/09/15 18:44:40 UTC

lucene-solr:master: LUCENE-7439: FuzzyQuery now matches all terms within the specified edit distance, even if they are short

Repository: lucene-solr
Updated Branches:
  refs/heads/master ea31811e4 -> c7fb49d7b


LUCENE-7439: FuzzyQuery now matches all terms within the specified edit distance, even if they are short


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/c7fb49d7
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/c7fb49d7
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/c7fb49d7

Branch: refs/heads/master
Commit: c7fb49d7b50d171c6d787253b9ab575218fef7fe
Parents: ea31811
Author: Mike McCandless <mi...@apache.org>
Authored: Thu Sep 15 14:44:26 2016 -0400
Committer: Mike McCandless <mi...@apache.org>
Committed: Thu Sep 15 14:44:26 2016 -0400

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  3 ++
 .../lucene/codecs/blocktree/FieldReader.java    |  2 +-
 .../apache/lucene/search/FuzzyTermsEnum.java    | 56 ++++++++------------
 .../apache/lucene/search/TopTermsRewrite.java   |  4 +-
 .../search/FuzzyTermOnShortTermsTest.java       | 15 +++---
 .../apache/lucene/search/TestFuzzyQuery.java    | 10 ++--
 .../lucene/index/AssertingLeafReader.java       |  5 ++
 7 files changed, 46 insertions(+), 49 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c7fb49d7/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index e5f9afd..6102ea4 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -22,6 +22,9 @@ Bug Fixes
 
 Improvements
 
+* LUCENE-7439: FuzzyQuery now matches all terms within the specified
+  edit distance, even if they are short terms (Mike McCandless)
+
 Optimizations
 
 * LUCENE-7416: BooleanQuery optimizes queries that have queries that occur both

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c7fb49d7/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java
index 1e92a43..7f13a32 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java
@@ -201,6 +201,6 @@ public final class FieldReader extends Terms implements Accountable {
 
   @Override
   public String toString() {
-    return "BlockTreeTerms(terms=" + numTerms + ",postings=" + sumDocFreq + ",positions=" + sumTotalTermFreq + ",docs=" + docCount + ")";
+    return "BlockTreeTerms(seg=" + parent.segment +" terms=" + numTerms + ",postings=" + sumDocFreq + ",positions=" + sumTotalTermFreq + ",docs=" + docCount + ")";
   }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c7fb49d7/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
index d30d34e..881c5dd 100644
--- a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
@@ -189,9 +189,6 @@ public final class FuzzyTermsEnum extends TermsEnum {
       maxEdits--;
     }
 
-    // TODO: this opto could be improved, e.g. if the worst term in the queue is zzzz with ed=2, then, really, on the next segment, we
-    // should only be looking for ed=1 terms up until zzzz, then ed=2.  Tricky :)
-    
     if (oldMaxEdits != maxEdits || lastTerm == null) {
       // This is a very powerful optimization: the maximum edit distance has changed.  This happens because we collect only the top scoring
       // N (= 50, by default) terms, and if e.g. maxEdits=2, and the queue is now full of matching terms, and we notice that the worst entry
@@ -211,43 +208,34 @@ public final class FuzzyTermsEnum extends TermsEnum {
 
     BytesRef term;
 
-    // while loop because we skip short terms even if they are within the specified edit distance (see the NOTE in FuzzyQuery class javadocs)
-    while (true) {
-
-      term = actualEnum.next();
-      if (term == null) {
-        // end
-        break;
-      }
+    term = actualEnum.next();
+    if (term == null) {
+      // end
+      return null;
+    }
 
-      int ed = maxEdits;
-      
-      // we know the outer DFA always matches.
-      // now compute exact edit distance
-      while (ed > 0) {
-        if (matches(term, ed - 1)) {
-          ed--;
-        } else {
-          break;
-        }
-      }
+    int ed = maxEdits;
       
-      if (ed == 0) { // exact match
-        boostAtt.setBoost(1.0F);
-        break;
+    // we know the outer DFA always matches.
+    // now compute exact edit distance
+    while (ed > 0) {
+      if (matches(term, ed - 1)) {
+        ed--;
       } else {
-        final int codePointCount = UnicodeUtil.codePointCount(term);
-        int minTermLength = Math.min(codePointCount, termLength);
-
-        // only accept a matching term if it's longer than the edit distance:
-        if (minTermLength > ed) {
-          float similarity = 1.0f - (float) ed / (float) minTermLength;
-          boostAtt.setBoost(similarity);
-          break;
-        }
+        break;
       }
     }
       
+    if (ed == 0) { // exact match
+      boostAtt.setBoost(1.0F);
+    } else {
+      final int codePointCount = UnicodeUtil.codePointCount(term);
+      int minTermLength = Math.min(codePointCount, termLength);
+
+      float similarity = 1.0f - (float) ed / (float) minTermLength;
+      boostAtt.setBoost(similarity);
+    }
+      
     final float bottom = maxBoostAtt.getMaxNonCompetitiveBoost();
     final BytesRef bottomTerm = maxBoostAtt.getCompetitiveTerm();
     if (term != null && (bottom != this.bottom || bottomTerm != this.bottomTerm)) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c7fb49d7/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java b/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java
index 013171d..b75836e 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java
@@ -160,7 +160,9 @@ public abstract class TopTermsRewrite<B> extends TermCollectingRewrite<B> {
 
     for (final ScoreTerm st : scoreTerms) {
       final Term term = new Term(query.field, st.bytes.toBytesRef());
-      addClause(b, term, st.termState.docFreq(), st.boost, st.termState); // add to query
+      // We allow negative term scores (fuzzy query does this, for example) while collecting the terms,
+      // but truncate such boosts to 0.0f when building the query:
+      addClause(b, term, st.termState.docFreq(), Math.max(0.0f, st.boost), st.termState); // add to query
     }
     return build(b);
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c7fb49d7/lucene/core/src/test/org/apache/lucene/search/FuzzyTermOnShortTermsTest.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/FuzzyTermOnShortTermsTest.java b/lucene/core/src/test/org/apache/lucene/search/FuzzyTermOnShortTermsTest.java
index 427888b..e4c633c 100644
--- a/lucene/core/src/test/org/apache/lucene/search/FuzzyTermOnShortTermsTest.java
+++ b/lucene/core/src/test/org/apache/lucene/search/FuzzyTermOnShortTermsTest.java
@@ -48,16 +48,17 @@ public class FuzzyTermOnShortTermsTest extends LuceneTestCase {
 
       countHits(a, new String[]{"abcde"}, new FuzzyQuery(new Term(FIELD, "abc"), 2), 1);
       countHits(a, new String[]{"abc"}, new FuzzyQuery(new Term(FIELD, "abcde"), 2), 1);
+
+      // LUCENE-7439: these now work as well:
       
-      //these don't      
-      countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "a"), 1), 0);
-      countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "ab"), 1), 0);
+      countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "a"), 1), 1);
+      countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "ab"), 1), 1);
       
-      countHits(a, new String[]{"abc"}, new FuzzyQuery(new Term(FIELD, "a"), 2), 0);
-      countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "abc"), 2), 0);
+      countHits(a, new String[]{"abc"}, new FuzzyQuery(new Term(FIELD, "a"), 2), 1);
+      countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "abc"), 2), 1);
 
-      countHits(a, new String[]{"abcd"}, new FuzzyQuery(new Term(FIELD, "ab"), 2), 0);
-      countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "abcd"), 2), 0);
+      countHits(a, new String[]{"abcd"}, new FuzzyQuery(new Term(FIELD, "ab"), 2), 1);
+      countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "abcd"), 2), 1);
    }
    
    private void countHits(Analyzer analyzer, String[] docs, Query q, int expected) throws Exception {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c7fb49d7/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
index 1e90525..62e63ea 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
@@ -543,12 +543,10 @@ public class TestFuzzyQuery extends LuceneTestCase {
           continue;
         }
         int ed = getDistance(term, queryTerm);
-        if (Math.min(queryTerm.length(), term.length()) > ed) {        
-          float score = 1f - (float) ed / (float) Math.min(queryTerm.length(), term.length());
-          while (ed < 3) {
-            expected[ed].add(new TermAndScore(term, score));
-            ed++;
-          }
+        float score = 1f - (float) ed / (float) Math.min(queryTerm.length(), term.length());
+        while (ed < 3) {
+          expected[ed].add(new TermAndScore(term, score));
+          ed++;
         }
       }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c7fb49d7/lucene/test-framework/src/java/org/apache/lucene/index/AssertingLeafReader.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/AssertingLeafReader.java b/lucene/test-framework/src/java/org/apache/lucene/index/AssertingLeafReader.java
index b4bcb1e..c33db57 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/AssertingLeafReader.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/AssertingLeafReader.java
@@ -126,6 +126,11 @@ public class AssertingLeafReader extends FilterLeafReader {
       assert termsEnum != null;
       return new AssertingTermsEnum(termsEnum);
     }
+
+    @Override
+    public String toString() {
+      return "AssertingTerms(" + in + ")";
+    }
   }
   
   static final VirtualMethod<TermsEnum> SEEK_EXACT = new VirtualMethod<>(TermsEnum.class, "seekExact", BytesRef.class);