You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2016/09/15 18:44:40 UTC
lucene-solr:master: LUCENE-7439: FuzzyQuery now matches all terms
within the specified edit distance, even if they are short
Repository: lucene-solr
Updated Branches:
refs/heads/master ea31811e4 -> c7fb49d7b
LUCENE-7439: FuzzyQuery now matches all terms within the specified edit distance, even if they are short
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/c7fb49d7
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/c7fb49d7
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/c7fb49d7
Branch: refs/heads/master
Commit: c7fb49d7b50d171c6d787253b9ab575218fef7fe
Parents: ea31811
Author: Mike McCandless <mi...@apache.org>
Authored: Thu Sep 15 14:44:26 2016 -0400
Committer: Mike McCandless <mi...@apache.org>
Committed: Thu Sep 15 14:44:26 2016 -0400
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 ++
.../lucene/codecs/blocktree/FieldReader.java | 2 +-
.../apache/lucene/search/FuzzyTermsEnum.java | 56 ++++++++------------
.../apache/lucene/search/TopTermsRewrite.java | 4 +-
.../search/FuzzyTermOnShortTermsTest.java | 15 +++---
.../apache/lucene/search/TestFuzzyQuery.java | 10 ++--
.../lucene/index/AssertingLeafReader.java | 5 ++
7 files changed, 46 insertions(+), 49 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c7fb49d7/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index e5f9afd..6102ea4 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -22,6 +22,9 @@ Bug Fixes
Improvements
+* LUCENE-7439: FuzzyQuery now matches all terms within the specified
+ edit distance, even if they are short terms (Mike McCandless)
+
Optimizations
* LUCENE-7416: BooleanQuery optimizes queries that have queries that occur both
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c7fb49d7/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java
index 1e92a43..7f13a32 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java
@@ -201,6 +201,6 @@ public final class FieldReader extends Terms implements Accountable {
@Override
public String toString() {
- return "BlockTreeTerms(terms=" + numTerms + ",postings=" + sumDocFreq + ",positions=" + sumTotalTermFreq + ",docs=" + docCount + ")";
+ return "BlockTreeTerms(seg=" + parent.segment +" terms=" + numTerms + ",postings=" + sumDocFreq + ",positions=" + sumTotalTermFreq + ",docs=" + docCount + ")";
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c7fb49d7/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
index d30d34e..881c5dd 100644
--- a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
@@ -189,9 +189,6 @@ public final class FuzzyTermsEnum extends TermsEnum {
maxEdits--;
}
- // TODO: this opto could be improved, e.g. if the worst term in the queue is zzzz with ed=2, then, really, on the next segment, we
- // should only be looking for ed=1 terms up until zzzz, then ed=2. Tricky :)
-
if (oldMaxEdits != maxEdits || lastTerm == null) {
// This is a very powerful optimization: the maximum edit distance has changed. This happens because we collect only the top scoring
// N (= 50, by default) terms, and if e.g. maxEdits=2, and the queue is now full of matching terms, and we notice that the worst entry
@@ -211,43 +208,34 @@ public final class FuzzyTermsEnum extends TermsEnum {
BytesRef term;
- // while loop because we skip short terms even if they are within the specified edit distance (see the NOTE in FuzzyQuery class javadocs)
- while (true) {
-
- term = actualEnum.next();
- if (term == null) {
- // end
- break;
- }
+ term = actualEnum.next();
+ if (term == null) {
+ // end
+ return null;
+ }
- int ed = maxEdits;
-
- // we know the outer DFA always matches.
- // now compute exact edit distance
- while (ed > 0) {
- if (matches(term, ed - 1)) {
- ed--;
- } else {
- break;
- }
- }
+ int ed = maxEdits;
- if (ed == 0) { // exact match
- boostAtt.setBoost(1.0F);
- break;
+ // we know the outer DFA always matches.
+ // now compute exact edit distance
+ while (ed > 0) {
+ if (matches(term, ed - 1)) {
+ ed--;
} else {
- final int codePointCount = UnicodeUtil.codePointCount(term);
- int minTermLength = Math.min(codePointCount, termLength);
-
- // only accept a matching term if it's longer than the edit distance:
- if (minTermLength > ed) {
- float similarity = 1.0f - (float) ed / (float) minTermLength;
- boostAtt.setBoost(similarity);
- break;
- }
+ break;
}
}
+ if (ed == 0) { // exact match
+ boostAtt.setBoost(1.0F);
+ } else {
+ final int codePointCount = UnicodeUtil.codePointCount(term);
+ int minTermLength = Math.min(codePointCount, termLength);
+
+ float similarity = 1.0f - (float) ed / (float) minTermLength;
+ boostAtt.setBoost(similarity);
+ }
+
final float bottom = maxBoostAtt.getMaxNonCompetitiveBoost();
final BytesRef bottomTerm = maxBoostAtt.getCompetitiveTerm();
if (term != null && (bottom != this.bottom || bottomTerm != this.bottomTerm)) {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c7fb49d7/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java b/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java
index 013171d..b75836e 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java
@@ -160,7 +160,9 @@ public abstract class TopTermsRewrite<B> extends TermCollectingRewrite<B> {
for (final ScoreTerm st : scoreTerms) {
final Term term = new Term(query.field, st.bytes.toBytesRef());
- addClause(b, term, st.termState.docFreq(), st.boost, st.termState); // add to query
+ // We allow negative term scores (fuzzy query does this, for example) while collecting the terms,
+ // but truncate such boosts to 0.0f when building the query:
+ addClause(b, term, st.termState.docFreq(), Math.max(0.0f, st.boost), st.termState); // add to query
}
return build(b);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c7fb49d7/lucene/core/src/test/org/apache/lucene/search/FuzzyTermOnShortTermsTest.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/FuzzyTermOnShortTermsTest.java b/lucene/core/src/test/org/apache/lucene/search/FuzzyTermOnShortTermsTest.java
index 427888b..e4c633c 100644
--- a/lucene/core/src/test/org/apache/lucene/search/FuzzyTermOnShortTermsTest.java
+++ b/lucene/core/src/test/org/apache/lucene/search/FuzzyTermOnShortTermsTest.java
@@ -48,16 +48,17 @@ public class FuzzyTermOnShortTermsTest extends LuceneTestCase {
countHits(a, new String[]{"abcde"}, new FuzzyQuery(new Term(FIELD, "abc"), 2), 1);
countHits(a, new String[]{"abc"}, new FuzzyQuery(new Term(FIELD, "abcde"), 2), 1);
+
+ // LUCENE-7439: these now work as well:
- //these don't
- countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "a"), 1), 0);
- countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "ab"), 1), 0);
+ countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "a"), 1), 1);
+ countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "ab"), 1), 1);
- countHits(a, new String[]{"abc"}, new FuzzyQuery(new Term(FIELD, "a"), 2), 0);
- countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "abc"), 2), 0);
+ countHits(a, new String[]{"abc"}, new FuzzyQuery(new Term(FIELD, "a"), 2), 1);
+ countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "abc"), 2), 1);
- countHits(a, new String[]{"abcd"}, new FuzzyQuery(new Term(FIELD, "ab"), 2), 0);
- countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "abcd"), 2), 0);
+ countHits(a, new String[]{"abcd"}, new FuzzyQuery(new Term(FIELD, "ab"), 2), 1);
+ countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "abcd"), 2), 1);
}
private void countHits(Analyzer analyzer, String[] docs, Query q, int expected) throws Exception {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c7fb49d7/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
index 1e90525..62e63ea 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
@@ -543,12 +543,10 @@ public class TestFuzzyQuery extends LuceneTestCase {
continue;
}
int ed = getDistance(term, queryTerm);
- if (Math.min(queryTerm.length(), term.length()) > ed) {
- float score = 1f - (float) ed / (float) Math.min(queryTerm.length(), term.length());
- while (ed < 3) {
- expected[ed].add(new TermAndScore(term, score));
- ed++;
- }
+ float score = 1f - (float) ed / (float) Math.min(queryTerm.length(), term.length());
+ while (ed < 3) {
+ expected[ed].add(new TermAndScore(term, score));
+ ed++;
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c7fb49d7/lucene/test-framework/src/java/org/apache/lucene/index/AssertingLeafReader.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/AssertingLeafReader.java b/lucene/test-framework/src/java/org/apache/lucene/index/AssertingLeafReader.java
index b4bcb1e..c33db57 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/AssertingLeafReader.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/AssertingLeafReader.java
@@ -126,6 +126,11 @@ public class AssertingLeafReader extends FilterLeafReader {
assert termsEnum != null;
return new AssertingTermsEnum(termsEnum);
}
+
+ @Override
+ public String toString() {
+ return "AssertingTerms(" + in + ")";
+ }
}
static final VirtualMethod<TermsEnum> SEEK_EXACT = new VirtualMethod<>(TermsEnum.class, "seekExact", BytesRef.class);