You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/11/05 22:54:24 UTC
svn commit: r1405977 - in /lucene/dev/trunk/lucene/suggest/src:
java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
Author: mikemccand
Date: Mon Nov 5 21:54:24 2012
New Revision: 1405977
URL: http://svn.apache.org/viewvc?rev=1405977&view=rev
Log:
LUCENE-4534: dedup same surface form in Analyzing/FuzzySuggester
Modified:
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
Modified: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java?rev=1405977&r1=1405976&r2=1405977&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (original)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java Mon Nov 5 21:54:24 2012
@@ -390,6 +390,7 @@ public class AnalyzingSuggester extends
try {
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
BytesRef surfaceForm;
+
while ((surfaceForm = iterator.next()) != null) {
Set<IntsRef> paths = toFiniteStrings(surfaceForm, ts2a);
@@ -430,6 +431,10 @@ public class AnalyzingSuggester extends
// Sort all input/output pairs (required by FST.Builder):
new Sort(sortComparator).sort(tempInput, tempSorted);
+
+ // Free disk space:
+ tempInput.delete();
+
reader = new Sort.ByteSequencesReader(tempSorted);
PairOutputs<Long,BytesRef> outputs = new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton());
@@ -442,6 +447,12 @@ public class AnalyzingSuggester extends
IntsRef scratchInts = new IntsRef();
ByteArrayDataInput input = new ByteArrayDataInput();
+ // Used to remove duplicate surface forms (but we
+ // still index the highest-weight one). We clear
+ // this when we see a new analyzed form, so it cannot
+ // grow unbounded (at most 256 entries):
+ Set<BytesRef> seenSurfaceForms = new HashSet<BytesRef>();
+
int dedup = 0;
while (reader.read(scratch)) {
input.reset(scratch.bytes, scratch.offset, scratch.length);
@@ -459,6 +470,7 @@ public class AnalyzingSuggester extends
if (previousAnalyzed == null) {
previousAnalyzed = new BytesRef();
previousAnalyzed.copyBytes(analyzed);
+ seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
} else if (analyzed.equals(previousAnalyzed)) {
dedup++;
if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
@@ -466,9 +478,15 @@ public class AnalyzingSuggester extends
// dups: skip the rest:
continue;
}
+ if (seenSurfaceForms.contains(surface)) {
+ continue;
+ }
+ seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
} else {
dedup = 0;
previousAnalyzed.copyBytes(analyzed);
+ seenSurfaceForms.clear();
+ seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
}
// TODO: I think we can avoid the extra 2 bytes when
Modified: lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java?rev=1405977&r1=1405976&r2=1405977&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (original)
+++ lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java Mon Nov 5 21:54:24 2012
@@ -1031,4 +1031,17 @@ public class AnalyzingSuggesterTest exte
new TermFreq("a b", 50),
}));
}
+
+ public void testDupSurfaceFormsMissingResults3() throws Exception {
+ Analyzer a = new MockAnalyzer(random());
+ AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1);
+ suggester.build(new TermFreqArrayIterator(new TermFreq[] {
+ new TermFreq("a a", 7),
+ new TermFreq("a a", 7),
+ new TermFreq("a c", 6),
+ new TermFreq("a c", 3),
+ new TermFreq("a b", 5),
+ }));
+ assertEquals("[a a/7, a c/6, a b/5]", suggester.lookup("a", false, 3).toString());
+ }
}