You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/11/05 22:54:57 UTC

svn commit: r1405978 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/suggest/ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/

Author: mikemccand
Date: Mon Nov  5 21:54:56 2012
New Revision: 1405978

URL: http://svn.apache.org/viewvc?rev=1405978&view=rev
Log:
LUCENE-4534: dedup same surface form in Analyzing/FuzzySuggester

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/suggest/   (props changed)
    lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
    lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java

Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java?rev=1405978&r1=1405977&r2=1405978&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java Mon Nov  5 21:54:56 2012
@@ -390,6 +390,7 @@ public class AnalyzingSuggester extends 
     try {
       ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
       BytesRef surfaceForm;
+
       while ((surfaceForm = iterator.next()) != null) {
         Set<IntsRef> paths = toFiniteStrings(surfaceForm, ts2a);
         
@@ -430,6 +431,10 @@ public class AnalyzingSuggester extends 
 
       // Sort all input/output pairs (required by FST.Builder):
       new Sort(sortComparator).sort(tempInput, tempSorted);
+
+      // Free disk space:
+      tempInput.delete();
+
       reader = new Sort.ByteSequencesReader(tempSorted);
      
       PairOutputs<Long,BytesRef> outputs = new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton());
@@ -442,6 +447,12 @@ public class AnalyzingSuggester extends 
       IntsRef scratchInts = new IntsRef();
       ByteArrayDataInput input = new ByteArrayDataInput();
 
+      // Used to remove duplicate surface forms (but we
+      // still index the hightest-weight one).  We clear
+      // this when we see a new analyzed form, so it cannot
+      // grow unbounded (at most 256 entries):
+      Set<BytesRef> seenSurfaceForms = new HashSet<BytesRef>();
+
       int dedup = 0;
       while (reader.read(scratch)) {
         input.reset(scratch.bytes, scratch.offset, scratch.length);
@@ -459,6 +470,7 @@ public class AnalyzingSuggester extends 
         if (previousAnalyzed == null) {
           previousAnalyzed = new BytesRef();
           previousAnalyzed.copyBytes(analyzed);
+          seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
         } else if (analyzed.equals(previousAnalyzed)) {
           dedup++;
           if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
@@ -466,9 +478,15 @@ public class AnalyzingSuggester extends 
             // dups: skip the rest:
             continue;
           }
+          if (seenSurfaceForms.contains(surface)) {
+            continue;
+          }
+          seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
         } else {
           dedup = 0;
           previousAnalyzed.copyBytes(analyzed);
+          seenSurfaceForms.clear();
+          seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
         }
 
         // TODO: I think we can avoid the extra 2 bytes when

Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java?rev=1405978&r1=1405977&r2=1405978&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java Mon Nov  5 21:54:56 2012
@@ -1031,4 +1031,17 @@ public class AnalyzingSuggesterTest exte
           new TermFreq("a b", 50),
         }));
   }
+
+  public void testDupSurfaceFormsMissingResults3() throws Exception {
+    Analyzer a = new MockAnalyzer(random());
+    AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1);
+    suggester.build(new TermFreqArrayIterator(new TermFreq[] {
+          new TermFreq("a a", 7),
+          new TermFreq("a a", 7),
+          new TermFreq("a c", 6),
+          new TermFreq("a c", 3),
+          new TermFreq("a b", 5),
+        }));
+    assertEquals("[a a/7, a c/6, a b/5]", suggester.lookup("a", false, 3).toString());
+  }
 }