You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/10/21 15:00:04 UTC

svn commit: r1400635 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/core/ lucene/core/src/java/org/apache/lucene/util/fst/ lucene/suggest/ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/ lucene/suggest/src/test/org/apache/lu...

Author: mikemccand
Date: Sun Oct 21 13:00:04 2012
New Revision: 1400635

URL: http://svn.apache.org/viewvc?rev=1400635&view=rev
Log:
LUCENE-4481: add back some optos

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
    lucene/dev/branches/branch_4x/lucene/suggest/   (props changed)
    lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
    lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java?rev=1400635&r1=1400634&r2=1400635&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java Sun Oct 21 13:00:04 2012
@@ -266,6 +266,7 @@ public final class Util {
     private final FST<T> fst;
     private final FST.BytesReader bytesReader;
     private final int topN;
+    private final int maxQueueDepth;
 
     private final FST.Arc<T> scratchArc = new FST.Arc<T>();
     
@@ -273,10 +274,11 @@ public final class Util {
 
     TreeSet<FSTPath<T>> queue = null;
 
-    public TopNSearcher(FST<T> fst, int topN, Comparator<T> comparator) {
+    public TopNSearcher(FST<T> fst, int topN, int maxQueueDepth, Comparator<T> comparator) {
       this.fst = fst;
       this.bytesReader = fst.getBytesReader(0);
       this.topN = topN;
+      this.maxQueueDepth = maxQueueDepth;
       this.comparator = comparator;
 
       queue = new TreeSet<FSTPath<T>>();
@@ -290,9 +292,7 @@ public final class Util {
       T cost = fst.outputs.add(path.cost, path.arc.output);
       //System.out.println("  addIfCompetitive queue.size()=" + queue.size() + " path=" + path + " + label=" + path.arc.label);
 
-      // LUCENE-4481: TODO: re-enable this pruning if we can make this admissible:
-      /*
-      if (queue.size() == topN) {
+      if (queue.size() == maxQueueDepth) {
         FSTPath<T> bottom = queue.last();
         int comp = comparator.compare(cost, bottom.cost);
         if (comp > 0) {
@@ -314,7 +314,6 @@ public final class Util {
       } else {
         // Queue isn't full yet, so any path we hit competes:
       }
-      */
 
       // copy over the current input to the new input
       // and add the arc.label to the end
@@ -326,12 +325,9 @@ public final class Util {
 
       queue.add(newPath);
 
-      // LUCENE-4481: TODO: re-enable this pruning if we can make this admissible:
-      /*
-      if (queue.size() == topN+1) {
+      if (queue.size() == maxQueueDepth+1) {
         queue.pollLast();
       }
-      */
     }
 
     /** Adds all leaving arcs, including 'finished' arc, if
@@ -375,6 +371,7 @@ public final class Util {
 
       // TODO: maybe we should make an FST.INPUT_TYPE.BYTE0.5!?
       // (nibbles)
+      int rejectCount = 0;
 
       // For each top N path:
       while (results.size() < topN) {
@@ -404,13 +401,10 @@ public final class Util {
           continue;
         }
 
-        // LUCENE-4481: TODO: re-enable this pruning if we can make this admissible:
-        /*
-        if (results.size() == topN-1) {
+        if (results.size() == topN-1 && maxQueueDepth == topN) {
           // Last path -- don't bother w/ queue anymore:
           queue = null;
         }
-        */
 
         //System.out.println("  path: " + path);
         
@@ -467,6 +461,9 @@ public final class Util {
             T finalOutput = fst.outputs.add(path.cost, path.arc.output);
             if (acceptResult(path.input, finalOutput)) {
               results.add(new MinResult<T>(path.input, finalOutput, comparator));
+            } else {
+              rejectCount++;
+              assert rejectCount + topN <= maxQueueDepth: "maxQueueDepth (" + maxQueueDepth + ") is too small for topN (" + topN + "): rejected " + rejectCount + " paths";
             }
             break;
           } else {
@@ -519,7 +516,10 @@ public final class Util {
    *  PositiveIntOutputs#getSingleton}). */
   public static <T> MinResult<T>[] shortestPaths(FST<T> fst, FST.Arc<T> fromNode, T startOutput, Comparator<T> comparator, int topN,
                                                  boolean allowEmptyString) throws IOException {
-    TopNSearcher<T> searcher = new TopNSearcher<T>(fst, topN, comparator);
+
+    // All paths are kept, so we can pass topN for
+    // maxQueueDepth and the pruning is admissible:
+    TopNSearcher<T> searcher = new TopNSearcher<T>(fst, topN, topN, comparator);
 
     // since this search is initialized with a single start node 
     // it is okay to start with an empty input path here

Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java?rev=1400635&r1=1400634&r2=1400635&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java Sun Oct 21 13:00:04 2012
@@ -36,6 +36,8 @@ import org.apache.lucene.search.suggest.
 import org.apache.lucene.search.suggest.fst.Sort;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.InputStreamDataInput;
 import org.apache.lucene.store.OutputStreamDataOutput;
 import org.apache.lucene.util.ArrayUtil;
@@ -161,6 +163,11 @@ public class AnalyzingSuggester extends 
    *  SynonymFilter). */
   private final int maxGraphExpansions;
 
+  /** Highest number of analyzed paths we saw for any single
+   *  input surface form.  For analyzers that never create
+   *  graphs this will always be 1. */
+  private int maxAnalyzedPathsForOneInput;
+
   /**
    * Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)
    * AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
@@ -354,6 +361,8 @@ public class AnalyzingSuggester extends 
         // don't have to alloc [possibly biggish]
         // intermediate HashSet in RAM:
         Set<IntsRef> paths = SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
+        maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());
+
         for (IntsRef path : paths) {
 
           Util.toBytesRef(path, scratch);
@@ -469,8 +478,10 @@ public class AnalyzingSuggester extends 
 
   @Override
   public boolean store(OutputStream output) throws IOException {
+    DataOutput dataOut = new OutputStreamDataOutput(output);
     try {
-      fst.save(new OutputStreamDataOutput(output));
+      fst.save(dataOut);
+      dataOut.writeVInt(maxAnalyzedPathsForOneInput);
     } finally {
       IOUtils.close(output);
     }
@@ -479,8 +490,10 @@ public class AnalyzingSuggester extends 
 
   @Override
   public boolean load(InputStream input) throws IOException {
+    DataInput dataIn = new InputStreamDataInput(input);
     try {
-      this.fst = new FST<Pair<Long,BytesRef>>(new InputStreamDataInput(input), new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton()));
+      this.fst = new FST<Pair<Long,BytesRef>>(dataIn, new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton()));
+      maxAnalyzedPathsForOneInput = dataIn.readVInt();
     } finally {
       IOUtils.close(input);
     }
@@ -529,7 +542,7 @@ public class AnalyzingSuggester extends 
 
       FST.Arc<Pair<Long,BytesRef>> scratchArc = new FST.Arc<Pair<Long,BytesRef>>();
 
-      List<LookupResult> results = new ArrayList<LookupResult>();
+      final List<LookupResult> results = new ArrayList<LookupResult>();
 
       if (exactFirst) {
 
@@ -545,7 +558,7 @@ public class AnalyzingSuggester extends 
         // Searcher just to find the single exact only
         // match, if present:
         Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
-        searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
+        searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
 
         // NOTE: we could almost get away with only using
         // the first start node.  The only catch is if
@@ -591,18 +604,17 @@ public class AnalyzingSuggester extends 
 
       Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
       searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst,
-                                                            num,
+                                                            num - results.size(),
+                                                            num * maxAnalyzedPathsForOneInput,
                                                             weightComparator) {
         private final Set<BytesRef> seen = new HashSet<BytesRef>();
 
         @Override
         protected boolean acceptResult(IntsRef input, Pair<Long,BytesRef> output) {
 
-          //System.out.println("ACCEPT? path=" + input);
           // Dedup: when the input analyzes to a graph we
           // can get duplicate surface forms:
           if (seen.contains(output.output2)) {
-            //System.out.println("SKIP: dup");
             return false;
           }
           seen.add(output.output2);
@@ -615,7 +627,14 @@ public class AnalyzingSuggester extends 
             // create duplicate results:
             spare.grow(output.output2.length);
             UnicodeUtil.UTF8toUTF16(output.output2, spare);
-            return CHARSEQUENCE_COMPARATOR.compare(spare, key) != 0;
+            if (CHARSEQUENCE_COMPARATOR.compare(spare, key) == 0) {
+              // We found exact match, which means we should
+              // have already found it in the first search:
+              assert results.size() == 1;
+              return false;
+            } else {
+              return true;
+            }
           }
         }
       };

Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java?rev=1400635&r1=1400634&r2=1400635&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java Sun Oct 21 13:00:04 2012
@@ -17,6 +17,11 @@ package org.apache.lucene.search.suggest
  * limitations under the License.
  */
 
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.FileOutputStream;
+import java.io.OutputStream;
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
@@ -824,6 +829,29 @@ public class AnalyzingSuggesterTest exte
     assertEquals(4, results.get(1).value);
     assertEquals("a b", results.get(2).key);
     assertEquals(3, results.get(2).value);
+
+    // Try again after save/load:
+    File tmpDir = _TestUtil.getTempDir("AnalyzingSuggesterTest");
+    tmpDir.mkdir();
+
+    File path = new File(tmpDir, "suggester");
+
+    OutputStream os = new FileOutputStream(path);
+    suggester.store(os);
+    os.close();
+
+    InputStream is = new FileInputStream(path);
+    suggester.load(is);
+    is.close();
+
+    results = suggester.lookup("a", false, 3);
+    assertEquals(3, results.size());
+    assertEquals("a", results.get(0).key);
+    assertEquals(5, results.get(0).value);
+    assertEquals("a c", results.get(1).key);
+    assertEquals(4, results.get(1).value);
+    assertEquals("a b", results.get(2).key);
+    assertEquals(3, results.get(2).value);
   }
 
   public void testDupSurfaceFormsMissingResults() throws Exception {
@@ -863,6 +891,27 @@ public class AnalyzingSuggesterTest exte
     assertEquals(6, results.get(0).value);
     assertEquals("nellie", results.get(1).key);
     assertEquals(5, results.get(1).value);
+
+    // Try again after save/load:
+    File tmpDir = _TestUtil.getTempDir("AnalyzingSuggesterTest");
+    tmpDir.mkdir();
+
+    File path = new File(tmpDir, "suggester");
+
+    OutputStream os = new FileOutputStream(path);
+    suggester.store(os);
+    os.close();
+
+    InputStream is = new FileInputStream(path);
+    suggester.load(is);
+    is.close();
+
+    results = suggester.lookup("nellie", false, 2);
+    assertEquals(2, results.size());
+    assertEquals("hambone", results.get(0).key);
+    assertEquals(6, results.get(0).value);
+    assertEquals("nellie", results.get(1).key);
+    assertEquals(5, results.get(1).value);
   }
 
   public void testDupSurfaceFormsMissingResults2() throws Exception {
@@ -912,5 +961,26 @@ public class AnalyzingSuggesterTest exte
     assertEquals(6, results.get(0).value);
     assertEquals("b", results.get(1).key);
     assertEquals(5, results.get(1).value);
+
+    // Try again after save/load:
+    File tmpDir = _TestUtil.getTempDir("AnalyzingSuggesterTest");
+    tmpDir.mkdir();
+
+    File path = new File(tmpDir, "suggester");
+
+    OutputStream os = new FileOutputStream(path);
+    suggester.store(os);
+    os.close();
+
+    InputStream is = new FileInputStream(path);
+    suggester.load(is);
+    is.close();
+
+    results = suggester.lookup("a", false, 2);
+    assertEquals(2, results.size());
+    assertEquals("a", results.get(0).key);
+    assertEquals(6, results.get(0).value);
+    assertEquals("b", results.get(1).key);
+    assertEquals(5, results.get(1).value);
   }
 }