You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/10/21 15:00:04 UTC
svn commit: r1400635 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/core/ lucene/core/src/java/org/apache/lucene/util/fst/ lucene/suggest/
lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/
lucene/suggest/src/test/org/apache/lu...
Author: mikemccand
Date: Sun Oct 21 13:00:04 2012
New Revision: 1400635
URL: http://svn.apache.org/viewvc?rev=1400635&view=rev
Log:
LUCENE-4481: add back some optos
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/core/ (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
lucene/dev/branches/branch_4x/lucene/suggest/ (props changed)
lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java?rev=1400635&r1=1400634&r2=1400635&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java Sun Oct 21 13:00:04 2012
@@ -266,6 +266,7 @@ public final class Util {
private final FST<T> fst;
private final FST.BytesReader bytesReader;
private final int topN;
+ private final int maxQueueDepth;
private final FST.Arc<T> scratchArc = new FST.Arc<T>();
@@ -273,10 +274,11 @@ public final class Util {
TreeSet<FSTPath<T>> queue = null;
- public TopNSearcher(FST<T> fst, int topN, Comparator<T> comparator) {
+ public TopNSearcher(FST<T> fst, int topN, int maxQueueDepth, Comparator<T> comparator) {
this.fst = fst;
this.bytesReader = fst.getBytesReader(0);
this.topN = topN;
+ this.maxQueueDepth = maxQueueDepth;
this.comparator = comparator;
queue = new TreeSet<FSTPath<T>>();
@@ -290,9 +292,7 @@ public final class Util {
T cost = fst.outputs.add(path.cost, path.arc.output);
//System.out.println(" addIfCompetitive queue.size()=" + queue.size() + " path=" + path + " + label=" + path.arc.label);
- // LUCENE-4481: TODO: re-enable this pruning if we can make this admissible:
- /*
- if (queue.size() == topN) {
+ if (queue.size() == maxQueueDepth) {
FSTPath<T> bottom = queue.last();
int comp = comparator.compare(cost, bottom.cost);
if (comp > 0) {
@@ -314,7 +314,6 @@ public final class Util {
} else {
// Queue isn't full yet, so any path we hit competes:
}
- */
// copy over the current input to the new input
// and add the arc.label to the end
@@ -326,12 +325,9 @@ public final class Util {
queue.add(newPath);
- // LUCENE-4481: TODO: re-enable this pruning if we can make this admissible:
- /*
- if (queue.size() == topN+1) {
+ if (queue.size() == maxQueueDepth+1) {
queue.pollLast();
}
- */
}
/** Adds all leaving arcs, including 'finished' arc, if
@@ -375,6 +371,7 @@ public final class Util {
// TODO: maybe we should make an FST.INPUT_TYPE.BYTE0.5!?
// (nibbles)
+ int rejectCount = 0;
// For each top N path:
while (results.size() < topN) {
@@ -404,13 +401,10 @@ public final class Util {
continue;
}
- // LUCENE-4481: TODO: re-enable this pruning if we can make this admissible:
- /*
- if (results.size() == topN-1) {
+ if (results.size() == topN-1 && maxQueueDepth == topN) {
// Last path -- don't bother w/ queue anymore:
queue = null;
}
- */
//System.out.println(" path: " + path);
@@ -467,6 +461,9 @@ public final class Util {
T finalOutput = fst.outputs.add(path.cost, path.arc.output);
if (acceptResult(path.input, finalOutput)) {
results.add(new MinResult<T>(path.input, finalOutput, comparator));
+ } else {
+ rejectCount++;
+ assert rejectCount + topN <= maxQueueDepth: "maxQueueDepth (" + maxQueueDepth + ") is too small for topN (" + topN + "): rejected " + rejectCount + " paths";
}
break;
} else {
@@ -519,7 +516,10 @@ public final class Util {
* PositiveIntOutputs#getSingleton}). */
public static <T> MinResult<T>[] shortestPaths(FST<T> fst, FST.Arc<T> fromNode, T startOutput, Comparator<T> comparator, int topN,
boolean allowEmptyString) throws IOException {
- TopNSearcher<T> searcher = new TopNSearcher<T>(fst, topN, comparator);
+
+ // All paths are kept, so we can pass topN for
+ // maxQueueDepth and the pruning is admissible:
+ TopNSearcher<T> searcher = new TopNSearcher<T>(fst, topN, topN, comparator);
// since this search is initialized with a single start node
// it is okay to start with an empty input path here
Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java?rev=1400635&r1=1400634&r2=1400635&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java Sun Oct 21 13:00:04 2012
@@ -36,6 +36,8 @@ import org.apache.lucene.search.suggest.
import org.apache.lucene.search.suggest.fst.Sort;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.ArrayUtil;
@@ -161,6 +163,11 @@ public class AnalyzingSuggester extends
* SynonymFilter). */
private final int maxGraphExpansions;
+ /** Highest number of analyzed paths we saw for any single
+ * input surface form. For analyzers that never create
+ * graphs this will always be 1. */
+ private int maxAnalyzedPathsForOneInput;
+
/**
* Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)
* AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
@@ -354,6 +361,8 @@ public class AnalyzingSuggester extends
// don't have to alloc [possibly biggish]
// intermediate HashSet in RAM:
Set<IntsRef> paths = SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
+ maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());
+
for (IntsRef path : paths) {
Util.toBytesRef(path, scratch);
@@ -469,8 +478,10 @@ public class AnalyzingSuggester extends
@Override
public boolean store(OutputStream output) throws IOException {
+ DataOutput dataOut = new OutputStreamDataOutput(output);
try {
- fst.save(new OutputStreamDataOutput(output));
+ fst.save(dataOut);
+ dataOut.writeVInt(maxAnalyzedPathsForOneInput);
} finally {
IOUtils.close(output);
}
@@ -479,8 +490,10 @@ public class AnalyzingSuggester extends
@Override
public boolean load(InputStream input) throws IOException {
+ DataInput dataIn = new InputStreamDataInput(input);
try {
- this.fst = new FST<Pair<Long,BytesRef>>(new InputStreamDataInput(input), new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton()));
+ this.fst = new FST<Pair<Long,BytesRef>>(dataIn, new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton()));
+ maxAnalyzedPathsForOneInput = dataIn.readVInt();
} finally {
IOUtils.close(input);
}
@@ -529,7 +542,7 @@ public class AnalyzingSuggester extends
FST.Arc<Pair<Long,BytesRef>> scratchArc = new FST.Arc<Pair<Long,BytesRef>>();
- List<LookupResult> results = new ArrayList<LookupResult>();
+ final List<LookupResult> results = new ArrayList<LookupResult>();
if (exactFirst) {
@@ -545,7 +558,7 @@ public class AnalyzingSuggester extends
// Searcher just to find the single exact only
// match, if present:
Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
- searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
+ searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
// NOTE: we could almost get away with only using
// the first start node. The only catch is if
@@ -591,18 +604,17 @@ public class AnalyzingSuggester extends
Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst,
- num,
+ num - results.size(),
+ num * maxAnalyzedPathsForOneInput,
weightComparator) {
private final Set<BytesRef> seen = new HashSet<BytesRef>();
@Override
protected boolean acceptResult(IntsRef input, Pair<Long,BytesRef> output) {
- //System.out.println("ACCEPT? path=" + input);
// Dedup: when the input analyzes to a graph we
// can get duplicate surface forms:
if (seen.contains(output.output2)) {
- //System.out.println("SKIP: dup");
return false;
}
seen.add(output.output2);
@@ -615,7 +627,14 @@ public class AnalyzingSuggester extends
// create duplicate results:
spare.grow(output.output2.length);
UnicodeUtil.UTF8toUTF16(output.output2, spare);
- return CHARSEQUENCE_COMPARATOR.compare(spare, key) != 0;
+ if (CHARSEQUENCE_COMPARATOR.compare(spare, key) == 0) {
+ // We found exact match, which means we should
+ // have already found it in the first search:
+ assert results.size() == 1;
+ return false;
+ } else {
+ return true;
+ }
}
}
};
Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java?rev=1400635&r1=1400634&r2=1400635&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java Sun Oct 21 13:00:04 2012
@@ -17,6 +17,11 @@ package org.apache.lucene.search.suggest
* limitations under the License.
*/
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.FileOutputStream;
+import java.io.OutputStream;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
@@ -824,6 +829,29 @@ public class AnalyzingSuggesterTest exte
assertEquals(4, results.get(1).value);
assertEquals("a b", results.get(2).key);
assertEquals(3, results.get(2).value);
+
+ // Try again after save/load:
+ File tmpDir = _TestUtil.getTempDir("AnalyzingSuggesterTest");
+ tmpDir.mkdir();
+
+ File path = new File(tmpDir, "suggester");
+
+ OutputStream os = new FileOutputStream(path);
+ suggester.store(os);
+ os.close();
+
+ InputStream is = new FileInputStream(path);
+ suggester.load(is);
+ is.close();
+
+ results = suggester.lookup("a", false, 3);
+ assertEquals(3, results.size());
+ assertEquals("a", results.get(0).key);
+ assertEquals(5, results.get(0).value);
+ assertEquals("a c", results.get(1).key);
+ assertEquals(4, results.get(1).value);
+ assertEquals("a b", results.get(2).key);
+ assertEquals(3, results.get(2).value);
}
public void testDupSurfaceFormsMissingResults() throws Exception {
@@ -863,6 +891,27 @@ public class AnalyzingSuggesterTest exte
assertEquals(6, results.get(0).value);
assertEquals("nellie", results.get(1).key);
assertEquals(5, results.get(1).value);
+
+ // Try again after save/load:
+ File tmpDir = _TestUtil.getTempDir("AnalyzingSuggesterTest");
+ tmpDir.mkdir();
+
+ File path = new File(tmpDir, "suggester");
+
+ OutputStream os = new FileOutputStream(path);
+ suggester.store(os);
+ os.close();
+
+ InputStream is = new FileInputStream(path);
+ suggester.load(is);
+ is.close();
+
+ results = suggester.lookup("nellie", false, 2);
+ assertEquals(2, results.size());
+ assertEquals("hambone", results.get(0).key);
+ assertEquals(6, results.get(0).value);
+ assertEquals("nellie", results.get(1).key);
+ assertEquals(5, results.get(1).value);
}
public void testDupSurfaceFormsMissingResults2() throws Exception {
@@ -912,5 +961,26 @@ public class AnalyzingSuggesterTest exte
assertEquals(6, results.get(0).value);
assertEquals("b", results.get(1).key);
assertEquals(5, results.get(1).value);
+
+ // Try again after save/load:
+ File tmpDir = _TestUtil.getTempDir("AnalyzingSuggesterTest");
+ tmpDir.mkdir();
+
+ File path = new File(tmpDir, "suggester");
+
+ OutputStream os = new FileOutputStream(path);
+ suggester.store(os);
+ os.close();
+
+ InputStream is = new FileInputStream(path);
+ suggester.load(is);
+ is.close();
+
+ results = suggester.lookup("a", false, 2);
+ assertEquals(2, results.size());
+ assertEquals("a", results.get(0).key);
+ assertEquals(6, results.get(0).value);
+ assertEquals("b", results.get(1).key);
+ assertEquals(5, results.get(1).value);
}
}