You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by si...@apache.org on 2014/03/12 18:20:48 UTC
svn commit: r1576825 - in /lucene/dev/trunk/lucene: ./
core/src/java/org/apache/lucene/util/fst/
core/src/test/org/apache/lucene/util/fst/
suggest/src/java/org/apache/lucene/search/suggest/analyzing/
suggest/src/java/org/apache/lucene/search/suggest/fst/
Author: simonw
Date: Wed Mar 12 17:20:47 2014
New Revision: 1576825
URL: http://svn.apache.org/r1576825
Log:
LUCENE-5519: Make queueDepth enforcing optional in TopNSearcher
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1576825&r1=1576824&r2=1576825&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Wed Mar 12 17:20:47 2014
@@ -121,6 +121,14 @@ API Changes
also simplified the Weight.scorer API by removing the two confusing
booleans. (Robert Muir, Uwe Schindler, Mike McCandless)
+* LUCENE-5519: TopNSearcher now allows to retrieve incomplete results if the max
+ size of the candidate queue is unknown. The queue can still be bound in order
+ to apply pruning while retrieving the top N but will not throw an exception if
+ too many results are rejected to guarantee an absolutely correct top N result.
+ The TopNSearcher now returns a struct like class that indicates if the result
+ is complete in the sense of the top N or not. Consumers of this API should assert
+ on the completness if the bounded queue size is know ahead of time. (Simon Willnauer)
+
Optimizations
* LUCENE-5468: HunspellStemFilter uses 10 to 100x less RAM. It also loads
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/Util.java?rev=1576825&r1=1576824&r2=1576825&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/Util.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/Util.java Wed Mar 12 17:20:47 2014
@@ -17,14 +17,20 @@ package org.apache.lucene.util.fst;
* limitations under the License.
*/
-import java.io.*;
-import java.util.*;
-
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
+import java.io.IOException;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.TreeSet;
+
/** Static helper methods.
*
* @lucene.experimental */
@@ -294,6 +300,13 @@ public final class Util {
TreeSet<FSTPath<T>> queue = null;
+ /**
+ * Creates an unbounded TopNSearcher
+ * @param fst the {@link org.apache.lucene.util.fst.FST} to search on
+ * @param topN the number of top scoring entries to retrieve
+ * @param maxQueueDepth the maximum size of the queue of possible top entries
+ * @param comparator the comparator to select the top N
+ */
public TopNSearcher(FST<T> fst, int topN, int maxQueueDepth, Comparator<T> comparator) {
this.fst = fst;
this.bytesReader = fst.getBytesReader();
@@ -379,9 +392,9 @@ public final class Util {
}
}
- public MinResult<T>[] search() throws IOException {
+ public TopResults<T> search() throws IOException {
- final List<MinResult<T>> results = new ArrayList<>();
+ final List<Result<T>> results = new ArrayList<>();
//System.out.println("search topN=" + topN);
@@ -422,7 +435,7 @@ public final class Util {
//System.out.println(" empty string! cost=" + path.cost);
// Empty string!
path.input.length--;
- results.add(new MinResult<>(path.input, path.cost));
+ results.add(new Result<>(path.input, path.cost));
continue;
}
@@ -486,10 +499,9 @@ public final class Util {
T finalOutput = fst.outputs.add(path.cost, path.arc.output);
if (acceptResult(path.input, finalOutput)) {
//System.out.println(" add result: " + path);
- results.add(new MinResult<>(path.input, finalOutput));
+ results.add(new Result(path.input, finalOutput));
} else {
rejectCount++;
- assert rejectCount + topN <= maxQueueDepth: "maxQueueDepth (" + maxQueueDepth + ") is too small for topN (" + topN + "): rejected " + rejectCount + " paths";
}
break;
} else {
@@ -500,10 +512,7 @@ public final class Util {
}
}
}
-
- @SuppressWarnings({"rawtypes","unchecked"}) final MinResult<T>[] arr =
- (MinResult<T>[]) new MinResult[results.size()];
- return results.toArray(arr);
+ return new TopResults(rejectCount + topN <= maxQueueDepth, results);
}
protected boolean acceptResult(IntsRef input, T output) {
@@ -513,18 +522,47 @@ public final class Util {
/** Holds a single input (IntsRef) + output, returned by
* {@link #shortestPaths shortestPaths()}. */
- public final static class MinResult<T> {
+ public final static class Result<T> {
public final IntsRef input;
public final T output;
- public MinResult(IntsRef input, T output) {
+ public Result(IntsRef input, T output) {
this.input = input;
this.output = output;
}
}
+
+ /**
+ * Holds the results for a top N search using {@link TopNSearcher}
+ */
+ public static final class TopResults<T> implements Iterable<Result<T>> {
+
+ /**
+ * <code>true</code> iff this is a complete result ie. if
+ * the specified queue size was large enough to find the complete list of results. This might
+ * be <code>false</code> if the {@link TopNSearcher} rejected too many results.
+ */
+ public final boolean isComplete;
+ /**
+ * The top results
+ */
+ public final List<Result<T>> topN;
+
+ TopResults(boolean isComplete, List<Result<T>> topN) {
+ this.topN = topN;
+ this.isComplete = isComplete;
+ }
+
+ @Override
+ public Iterator<Result<T>> iterator() {
+ return topN.iterator();
+ }
+ }
+
+
/** Starting from node, find the top N min cost
* completions to a final node. */
- public static <T> MinResult<T>[] shortestPaths(FST<T> fst, FST.Arc<T> fromNode, T startOutput, Comparator<T> comparator, int topN,
+ public static <T> TopResults<T> shortestPaths(FST<T> fst, FST.Arc<T> fromNode, T startOutput, Comparator<T> comparator, int topN,
boolean allowEmptyString) throws IOException {
// All paths are kept, so we can pass topN for
Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java?rev=1576825&r1=1576824&r2=1576825&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java Wed Mar 12 17:20:47 2014
@@ -17,17 +17,6 @@ package org.apache.lucene.util.fst;
* limitations under the License.
*/
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.OutputStreamWriter;
-import java.io.StringWriter;
-import java.io.Writer;
-import java.util.*;
-
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -51,9 +40,9 @@ import org.apache.lucene.store.MockDirec
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LineFileDocs;
+import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
-import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
@@ -62,8 +51,32 @@ import org.apache.lucene.util.fst.BytesR
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.PairOutputs.Pair;
+import org.apache.lucene.util.fst.Util.Result;
import org.apache.lucene.util.packed.PackedInts;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.StringWriter;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import java.util.concurrent.atomic.AtomicInteger;
+
import static org.apache.lucene.util.fst.FSTTester.getRandomString;
import static org.apache.lucene.util.fst.FSTTester.simpleRandomString;
import static org.apache.lucene.util.fst.FSTTester.toIntsRef;
@@ -478,7 +491,7 @@ public class TestFSTs extends LuceneTest
ord++;
if (ord % 500000 == 0) {
System.out.println(
- String.format(Locale.ROOT,
+ String.format(Locale.ROOT,
"%6.2fs: %9d...", ((System.currentTimeMillis() - tStart) / 1000.0), ord));
}
if (ord >= limit) {
@@ -645,7 +658,7 @@ public class TestFSTs extends LuceneTest
}
idx++;
}
-
+
if (wordsFileIn == null) {
System.err.println("No input file.");
System.exit(-1);
@@ -712,7 +725,7 @@ public class TestFSTs extends LuceneTest
assertNull(fstEnum.seekFloor(new BytesRef("foo")));
assertNull(fstEnum.seekCeil(new BytesRef("foobaz")));
}
-
+
public void testDuplicateFSAString() throws Exception {
String str = "foobar";
@@ -723,15 +736,15 @@ public class TestFSTs extends LuceneTest
b.add(Util.toIntsRef(new BytesRef(str), ints), outputs.getNoOutput());
}
FST<Object> fst = b.finish();
-
+
// count the input paths
- int count = 0;
+ int count = 0;
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(fst);
while(fstEnum.next()!=null) {
- count++;
+ count++;
}
assertEquals(1, count);
-
+
assertNotNull(Util.get(fst, new BytesRef(str)));
assertNull(Util.get(fst, new BytesRef("foobaz")));
}
@@ -840,7 +853,7 @@ public class TestFSTs extends LuceneTest
Document doc = new Document();
Field idField = newStringField("id", "", Field.Store.NO);
doc.add(idField);
-
+
final int NUM_IDS = atLeast(200);
//final int NUM_IDS = (int) (377 * (1.0+random.nextDouble()));
if (VERBOSE) {
@@ -970,7 +983,7 @@ public class TestFSTs extends LuceneTest
Document doc = new Document();
Field f = newStringField("field", "", Field.Store.NO);
doc.add(f);
-
+
final int NUM_TERMS = (int) (1000*RANDOM_MULTIPLIER * (1+random().nextDouble()));
if (VERBOSE) {
System.out.println("TEST: NUM_TERMS=" + NUM_TERMS);
@@ -1016,8 +1029,8 @@ public class TestFSTs extends LuceneTest
/**
* Test state expansion (array format) on close-to-root states. Creates
* synthetic input that has one expanded state on each level.
- *
- * @see "https://issues.apache.org/jira/browse/LUCENE-2933"
+ *
+ * @see "https://issues.apache.org/jira/browse/LUCENE-2933"
*/
public void testExpandedCloseToRoot() throws Exception {
class SyntheticData {
@@ -1037,10 +1050,10 @@ public class TestFSTs extends LuceneTest
term.copyChars(w);
b.add(Util.toIntsRef(term, scratchIntsRef), nothing);
}
-
+
return b.finish();
}
-
+
void generate(ArrayList<String> out, StringBuilder b, char from, char to,
int depth) {
if (depth == 0 || from == to) {
@@ -1055,12 +1068,12 @@ public class TestFSTs extends LuceneTest
}
}
- public int verifyStateAndBelow(FST<Object> fst, Arc<Object> arc, int depth)
+ public int verifyStateAndBelow(FST<Object> fst, Arc<Object> arc, int depth)
throws IOException {
if (FST.targetHasArcs(arc)) {
int childCount = 0;
BytesReader fstReader = fst.getBytesReader();
- for (arc = fst.readFirstTargetArc(arc, arc, fstReader);;
+ for (arc = fst.readFirstTargetArc(arc, arc, fstReader);;
arc = fst.readNextArc(arc, fstReader), childCount++)
{
boolean expanded = fst.isExpandedTarget(arc, fstReader);
@@ -1068,7 +1081,7 @@ public class TestFSTs extends LuceneTest
assertEquals(
expanded,
- (depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE &&
+ (depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE &&
children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) ||
children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP);
if (arc.isLast()) break;
@@ -1123,7 +1136,7 @@ public class TestFSTs extends LuceneTest
Util.toDot(fst, w, false, false);
w.close();
//System.out.println(w.toString());
-
+
// check for accept state at label t
assertTrue(w.toString().indexOf("[label=\"t\" style=\"bold\"") != -1);
// check for accept state at label n
@@ -1171,7 +1184,7 @@ public class TestFSTs extends LuceneTest
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
Util.toDot(fst, w, false, false);
w.close();
-
+
checkStopNodes(fst, outputs);
// Make sure it still works after save/load:
@@ -1204,12 +1217,12 @@ public class TestFSTs extends LuceneTest
assertFalse(arc.isFinal());
assertEquals(42, arc.output.longValue());
}
-
+
static final Comparator<Long> minLongComparator = new Comparator<Long> () {
@Override
public int compare(Long left, Long right) {
return left.compareTo(right);
- }
+ }
};
public void testShortestPaths() throws Exception {
@@ -1225,32 +1238,82 @@ public class TestFSTs extends LuceneTest
//Util.toDot(fst, w, false, false);
//w.close();
- Util.MinResult<Long>[] r = Util.shortestPaths(fst,
+ Util.TopResults<Long> res = Util.shortestPaths(fst,
fst.getFirstArc(new FST.Arc<Long>()),
outputs.getNoOutput(),
minLongComparator,
3,
true);
- assertEquals(3, r.length);
+ assertTrue(res.isComplete);
+ assertEquals(3, res.topN.size());
+ assertEquals(Util.toIntsRef(new BytesRef("aac"), scratch), res.topN.get(0).input);
+ assertEquals(7L, res.topN.get(0).output.longValue());
+
+ assertEquals(Util.toIntsRef(new BytesRef("ax"), scratch), res.topN.get(1).input);
+ assertEquals(17L,res.topN.get(1).output.longValue());
- assertEquals(Util.toIntsRef(new BytesRef("aac"), scratch), r[0].input);
- assertEquals(7L, r[0].output.longValue());
+ assertEquals(Util.toIntsRef(new BytesRef("aab"), scratch), res.topN.get(2).input);
+ assertEquals(22L, res.topN.get(2).output.longValue());
+ }
+
+ public void testRejectNoLimits() throws IOException {
+ final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
+ final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
- assertEquals(Util.toIntsRef(new BytesRef("ax"), scratch), r[1].input);
- assertEquals(17L, r[1].output.longValue());
+ final IntsRef scratch = new IntsRef();
+ builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L);
+ builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L);
+ builder.add(Util.toIntsRef(new BytesRef("adcd"), scratch), 17L);
+ builder.add(Util.toIntsRef(new BytesRef("adcde"), scratch), 17L);
- assertEquals(Util.toIntsRef(new BytesRef("aab"), scratch), r[2].input);
- assertEquals(22L, r[2].output.longValue());
+ builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L);
+ final FST<Long> fst = builder.finish();
+ final AtomicInteger rejectCount = new AtomicInteger();
+ Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, 2, 6, minLongComparator) {
+ @Override
+ protected boolean acceptResult(IntsRef input, Long output) {
+ boolean accept = output.intValue() == 7;
+ if (!accept) {
+ rejectCount.incrementAndGet();
+ }
+ return accept;
+ }
+ };
+
+ searcher.addStartPaths(fst.getFirstArc(new FST.Arc<Long>()), outputs.getNoOutput(), true, new IntsRef());
+ Util.TopResults<Long> res = searcher.search();
+ assertEquals(rejectCount.get(), 4);
+ assertTrue(res.isComplete); // rejected(4) + topN(2) <= maxQueueSize(6)
+
+ assertEquals(1, res.topN.size());
+ assertEquals(Util.toIntsRef(new BytesRef("aac"), scratch), res.topN.get(0).input);
+ assertEquals(7L, res.topN.get(0).output.longValue());
+ rejectCount.set(0);
+ searcher = new Util.TopNSearcher<Long>(fst, 2, 5, minLongComparator) {
+ @Override
+ protected boolean acceptResult(IntsRef input, Long output) {
+ boolean accept = output.intValue() == 7;
+ if (!accept) {
+ rejectCount.incrementAndGet();
+ }
+ return accept;
+ }
+ };
+
+ searcher.addStartPaths(fst.getFirstArc(new FST.Arc<Long>()), outputs.getNoOutput(), true, new IntsRef());
+ res = searcher.search();
+ assertEquals(rejectCount.get(), 4);
+ assertFalse(res.isComplete); // rejected(4) + topN(2) > maxQueueSize(5)
}
-
+
// compares just the weight side of the pair
static final Comparator<Pair<Long,Long>> minPairWeightComparator = new Comparator<Pair<Long,Long>> () {
@Override
public int compare(Pair<Long,Long> left, Pair<Long,Long> right) {
return left.output1.compareTo(right.output1);
- }
+ }
};
-
+
/** like testShortestPaths, but uses pairoutputs so we have both a weight and an output */
public void testShortestPathsWFST() throws Exception {
@@ -1258,7 +1321,7 @@ public class TestFSTs extends LuceneTest
PositiveIntOutputs.getSingleton(), // weight
PositiveIntOutputs.getSingleton() // output
);
-
+
final Builder<Pair<Long,Long>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRef scratch = new IntsRef();
@@ -1270,38 +1333,39 @@ public class TestFSTs extends LuceneTest
//Util.toDot(fst, w, false, false);
//w.close();
- Util.MinResult<Pair<Long,Long>>[] r = Util.shortestPaths(fst,
+ Util.TopResults<Pair<Long,Long>> res = Util.shortestPaths(fst,
fst.getFirstArc(new FST.Arc<Pair<Long,Long>>()),
outputs.getNoOutput(),
minPairWeightComparator,
3,
true);
- assertEquals(3, r.length);
+ assertTrue(res.isComplete);
+ assertEquals(3, res.topN.size());
- assertEquals(Util.toIntsRef(new BytesRef("aac"), scratch), r[0].input);
- assertEquals(7L, r[0].output.output1.longValue()); // weight
- assertEquals(36L, r[0].output.output2.longValue()); // output
-
- assertEquals(Util.toIntsRef(new BytesRef("ax"), scratch), r[1].input);
- assertEquals(17L, r[1].output.output1.longValue()); // weight
- assertEquals(85L, r[1].output.output2.longValue()); // output
-
- assertEquals(Util.toIntsRef(new BytesRef("aab"), scratch), r[2].input);
- assertEquals(22L, r[2].output.output1.longValue()); // weight
- assertEquals(57L, r[2].output.output2.longValue()); // output
+ assertEquals(Util.toIntsRef(new BytesRef("aac"), scratch), res.topN.get(0).input);
+ assertEquals(7L, res.topN.get(0).output.output1.longValue()); // weight
+ assertEquals(36L, res.topN.get(0).output.output2.longValue()); // output
+
+ assertEquals(Util.toIntsRef(new BytesRef("ax"), scratch), res.topN.get(1).input);
+ assertEquals(17L, res.topN.get(1).output.output1.longValue()); // weight
+ assertEquals(85L, res.topN.get(1).output.output2.longValue()); // output
+
+ assertEquals(Util.toIntsRef(new BytesRef("aab"), scratch), res.topN.get(2).input);
+ assertEquals(22L, res.topN.get(2).output.output1.longValue()); // weight
+ assertEquals(57L, res.topN.get(2).output.output2.longValue()); // output
}
-
+
public void testShortestPathsRandom() throws Exception {
final Random random = random();
int numWords = atLeast(1000);
-
+
final TreeMap<String,Long> slowCompletor = new TreeMap<>();
final TreeSet<String> allPrefixes = new TreeSet<>();
-
+
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRef scratch = new IntsRef();
-
+
for (int i = 0; i < numWords; i++) {
String s;
while (true) {
@@ -1310,32 +1374,32 @@ public class TestFSTs extends LuceneTest
break;
}
}
-
+
for (int j = 1; j < s.length(); j++) {
allPrefixes.add(s.substring(0, j));
}
int weight = TestUtil.nextInt(random, 1, 100); // weights 1..100
slowCompletor.put(s, (long)weight);
}
-
+
for (Map.Entry<String,Long> e : slowCompletor.entrySet()) {
//System.out.println("add: " + e);
builder.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), e.getValue());
}
-
+
final FST<Long> fst = builder.finish();
//System.out.println("SAVE out.dot");
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false);
//w.close();
-
+
BytesReader reader = fst.getBytesReader();
-
+
//System.out.println("testing: " + allPrefixes.size() + " prefixes");
for (String prefix : allPrefixes) {
// 1. run prefix against fst, then complete by value
//System.out.println("TEST: " + prefix);
-
+
long prefixOutput = 0;
FST.Arc<Long> arc = fst.getFirstArc(new FST.Arc<Long>());
for(int idx=0;idx<prefix.length();idx++) {
@@ -1347,16 +1411,17 @@ public class TestFSTs extends LuceneTest
final int topN = TestUtil.nextInt(random, 1, 10);
- Util.MinResult<Long>[] r = Util.shortestPaths(fst, arc, fst.outputs.getNoOutput(), minLongComparator, topN, true);
+ Util.TopResults<Long> r = Util.shortestPaths(fst, arc, fst.outputs.getNoOutput(), minLongComparator, topN, true);
+ assertTrue(r.isComplete);
// 2. go thru whole treemap (slowCompletor) and check its actually the best suggestion
- final List<Util.MinResult<Long>> matches = new ArrayList<>();
+ final List<Result<Long>> matches = new ArrayList<>();
// TODO: could be faster... but its slowCompletor for a reason
for (Map.Entry<String,Long> e : slowCompletor.entrySet()) {
if (e.getKey().startsWith(prefix)) {
//System.out.println(" consider " + e.getKey());
- matches.add(new Util.MinResult<>(Util.toIntsRef(new BytesRef(e.getKey().substring(prefix.length())), new IntsRef()),
+ matches.add(new Result<>(Util.toIntsRef(new BytesRef(e.getKey().substring(prefix.length())), new IntsRef()),
e.getValue() - prefixOutput));
}
}
@@ -1367,24 +1432,24 @@ public class TestFSTs extends LuceneTest
matches.subList(topN, matches.size()).clear();
}
- assertEquals(matches.size(), r.length);
+ assertEquals(matches.size(), r.topN.size());
- for(int hit=0;hit<r.length;hit++) {
+ for(int hit=0;hit<r.topN.size();hit++) {
//System.out.println(" check hit " + hit);
- assertEquals(matches.get(hit).input, r[hit].input);
- assertEquals(matches.get(hit).output, r[hit].output);
+ assertEquals(matches.get(hit).input, r.topN.get(hit).input);
+ assertEquals(matches.get(hit).output, r.topN.get(hit).output);
}
}
}
- private static class TieBreakByInputComparator<T> implements Comparator<Util.MinResult<T>> {
+ private static class TieBreakByInputComparator<T> implements Comparator<Result<T>> {
private final Comparator<T> comparator;
public TieBreakByInputComparator(Comparator<T> comparator) {
this.comparator = comparator;
}
@Override
- public int compare(Util.MinResult<T> a, Util.MinResult<T> b) {
+ public int compare(Result<T> a, Result<T> b) {
int cmp = comparator.compare(a.output, b.output);
if (cmp == 0) {
return a.input.compareTo(b.input);
@@ -1404,21 +1469,21 @@ public class TestFSTs extends LuceneTest
this.b = b;
}
}
-
+
/** like testShortestPathsRandom, but uses pairoutputs so we have both a weight and an output */
public void testShortestPathsWFSTRandom() throws Exception {
int numWords = atLeast(1000);
-
+
final TreeMap<String,TwoLongs> slowCompletor = new TreeMap<>();
final TreeSet<String> allPrefixes = new TreeSet<>();
-
+
PairOutputs<Long,Long> outputs = new PairOutputs<>(
PositiveIntOutputs.getSingleton(), // weight
PositiveIntOutputs.getSingleton() // output
);
final Builder<Pair<Long,Long>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRef scratch = new IntsRef();
-
+
Random random = random();
for (int i = 0; i < numWords; i++) {
String s;
@@ -1428,7 +1493,7 @@ public class TestFSTs extends LuceneTest
break;
}
}
-
+
for (int j = 1; j < s.length(); j++) {
allPrefixes.add(s.substring(0, j));
}
@@ -1436,27 +1501,27 @@ public class TestFSTs extends LuceneTest
int output = TestUtil.nextInt(random, 0, 500); // outputs 0..500
slowCompletor.put(s, new TwoLongs(weight, output));
}
-
+
for (Map.Entry<String,TwoLongs> e : slowCompletor.entrySet()) {
//System.out.println("add: " + e);
long weight = e.getValue().a;
long output = e.getValue().b;
builder.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), outputs.newPair(weight, output));
}
-
+
final FST<Pair<Long,Long>> fst = builder.finish();
//System.out.println("SAVE out.dot");
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false);
//w.close();
-
+
BytesReader reader = fst.getBytesReader();
-
+
//System.out.println("testing: " + allPrefixes.size() + " prefixes");
for (String prefix : allPrefixes) {
// 1. run prefix against fst, then complete by value
//System.out.println("TEST: " + prefix);
-
+
Pair<Long,Long> prefixOutput = outputs.getNoOutput();
FST.Arc<Pair<Long,Long>> arc = fst.getFirstArc(new FST.Arc<Pair<Long,Long>>());
for(int idx=0;idx<prefix.length();idx++) {
@@ -1468,17 +1533,17 @@ public class TestFSTs extends LuceneTest
final int topN = TestUtil.nextInt(random, 1, 10);
- Util.MinResult<Pair<Long,Long>>[] r = Util.shortestPaths(fst, arc, fst.outputs.getNoOutput(), minPairWeightComparator, topN, true);
-
+ Util.TopResults<Pair<Long,Long>> r = Util.shortestPaths(fst, arc, fst.outputs.getNoOutput(), minPairWeightComparator, topN, true);
+ assertTrue(r.isComplete);
// 2. go thru whole treemap (slowCompletor) and check its actually the best suggestion
- final List<Util.MinResult<Pair<Long,Long>>> matches = new ArrayList<>();
+ final List<Result<Pair<Long,Long>>> matches = new ArrayList<>();
// TODO: could be faster... but its slowCompletor for a reason
for (Map.Entry<String,TwoLongs> e : slowCompletor.entrySet()) {
if (e.getKey().startsWith(prefix)) {
//System.out.println(" consider " + e.getKey());
- matches.add(new Util.MinResult<>(Util.toIntsRef(new BytesRef(e.getKey().substring(prefix.length())), new IntsRef()),
- outputs.newPair(e.getValue().a - prefixOutput.output1, e.getValue().b - prefixOutput.output2)));
+ matches.add(new Result<>(Util.toIntsRef(new BytesRef(e.getKey().substring(prefix.length())), new IntsRef()),
+ outputs.newPair(e.getValue().a - prefixOutput.output1, e.getValue().b - prefixOutput.output2)));
}
}
@@ -1488,12 +1553,12 @@ public class TestFSTs extends LuceneTest
matches.subList(topN, matches.size()).clear();
}
- assertEquals(matches.size(), r.length);
+ assertEquals(matches.size(), r.topN.size());
- for(int hit=0;hit<r.length;hit++) {
+ for(int hit=0;hit<r.topN.size();hit++) {
//System.out.println(" check hit " + hit);
- assertEquals(matches.get(hit).input, r[hit].input);
- assertEquals(matches.get(hit).output, r[hit].output);
+ assertEquals(matches.get(hit).input, r.topN.get(hit).input);
+ assertEquals(matches.get(hit).output, r.topN.get(hit).output);
}
}
}
@@ -1525,4 +1590,5 @@ public class TestFSTs extends LuceneTest
}
}
}
+
}
Modified: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java?rev=1576825&r1=1576824&r2=1576825&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (original)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java Wed Mar 12 17:20:47 2014
@@ -17,15 +17,6 @@ package org.apache.lucene.search.suggest
* limitations under the License.
*/
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
@@ -40,6 +31,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicOperations;
@@ -48,14 +40,23 @@ import org.apache.lucene.util.automaton.
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
-import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.FST;
-import org.apache.lucene.util.fst.PairOutputs.Pair;
+import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.PairOutputs;
+import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PositiveIntOutputs;
-import org.apache.lucene.util.fst.Util.MinResult;
import org.apache.lucene.util.fst.Util;
-import org.apache.lucene.util.OfflineSorter;
+import org.apache.lucene.util.fst.Util.Result;
+import org.apache.lucene.util.fst.Util.TopResults;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
/**
* Suggester that first analyzes the surface form, adds the
@@ -709,7 +710,8 @@ public class AnalyzingSuggester extends
}
}
- MinResult<Pair<Long,BytesRef>> completions[] = searcher.search();
+ TopResults<Pair<Long,BytesRef>> completions = searcher.search();
+ assert completions.isComplete;
// NOTE: this is rather inefficient: we enumerate
// every matching "exactly the same analyzed form"
@@ -723,7 +725,7 @@ public class AnalyzingSuggester extends
// seach: it's bounded by how many prefix start
// nodes we have and the
// maxSurfaceFormsPerAnalyzedForm:
- for(MinResult<Pair<Long,BytesRef>> completion : completions) {
+ for(Result<Pair<Long,BytesRef>> completion : completions) {
BytesRef output2 = completion.output.output2;
if (sameSurfaceForm(utf8Key, output2)) {
results.add(getLookupResult(completion.output.output1, output2, spare));
@@ -778,9 +780,10 @@ public class AnalyzingSuggester extends
searcher.addStartPaths(path.fstNode, path.output, true, path.input);
}
- MinResult<Pair<Long,BytesRef>> completions[] = searcher.search();
+ TopResults<Pair<Long,BytesRef>> completions = searcher.search();
+ assert completions.isComplete;
- for(MinResult<Pair<Long,BytesRef>> completion : completions) {
+ for(Result<Pair<Long,BytesRef>> completion : completions) {
LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);
Modified: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java?rev=1576825&r1=1576824&r2=1576825&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java (original)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java Wed Mar 12 17:20:47 2014
@@ -21,17 +21,6 @@ package org.apache.lucene.search.suggest
// - test w/ syns
// - add pruning of low-freq ngrams?
-import java.io.File;
-import java.io.IOException;
-//import java.io.PrintWriter;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Random;
-import java.util.Set;
-
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenStream;
@@ -64,17 +53,30 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
-import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.PositiveIntOutputs;
-import org.apache.lucene.util.fst.Util.MinResult;
import org.apache.lucene.util.fst.Util;
-import org.apache.lucene.util.OfflineSorter;
+import org.apache.lucene.util.fst.Util.Result;
+import org.apache.lucene.util.fst.Util.TopResults;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Random;
+import java.util.Set;
+
+//import java.io.PrintWriter;
/**
* Builds an ngram model from the text sent to {@link
@@ -611,7 +613,7 @@ public class FreeTextSuggester extends L
CharsRef spare = new CharsRef();
// complete top-N
- MinResult<Long> completions[] = null;
+ TopResults<Long> completions = null;
try {
// Because we store multiple models in one FST
@@ -658,6 +660,7 @@ public class FreeTextSuggester extends L
searcher.addStartPaths(arc, prefixOutput, true, new IntsRef());
completions = searcher.search();
+ assert completions.isComplete;
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
@@ -668,7 +671,7 @@ public class FreeTextSuggester extends L
//System.out.println(" " + completions.length + " completions");
nextCompletion:
- for (MinResult<Long> completion : completions) {
+ for (Result<Long> completion : completions) {
token.length = prefixLength;
// append suffix
Util.toBytesRef(completion.input, suffix);
Modified: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java?rev=1576825&r1=1576824&r2=1576825&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java (original)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java Wed Mar 12 17:20:47 2014
@@ -17,12 +17,6 @@ package org.apache.lucene.search.suggest
* limitations under the License.
*/
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
-
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.SortedInputIterator;
@@ -34,15 +28,22 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
-import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
-import org.apache.lucene.util.fst.Util.MinResult;
+import org.apache.lucene.util.fst.Util.Result;
+import org.apache.lucene.util.fst.Util.TopResults;
import org.apache.lucene.util.fst.Util;
-import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
/**
* Suggester based on a weighted FST: it first traverses the prefix,
@@ -176,15 +177,16 @@ public class WFSTCompletionLookup extend
}
// complete top-N
- MinResult<Long> completions[] = null;
+ TopResults<Long> completions = null;
try {
completions = Util.shortestPaths(fst, arc, prefixOutput, weightComparator, num, !exactFirst);
+ assert completions.isComplete;
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
BytesRef suffix = new BytesRef(8);
- for (MinResult<Long> completion : completions) {
+ for (Result<Long> completion : completions) {
scratch.length = prefixLength;
// append suffix
Util.toBytesRef(completion.input, suffix);