You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/01/30 01:14:40 UTC
svn commit: r1237509 [2/2] - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/
lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/
lucene/contrib/...
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/PairOutputs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/PairOutputs.java?rev=1237509&r1=1237508&r2=1237509&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/PairOutputs.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/PairOutputs.java Mon Jan 30 00:14:39 2012
@@ -38,7 +38,8 @@ public class PairOutputs<A,B> extends Ou
public final A output1;
public final B output2;
- public Pair(A output1, B output2) {
+ // use newPair
+ private Pair(A output1, B output2) {
this.output1 = output1;
this.output2 = output2;
}
@@ -66,35 +67,79 @@ public class PairOutputs<A,B> extends Ou
this.outputs2 = outputs2;
NO_OUTPUT = new Pair<A,B>(outputs1.getNoOutput(), outputs2.getNoOutput());
}
-
- public Pair<A,B> get(A output1, B output2) {
- if (output1 == outputs1.getNoOutput() && output2 == outputs2.getNoOutput()) {
+
+ /** Create a new Pair */
+ public Pair<A,B> newPair(A a, B b) {
+ if (a.equals(outputs1.getNoOutput())) {
+ a = outputs1.getNoOutput();
+ }
+ if (b.equals(outputs2.getNoOutput())) {
+ b = outputs2.getNoOutput();
+ }
+
+ if (a == outputs1.getNoOutput() && b == outputs2.getNoOutput()) {
return NO_OUTPUT;
} else {
- return new Pair<A,B>(output1, output2);
+ final Pair<A,B> p = new Pair<A,B>(a, b);
+ assert valid(p);
+ return p;
}
}
-
+
+ // for assert
+ private boolean valid(Pair<A,B> pair) {
+ final boolean noOutput1 = pair.output1.equals(outputs1.getNoOutput());
+ final boolean noOutput2 = pair.output2.equals(outputs2.getNoOutput());
+
+ if (noOutput1 && pair.output1 != outputs1.getNoOutput()) {
+ System.out.println("invalid0");
+ return false;
+ }
+
+ if (noOutput2 && pair.output2 != outputs2.getNoOutput()) {
+ System.out.println("invalid1");
+ return false;
+ }
+
+ if (noOutput1 && noOutput2) {
+ if (pair != NO_OUTPUT) {
+ System.out.println("invalid2");
+ return false;
+ } else {
+ return true;
+ }
+ } else {
+ return true;
+ }
+ }
+
@Override
public Pair<A,B> common(Pair<A,B> pair1, Pair<A,B> pair2) {
- return get(outputs1.common(pair1.output1, pair2.output1),
- outputs2.common(pair1.output2, pair2.output2));
+ assert valid(pair1);
+ assert valid(pair2);
+ return newPair(outputs1.common(pair1.output1, pair2.output1),
+ outputs2.common(pair1.output2, pair2.output2));
}
@Override
public Pair<A,B> subtract(Pair<A,B> output, Pair<A,B> inc) {
- return get(outputs1.subtract(output.output1, inc.output1),
- outputs2.subtract(output.output2, inc.output2));
+ assert valid(output);
+ assert valid(inc);
+ return newPair(outputs1.subtract(output.output1, inc.output1),
+ outputs2.subtract(output.output2, inc.output2));
}
@Override
public Pair<A,B> add(Pair<A,B> prefix, Pair<A,B> output) {
- return get(outputs1.add(prefix.output1, output.output1),
- outputs2.add(prefix.output2, output.output2));
+ assert valid(prefix);
+ assert valid(output);
+ return newPair(outputs1.add(prefix.output1, output.output1),
+ outputs2.add(prefix.output2, output.output2));
}
@Override
public void write(Pair<A,B> output, DataOutput writer) throws IOException {
+ assert valid(output);
outputs1.write(output.output1, writer);
outputs2.write(output.output2, writer);
}
@@ -103,7 +148,7 @@ public class PairOutputs<A,B> extends Ou
public Pair<A,B> read(DataInput in) throws IOException {
A output1 = outputs1.read(in);
B output2 = outputs2.read(in);
- return get(output1, output2);
+ return newPair(output1, output2);
}
@Override
@@ -113,6 +158,12 @@ public class PairOutputs<A,B> extends Ou
@Override
public String outputToString(Pair<A,B> output) {
+ assert valid(output);
return "<pair:" + outputs1.outputToString(output.output1) + "," + outputs2.outputToString(output.output2) + ">";
}
+
+ @Override
+ public String toString() {
+ return "PairOutputs<" + outputs1 + "," + outputs2 + ">";
+ }
}
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java?rev=1237509&r1=1237508&r2=1237509&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java Mon Jan 30 00:14:39 2012
@@ -25,10 +25,7 @@ import org.apache.lucene.store.DataOutpu
/**
* Output is a long, for each input term. NOTE: the
* resulting FST is not guaranteed to be minimal! See
- * {@link Builder}. You must use {@link #get} to obtain the
- * output for a given long value -- do not use autoboxing
- * nor create your own Long instance (the value 0
- * must map to the {@link #getNoOutput} singleton).
+ * {@link Builder}.
*
* @lucene.experimental
*/
@@ -50,14 +47,6 @@ public final class PositiveIntOutputs ex
return doShare ? singletonShare : singletonNoShare;
}
- public Long get(long v) {
- if (v == 0) {
- return NO_OUTPUT;
- } else {
- return Long.valueOf(v);
- }
- }
-
@Override
public Long common(Long output1, Long output2) {
assert valid(output1);
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Util.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Util.java?rev=1237509&r1=1237508&r2=1237509&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Util.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Util.java Mon Jan 30 00:14:39 2012
@@ -37,23 +37,21 @@ public final class Util {
// TODO: would be nice not to alloc this on every lookup
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
+ final FST.BytesReader fstReader = fst.getBytesReader(0);
+
// Accumulate output as we go
- final T NO_OUTPUT = fst.outputs.getNoOutput();
- T output = NO_OUTPUT;
+ T output = fst.outputs.getNoOutput();
for(int i=0;i<input.length;i++) {
- if (fst.findTargetArc(input.ints[input.offset + i], arc, arc) == null) {
+ if (fst.findTargetArc(input.ints[input.offset + i], arc, arc, fstReader) == null) {
return null;
- } else if (arc.output != NO_OUTPUT) {
- output = fst.outputs.add(output, arc.output);
}
+ output = fst.outputs.add(output, arc.output);
}
- if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
- return null;
- } else if (arc.output != NO_OUTPUT) {
- return fst.outputs.add(output, arc.output);
+ if (arc.isFinal()) {
+ return fst.outputs.add(output, arc.nextFinalOutput);
} else {
- return output;
+ return null;
}
}
@@ -64,26 +62,24 @@ public final class Util {
public static<T> T get(FST<T> fst, BytesRef input) throws IOException {
assert fst.inputType == FST.INPUT_TYPE.BYTE1;
+ final FST.BytesReader fstReader = fst.getBytesReader(0);
+
// TODO: would be nice not to alloc this on every lookup
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
// Accumulate output as we go
- final T NO_OUTPUT = fst.outputs.getNoOutput();
- T output = NO_OUTPUT;
+ T output = fst.outputs.getNoOutput();
for(int i=0;i<input.length;i++) {
- if (fst.findTargetArc(input.bytes[i+input.offset] & 0xFF, arc, arc) == null) {
+ if (fst.findTargetArc(input.bytes[i+input.offset] & 0xFF, arc, arc, fstReader) == null) {
return null;
- } else if (arc.output != NO_OUTPUT) {
- output = fst.outputs.add(output, arc.output);
}
+ output = fst.outputs.add(output, arc.output);
}
- if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
- return null;
- } else if (arc.output != NO_OUTPUT) {
- return fst.outputs.add(output, arc.output);
+ if (arc.isFinal()) {
+ return fst.outputs.add(output, arc.nextFinalOutput);
} else {
- return output;
+ return null;
}
}
@@ -142,7 +138,7 @@ public final class Util {
result.grow(1+upto);
}
- fst.readFirstRealArc(arc.target, arc, in);
+ fst.readFirstRealTargetArc(arc.target, arc, in);
FST.Arc<Long> prevArc = null;
@@ -238,6 +234,7 @@ public final class Util {
// A queue of transitions to consider when processing the next level.
final List<FST.Arc<T>> nextLevelQueue = new ArrayList<FST.Arc<T>>();
nextLevelQueue.add(startArc);
+ //System.out.println("toDot: startArc: " + startArc);
// A list of states on the same level (for ranking).
final List<Integer> sameLevelStates = new ArrayList<Integer>();
@@ -289,8 +286,11 @@ public final class Util {
int level = 0;
+ final FST.BytesReader r = fst.getBytesReader(0);
+
while (!nextLevelQueue.isEmpty()) {
// we could double buffer here, but it doesn't matter probably.
+ //System.out.println("next level=" + level);
thisLevelQueue.addAll(nextLevelQueue);
nextLevelQueue.clear();
@@ -298,19 +298,19 @@ public final class Util {
out.write("\n // Transitions and states at level: " + level + "\n");
while (!thisLevelQueue.isEmpty()) {
final FST.Arc<T> arc = thisLevelQueue.remove(thisLevelQueue.size() - 1);
+ //System.out.println(" pop: " + arc);
if (fst.targetHasArcs(arc)) {
- // scan all arcs
+ // scan all target arcs
+ //System.out.println(" readFirstTarget...");
final int node = arc.target;
- fst.readFirstTargetArc(arc, arc);
- if (arc.label == FST.END_LABEL) {
- // Skip it -- prior recursion took this into account already
- assert !arc.isLast();
- fst.readNextArc(arc);
- }
+ fst.readFirstRealTargetArc(arc.target, arc, r);
+
+ //System.out.println(" firstTarget: " + arc);
while (true) {
+ //System.out.println(" cycle arc=" + arc);
// Emit the unseen state and add it to the queue for the next level.
if (arc.target >= 0 && !seen.get(arc.target)) {
@@ -329,7 +329,7 @@ public final class Util {
if (fst.isExpandedTarget(arc)) {
stateColor = expandedNodeColor;
} else {
- stateColor = null;
+ stateColor = null;
}
final String finalOutput;
@@ -339,7 +339,9 @@ public final class Util {
finalOutput = "";
}
- emitDotState(out, Integer.toString(arc.target), arc.isFinal() ? finalStateShape : stateShape, stateColor, finalOutput);
+ emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, finalOutput);
+ // To see the node address, use this instead:
+ //emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, String.valueOf(arc.target));
seen.set(arc.target);
nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc));
sameLevelStates.add(arc.target);
@@ -362,14 +364,22 @@ public final class Util {
outs = outs + "/[" + fst.outputs.outputToString(arc.nextFinalOutput) + "]";
}
+ final String arcColor;
+ if (arc.flag(FST.BIT_TARGET_NEXT)) {
+ arcColor = "red";
+ } else {
+ arcColor = "black";
+ }
+
assert arc.label != FST.END_LABEL;
- out.write(" " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"]\n");
+ out.write(" " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"" + (arc.isFinal() ? " style=\"bold\"" : "" ) + " color=\"" + arcColor + "\"]\n");
// Break the loop if we're on the last arc of this state.
if (arc.isLast()) {
+ //System.out.println(" break");
break;
}
- fst.readNextArc(arc);
+ fst.readNextRealArc(arc, r);
}
}
}
Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java?rev=1237509&r1=1237508&r2=1237509&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java Mon Jan 30 00:14:39 2012
@@ -80,11 +80,11 @@ public class TestFSTs extends LuceneTest
return br;
}
- private static IntsRef toIntsRef(String s, int inputMode) {
+ static IntsRef toIntsRef(String s, int inputMode) {
return toIntsRef(s, inputMode, new IntsRef(10));
}
- private static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) {
+ static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) {
if (inputMode == 0) {
// utf8
return toIntsRef(new BytesRef(s), ir);
@@ -94,7 +94,7 @@ public class TestFSTs extends LuceneTest
}
}
- private static IntsRef toIntsRefUTF32(String s, IntsRef ir) {
+ static IntsRef toIntsRefUTF32(String s, IntsRef ir) {
final int charLength = s.length();
int charIdx = 0;
int intIdx = 0;
@@ -111,7 +111,7 @@ public class TestFSTs extends LuceneTest
return ir;
}
- private static IntsRef toIntsRef(BytesRef br, IntsRef ir) {
+ static IntsRef toIntsRef(BytesRef br, IntsRef ir) {
if (br.length > ir.ints.length) {
ir.grow(br.length);
}
@@ -163,7 +163,7 @@ public class TestFSTs extends LuceneTest
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms2.length);
for(int idx=0;idx<terms2.length;idx++) {
- pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], outputs.get(idx)));
+ pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], (long) idx));
}
final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest(0, 0, false);
assertNotNull(fst);
@@ -221,7 +221,7 @@ public class TestFSTs extends LuceneTest
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
for(int idx=0;idx<terms.length;idx++) {
- pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(idx)));
+ pairs.add(new FSTTester.InputOutput<Long>(terms[idx], (long) idx));
}
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest();
}
@@ -235,7 +235,7 @@ public class TestFSTs extends LuceneTest
for(int idx=0;idx<terms.length;idx++) {
final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000);
lastOutput = value;
- pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(value)));
+ pairs.add(new FSTTester.InputOutput<Long>(terms[idx], value));
}
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, doShare).doTest();
}
@@ -245,7 +245,7 @@ public class TestFSTs extends LuceneTest
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
for(int idx=0;idx<terms.length;idx++) {
- pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(random.nextLong()) & Long.MAX_VALUE));
+ pairs.add(new FSTTester.InputOutput<Long>(terms[idx], random.nextLong() & Long.MAX_VALUE));
}
new FSTTester<Long>(random, dir, inputMode, pairs, outputs, false).doTest();
}
@@ -261,8 +261,7 @@ public class TestFSTs extends LuceneTest
final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000);
lastOutput = value;
pairs.add(new FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>(terms[idx],
- outputs.get(o1.get(idx),
- o2.get(value))));
+ outputs.newPair((long) idx, value)));
}
new FSTTester<PairOutputs.Pair<Long,Long>>(random, dir, inputMode, pairs, outputs, false).doTest();
}
@@ -384,6 +383,7 @@ public class TestFSTs extends LuceneTest
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
final T NO_OUTPUT = fst.outputs.getNoOutput();
T output = NO_OUTPUT;
+ final FST.BytesReader fstReader = fst.getBytesReader(0);
for(int i=0;i<=term.length;i++) {
final int label;
@@ -392,8 +392,9 @@ public class TestFSTs extends LuceneTest
} else {
label = term.ints[term.offset+i];
}
- //System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
- if (fst.findTargetArc(label, arc, arc) == null) {
+ // System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
+ if (fst.findTargetArc(label, arc, arc, fstReader) == null) {
+ // System.out.println(" not found");
if (prefixLength != null) {
prefixLength[0] = i;
return output;
@@ -453,15 +454,19 @@ public class TestFSTs extends LuceneTest
FST<T> doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException {
if (VERBOSE) {
- System.out.println("TEST: prune1=" + prune1 + " prune2=" + prune2);
+ System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2);
}
+ final boolean willRewrite = random.nextBoolean();
+
final Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
prune1, prune2,
prune1==0 && prune2==0,
allowRandomSuffixSharing ? random.nextBoolean() : true,
allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
- outputs, null);
+ outputs,
+ null,
+ willRewrite);
for(InputOutput<T> pair : pairs) {
if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) {
@@ -476,7 +481,7 @@ public class TestFSTs extends LuceneTest
}
FST<T> fst = builder.finish();
- if (random.nextBoolean() && fst != null) {
+ if (random.nextBoolean() && fst != null && !willRewrite) {
IndexOutput out = dir.createOutput("fst.bin");
fst.save(out);
out.close();
@@ -510,6 +515,21 @@ public class TestFSTs extends LuceneTest
verifyPruned(inputMode, fst, prune1, prune2);
}
+ if (willRewrite && fst != null) {
+ if (VERBOSE) {
+ System.out.println("TEST: now rewrite");
+ }
+ final FST<T> packed = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000));
+ if (VERBOSE) {
+ System.out.println("TEST: now verify packed FST");
+ }
+ if (prune1 == 0 && prune2 == 0) {
+ verifyUnPruned(inputMode, packed);
+ } else {
+ verifyPruned(inputMode, packed, prune1, prune2);
+ }
+ }
+
return fst;
}
@@ -626,7 +646,7 @@ public class TestFSTs extends LuceneTest
num = atLeast(100);
for(int iter=0;iter<num;iter++) {
if (VERBOSE) {
- System.out.println("TEST: iter=" + iter);
+ System.out.println(" iter=" + iter);
}
if (random.nextBoolean()) {
// seek to term that doesn't exist:
@@ -843,7 +863,15 @@ public class TestFSTs extends LuceneTest
prefixes.put(IntsRef.deepCopyOf(scratch), cmo);
} else {
cmo.count++;
- cmo.output = outputs.common(cmo.output, pair.output);
+ T output1 = cmo.output;
+ if (output1.equals(outputs.getNoOutput())) {
+ output1 = outputs.getNoOutput();
+ }
+ T output2 = pair.output;
+ if (output2.equals(outputs.getNoOutput())) {
+ output2 = outputs.getNoOutput();
+ }
+ cmo.output = outputs.common(output1, output2);
}
if (idx == pair.input.length) {
cmo.isFinal = true;
@@ -971,7 +999,7 @@ public class TestFSTs extends LuceneTest
public void testRandomWords() throws IOException {
testRandomWords(1000, atLeast(2));
- //testRandomWords(20, 100);
+ //testRandomWords(100, 1);
}
String inputModeToString(int mode) {
@@ -1054,50 +1082,6 @@ public class TestFSTs extends LuceneTest
return new String(chars);
}
- // NOTE: this test shows a case where our current builder
- // fails to produce minimal FST:
- /*
- public void test3() throws Exception {
- final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
- Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
- IntsRef scratchIntsRef = new IntsRef();
- builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.get(0));
- builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), 1L);
- builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), 2L);
- final FST<Long> fst = builder.finish();
- //System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount());
- // NOTE: we produce 7 nodes today
- assertEquals(6, fst.getNodeCount());
- // NOTE: we produce 8 arcs today
- assertEquals(7, fst.getNodeCount());
- //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
- //Util.toDot(fst, w, false, false);
- //w.close();
- }
- */
-
- // NOTE: this test shows a case where our current builder
- // fails to produce minimal FST:
- /*
- public void test4() throws Exception {
- final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
- Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs);
- IntsRef scratchIntsRef = new IntsRef();
- builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.getNoOutput());
- builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), new BytesRef("1"));
- builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), new BytesRef("11"));
- final FST<BytesRef> fst = builder.finish();
- //System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount());
- // NOTE: we produce 7 nodes today
- assertEquals(6, fst.getNodeCount());
- // NOTE: we produce 8 arcs today
- assertEquals(7, fst.getNodeCount());
- //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
- //Util.toDot(fst, w, false, false);
- //w.close();
- }
- */
-
// Build FST for all unique terms in the test line docs
// file, up until a time limit
public void testRealTerms() throws Exception {
@@ -1126,7 +1110,9 @@ public class TestFSTs extends LuceneTest
IndexReader r = IndexReader.open(writer, true);
writer.close();
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
- Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, outputs);
+
+ final boolean doRewrite = random.nextBoolean();
+ Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doRewrite);
boolean storeOrd = false;
if (VERBOSE) {
@@ -1167,61 +1153,72 @@ public class TestFSTs extends LuceneTest
output = termEnum.docFreq();
}
//System.out.println("ADD: " + term.text() + " ch[0]=" + (term.text().length() == 0 ? -1 : term.text().charAt(0)));
- builder.add(toIntsRef(term.text()), outputs.get(output));
+ builder.add(toIntsRef(term.text()), (long) output);
ord++;
if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
System.out.println(ord + " terms...");
}
termEnum.next();
}
- final FST<Long> fst = builder.finish();
+
+ FST<Long> fst = builder.finish();
if (VERBOSE) {
System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs;" + " " + fst.sizeInBytes() + " bytes");
}
if (ord > 0) {
- // Now confirm BytesRefFSTEnum and TermEnum act the
- // same:
- final IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<Long>(fst);
- int num = atLeast(1000);
- for(int iter=0;iter<num;iter++) {
- final String randomTerm = getRandomString();
-
- if (VERBOSE) {
- System.out.println("TEST: seek " + randomTerm + " ch[0]=" + (randomTerm.length() == 0 ? -1 : randomTerm.charAt(0)));
+ for(int rewriteIter=0;rewriteIter<2;rewriteIter++) {
+ if (rewriteIter == 1) {
+ if (doRewrite) {
+ // Verify again, with packed FST:
+ fst = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000));
+ } else {
+ break;
+ }
}
+ // Now confirm BytesRefFSTEnum and TermsEnum act the
+ // same:
+ final IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<Long>(fst);
+ int num = atLeast(1000);
+ for(int iter=0;iter<num;iter++) {
+ final String randomTerm = getRandomString();
+
+ if (VERBOSE) {
+ System.out.println("TEST: seek " + randomTerm + " ch[0]=" + (randomTerm.length() == 0 ? -1 : randomTerm.charAt(0)));
+ }
- termEnum = r.terms(new Term("body", randomTerm));
- final IntsRefFSTEnum.InputOutput fstSeekResult = fstEnum.seekCeil(toIntsRef(randomTerm));
+ termEnum = r.terms(new Term("body", randomTerm));
+ final IntsRefFSTEnum.InputOutput fstSeekResult = fstEnum.seekCeil(toIntsRef(randomTerm));
- if (termEnum.term() == null || !"body".equals(termEnum.term().field())) {
- assertNull("got " + (fstSeekResult == null ? "null" : toString(fstSeekResult.input) + " but expected null"), fstSeekResult);
- } else {
- assertSame(termEnum, fstEnum, storeOrd);
- for(int nextIter=0;nextIter<10;nextIter++) {
- if (VERBOSE) {
- System.out.println("TEST: next");
- //if (storeOrd) {
- //System.out.println(" ord=" + termEnum.ord());
- //}
- }
- termEnum.next();
- if (termEnum.term() != null && "body".equals(termEnum.term().field())) {
- if (VERBOSE) {
- System.out.println(" term=" + termEnum.term());
- }
- assertNotNull(fstEnum.next());
- assertSame(termEnum, fstEnum, storeOrd);
- } else {
+ if (termEnum.term() == null || !"body".equals(termEnum.term().field())) {
+ assertNull("got " + (fstSeekResult == null ? "null" : toString(fstSeekResult.input) + " but expected null"), fstSeekResult);
+ } else {
+ assertSame(termEnum, fstEnum, storeOrd);
+ for(int nextIter=0;nextIter<10;nextIter++) {
if (VERBOSE) {
- System.out.println(" end!");
+ System.out.println("TEST: next");
+ //if (storeOrd) {
+ //System.out.println(" ord=" + termEnum.ord());
+ //}
}
- IntsRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
- if (nextResult != null) {
- System.out.println("expected null but got: input=" + toString(nextResult.input) + " output=" + outputs.outputToString(nextResult.output));
- fail();
+ termEnum.next();
+ if (termEnum.term() != null && "body".equals(termEnum.term().field())) {
+ if (VERBOSE) {
+ System.out.println(" term=" + termEnum.term());
+ }
+ assertNotNull(fstEnum.next());
+ assertSame(termEnum, fstEnum, storeOrd);
+ } else {
+ if (VERBOSE) {
+ System.out.println(" end!");
+ }
+ IntsRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
+ if (nextResult != null) {
+ System.out.println("expected null but got: input=" + toString(nextResult.input) + " output=" + outputs.outputToString(nextResult.output));
+ fail();
+ }
+ break;
}
- break;
}
}
}
@@ -1257,14 +1254,17 @@ public class TestFSTs extends LuceneTest
private int inputMode;
private final Outputs<T> outputs;
private final Builder<T> builder;
+ private final boolean doPack;
- public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs<T> outputs) {
+ public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean doPack, boolean noArcArrays) {
this.dirOut = dirOut;
this.wordsFileIn = wordsFileIn;
this.inputMode = inputMode;
this.outputs = outputs;
-
- builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, null);
+ this.doPack = doPack;
+
+ builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, null, doPack);
+ builder.setAllowArrayArcs(!noArcArrays);
}
protected abstract T getOutput(IntsRef input, int ord) throws IOException;
@@ -1296,14 +1296,15 @@ public class TestFSTs extends LuceneTest
}
assert builder.getTermCount() == ord;
- final FST<T> fst = builder.finish();
+ FST<T> fst = builder.finish();
if (fst == null) {
System.out.println("FST was fully pruned!");
System.exit(0);
}
- if (dirOut == null)
+ if (dirOut == null) {
return;
+ }
System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes());
if (fst.getNodeCount() < 100) {
@@ -1313,11 +1314,15 @@ public class TestFSTs extends LuceneTest
System.out.println("Wrote FST to out.dot");
}
+ if (doPack) {
+ System.out.println("Pack...");
+ fst = fst.pack(4, 100000000);
+ System.out.println("New size " + fst.sizeInBytes() + " bytes");
+ }
Directory dir = FSDirectory.open(new File(dirOut));
IndexOutput out = dir.createOutput("fst.bin");
fst.save(out);
out.close();
-
System.out.println("Saved FST to fst.bin.");
if (!verify) {
@@ -1326,37 +1331,42 @@ public class TestFSTs extends LuceneTest
System.out.println("\nNow verify...");
- is.close();
- is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
-
- ord = 0;
- tStart = System.currentTimeMillis();
while(true) {
- String w = is.readLine();
- if (w == null) {
- break;
- }
- toIntsRef(w, inputMode, intsRef);
- T expected = getOutput(intsRef, ord);
- T actual = Util.get(fst, intsRef);
- if (actual == null) {
- throw new RuntimeException("unexpected null output on input=" + w);
- }
- if (!actual.equals(expected)) {
- throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w);
- }
+ is.close();
+ is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
- ord++;
- if (ord % 500000 == 0) {
- System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "...");
- }
- if (ord >= limit) {
- break;
+ ord = 0;
+ tStart = System.currentTimeMillis();
+ while(true) {
+ String w = is.readLine();
+ if (w == null) {
+ break;
+ }
+ toIntsRef(w, inputMode, intsRef);
+ T expected = getOutput(intsRef, ord);
+ T actual = Util.get(fst, intsRef);
+ if (actual == null) {
+ throw new RuntimeException("unexpected null output on input=" + w);
+ }
+ if (!actual.equals(expected)) {
+ throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w);
+ }
+
+ ord++;
+ if (ord % 500000 == 0) {
+ System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "...");
+ }
+ if (ord >= limit) {
+ break;
+ }
}
- }
- double totSec = ((System.currentTimeMillis() - tStart)/1000.0);
- System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)");
+ double totSec = ((System.currentTimeMillis() - tStart)/1000.0);
+ System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)");
+
+ // NOTE: comment out to profile lookup...
+ break;
+ }
} finally {
is.close();
@@ -1364,7 +1374,7 @@ public class TestFSTs extends LuceneTest
}
}
- // java -cp build/classes/test:build/classes/java:build/classes/test-framework:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out
+ // java -cp build/classes/test:build/classes/test-framework:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out
public static void main(String[] args) throws IOException {
int prune = 0;
int limit = Integer.MAX_VALUE;
@@ -1372,7 +1382,8 @@ public class TestFSTs extends LuceneTest
boolean storeOrds = false;
boolean storeDocFreqs = false;
boolean verify = true;
-
+ boolean doPack = false;
+ boolean noArcArrays = false;
String wordsFileIn = null;
String dirOut = null;
@@ -1390,10 +1401,14 @@ public class TestFSTs extends LuceneTest
inputMode = 1;
} else if (args[idx].equals("-docFreq")) {
storeDocFreqs = true;
+ } else if (args[idx].equals("-noArcArrays")) {
+ noArcArrays = true;
} else if (args[idx].equals("-ords")) {
storeOrds = true;
} else if (args[idx].equals("-noverify")) {
verify = false;
+ } else if (args[idx].equals("-pack")) {
+ doPack = true;
} else if (args[idx].startsWith("-")) {
System.err.println("Unrecognized option: " + args[idx]);
System.exit(-1);
@@ -1422,44 +1437,44 @@ public class TestFSTs extends LuceneTest
final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(true);
final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(false);
final PairOutputs<Long,Long> outputs = new PairOutputs<Long,Long>(o1, o2);
- new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs) {
+ new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
Random rand;
@Override
public PairOutputs.Pair<Long,Long> getOutput(IntsRef input, int ord) {
if (ord == 0) {
rand = new Random(17);
}
- return new PairOutputs.Pair<Long,Long>(o1.get(ord),
- o2.get(_TestUtil.nextInt(rand, 1, 5000)));
+ return outputs.newPair((long) ord,
+ (long) _TestUtil.nextInt(rand, 1, 5000));
}
}.run(limit, verify);
} else if (storeOrds) {
// Store only ords
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
- new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs) {
+ new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
@Override
public Long getOutput(IntsRef input, int ord) {
- return outputs.get(ord);
+ return (long) ord;
}
}.run(limit, verify);
} else if (storeDocFreqs) {
// Store only docFreq
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false);
- new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs) {
+ new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
Random rand;
@Override
public Long getOutput(IntsRef input, int ord) {
if (ord == 0) {
rand = new Random(17);
}
- return outputs.get(_TestUtil.nextInt(rand, 1, 5000));
+ return (long) _TestUtil.nextInt(rand, 1, 5000);
}
}.run(limit, verify);
} else {
// Store nothing
final NoOutputs outputs = NoOutputs.getSingleton();
final Object NO_OUTPUT = outputs.getNoOutput();
- new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs) {
+ new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
@Override
public Object getOutput(IntsRef input, int ord) {
return NO_OUTPUT;
@@ -1477,6 +1492,46 @@ public class TestFSTs extends LuceneTest
assertNull(fstEnum.seekCeil(new BytesRef("foobaz")));
}
+ /*
+ public void testTrivial() throws Exception {
+
+ // Get outputs -- passing true means FST will share
+ // (delta code) the outputs. This should result in
+ // smaller FST if the outputs grow monotonically. But
+ // if numbers are "random", false should give smaller
+ // final size:
+ final NoOutputs outputs = NoOutputs.getSingleton();
+
+ String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"};
+
+ final Builder<Object> builder = new Builder<Object>(FST.INPUT_TYPE.BYTE1,
+ 0, 0,
+ true,
+ true,
+ Integer.MAX_VALUE,
+ outputs,
+ null,
+ true);
+ Arrays.sort(strings);
+ final IntsRef scratch = new IntsRef();
+ for(String s : strings) {
+ builder.add(Util.toIntsRef(new BytesRef(s), scratch), outputs.getNoOutput());
+ }
+ final FST<Object> fst = builder.finish();
+ System.out.println("DOT before rewrite");
+ Writer w = new OutputStreamWriter(new FileOutputStream("/mnt/scratch/before.dot"));
+ Util.toDot(fst, w, false, false);
+ w.close();
+
+ final FST<Object> rewrite = new FST<Object>(fst, 1, 100);
+
+ System.out.println("DOT after rewrite");
+ w = new OutputStreamWriter(new FileOutputStream("/mnt/scratch/after.dot"));
+ Util.toDot(rewrite, w, false, false);
+ w.close();
+ }
+ */
+
public void testSimple() throws Exception {
// Get outputs -- passing true means FST will share
@@ -1493,9 +1548,9 @@ public class TestFSTs extends LuceneTest
final BytesRef b = new BytesRef("b");
final BytesRef c = new BytesRef("c");
- builder.add(Util.toIntsRef(a, new IntsRef()), outputs.get(17));
- builder.add(Util.toIntsRef(b, new IntsRef()), outputs.get(42));
- builder.add(Util.toIntsRef(c, new IntsRef()), outputs.get(13824324872317238L));
+ builder.add(Util.toIntsRef(a, new IntsRef()), 17L);
+ builder.add(Util.toIntsRef(b, new IntsRef()), 42L);
+ builder.add(Util.toIntsRef(c, new IntsRef()), 13824324872317238L);
final FST<Long> fst = builder.finish();
@@ -1797,11 +1852,11 @@ public class TestFSTs extends LuceneTest
public void testFinalOutputOnEndState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
- final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null);
- builder.add(Util.toUTF32("stat", new IntsRef()), outputs.get(17));
- builder.add(Util.toUTF32("station", new IntsRef()), outputs.get(10));
+ final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null, random.nextBoolean());
+ builder.add(Util.toUTF32("stat", new IntsRef()), 17L);
+ builder.add(Util.toUTF32("station", new IntsRef()), 10L);
final FST<Long> fst = builder.finish();
- //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
+ //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
StringWriter w = new StringWriter();
Util.toDot(fst, w, false, false);
w.close();
@@ -1811,8 +1866,8 @@ public class TestFSTs extends LuceneTest
public void testInternalFinalState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
-
- final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null);
+ final boolean willRewrite = random.nextBoolean();
+ final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, willRewrite);
builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRef()), outputs.getNoOutput());
builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRef()), outputs.getNoOutput());
final FST<Long> fst = builder.finish();
@@ -1821,17 +1876,23 @@ public class TestFSTs extends LuceneTest
Util.toDot(fst, w, false, false);
w.close();
//System.out.println(w.toString());
- assertTrue(w.toString().indexOf("6 [shape=doublecircle") != -1);
+ final String expected;
+ if (willRewrite) {
+ expected = "4 -> 3 [label=\"t\" style=\"bold\"";
+ } else {
+ expected = "8 -> 6 [label=\"t\" style=\"bold\"";
+ }
+ assertTrue(w.toString().indexOf(expected) != -1);
}
// Make sure raw FST can differentiate between final vs
// non-final end nodes
- public void testNonFinalStopNodes() throws Exception {
+ public void testNonFinalStopNode() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
final Long nothing = outputs.getNoOutput();
final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
- final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs);
+ final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs, false);
final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode<Long>(b, 0);
@@ -1841,8 +1902,8 @@ public class TestFSTs extends LuceneTest
node.isFinal = true;
rootNode.addArc('a', node);
final Builder.CompiledNode frozen = new Builder.CompiledNode();
- frozen.address = fst.addNode(node);
- rootNode.arcs[0].nextFinalOutput = outputs.get(17);
+ frozen.node = fst.addNode(node);
+ rootNode.arcs[0].nextFinalOutput = 17L;
rootNode.arcs[0].isFinal = true;
rootNode.arcs[0].output = nothing;
rootNode.arcs[0].target = frozen;
@@ -1853,13 +1914,18 @@ public class TestFSTs extends LuceneTest
final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<Long>(b, 0);
rootNode.addArc('b', node);
final Builder.CompiledNode frozen = new Builder.CompiledNode();
- frozen.address = fst.addNode(node);
+ frozen.node = fst.addNode(node);
rootNode.arcs[1].nextFinalOutput = nothing;
- rootNode.arcs[1].output = outputs.get(42);
+ rootNode.arcs[1].output = 42L;
rootNode.arcs[1].target = frozen;
}
fst.finish(fst.addNode(rootNode));
+
+ StringWriter w = new StringWriter();
+ //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
+ Util.toDot(fst, w, false, false);
+ w.close();
checkStopNodes(fst, outputs);