You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/01/31 23:30:49 UTC
svn commit: r1238832 - in /lucene/dev/trunk/lucene/src:
java/org/apache/lucene/util/fst/FST.java
java/org/apache/lucene/util/fst/Util.java
test/org/apache/lucene/util/fst/TestFSTs.java
Author: mikemccand
Date: Tue Jan 31 22:30:49 2012
New Revision: 1238832
URL: http://svn.apache.org/viewvc?rev=1238832&view=rev
Log:
use binary search for array arcs in FST.getByOutput
Modified:
lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/FST.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/Util.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/FST.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/FST.java?rev=1238832&r1=1238831&r2=1238832&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/FST.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/FST.java Tue Jan 31 22:30:49 2012
@@ -69,14 +69,14 @@ public final class FST<T> {
public static enum INPUT_TYPE {BYTE1, BYTE2, BYTE4};
public final INPUT_TYPE inputType;
- private final static int BIT_FINAL_ARC = 1 << 0;
- private final static int BIT_LAST_ARC = 1 << 1;
+ final static int BIT_FINAL_ARC = 1 << 0;
+ final static int BIT_LAST_ARC = 1 << 1;
final static int BIT_TARGET_NEXT = 1 << 2;
// TODO: we can free up a bit if we can nuke this:
- private final static int BIT_STOP_NODE = 1 << 3;
- private final static int BIT_ARC_HAS_OUTPUT = 1 << 4;
- private final static int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5;
+ final static int BIT_STOP_NODE = 1 << 3;
+ final static int BIT_ARC_HAS_OUTPUT = 1 << 4;
+ final static int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5;
// Arcs are stored as fixed-size (per entry) array, so
// that we can find an arc using binary search. We do
@@ -228,10 +228,10 @@ public final class FST<T> {
b.append(" targetNext");
}
if (flag(BIT_ARC_HAS_OUTPUT)) {
- b.append(" hasOutput");
+ b.append(" output=" + output);
}
if (flag(BIT_ARC_HAS_FINAL_OUTPUT)) {
- b.append(" hasFinalOutput");
+ b.append(" nextFinalOutput=" + nextFinalOutput);
}
if (bytesPerArc != 0) {
b.append(" arcArray(idx=" + arcIdx + " of " + numArcs + ")");
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/Util.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/Util.java?rev=1238832&r1=1238831&r2=1238832&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/Util.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/Util.java Tue Jan 31 22:30:49 2012
@@ -119,6 +119,7 @@ public final class Util {
//System.out.println("reverseLookup output=" + targetOutput);
while(true) {
+ //System.out.println("loop: output=" + output + " upto=" + upto + " arc=" + arc);
if (arc.isFinal()) {
final long finalOutput = output + arc.nextFinalOutput;
//System.out.println(" isFinal finalOutput=" + finalOutput);
@@ -140,47 +141,91 @@ public final class Util {
fst.readFirstRealTargetArc(arc.target, arc, in);
- FST.Arc<Long> prevArc = null;
+ if (arc.bytesPerArc != 0) {
- // TODO: we could do binary search here if node arcs
- // are array'd:
- while(true) {
- //System.out.println(" cycle label=" + arc.label + " output=" + arc.output);
-
- // This is the min output we'd hit if we follow
- // this arc:
- final long minArcOutput = output + arc.output;
-
- if (minArcOutput == targetOutput) {
- // Recurse on this arc:
- //System.out.println(" match! break");
- output = minArcOutput;
- result.ints[upto++] = arc.label;
- break;
- } else if (minArcOutput > targetOutput) {
- if (prevArc == null) {
- // Output doesn't exist
- return null;
+ int low = 0;
+ int high = arc.numArcs-1;
+ int mid = 0;
+ //System.out.println("bsearch: numArcs=" + arc.numArcs + " target=" + targetOutput + " output=" + output);
+ boolean exact = false;
+ while (low <= high) {
+ mid = (low + high) >>> 1;
+ in.pos = arc.posArcsStart;
+ in.skip(arc.bytesPerArc*mid);
+ final byte flags = in.readByte();
+ fst.readLabel(in);
+ final long minArcOutput;
+ if ((flags & fst.BIT_ARC_HAS_OUTPUT) != 0) {
+ final long arcOutput = fst.outputs.read(in);
+ minArcOutput = output + arcOutput;
} else {
- // Recurse on previous arc:
- arc.copyFrom(prevArc);
- result.ints[upto++] = arc.label;
- output += arc.output;
- //System.out.println(" recurse prev label=" + (char) arc.label + " output=" + output);
+ minArcOutput = output;
+ }
+ //System.out.println(" cycle mid=" + mid + " label=" + (char) label + " output=" + minArcOutput);
+ if (minArcOutput == targetOutput) {
+ exact = true;
break;
+ } else if (minArcOutput < targetOutput) {
+ low = mid + 1;
+ } else {
+ high = mid - 1;
}
- } else if (arc.isLast()) {
- // Recurse on this arc:
- output = minArcOutput;
- //System.out.println(" recurse last label=" + (char) arc.label + " output=" + output);
- result.ints[upto++] = arc.label;
- break;
+ }
+
+ if (high == -1) {
+ return null;
+ } else if (exact) {
+ arc.arcIdx = mid-1;
} else {
- // Read next arc in this node:
- prevArc = scratchArc;
- prevArc.copyFrom(arc);
- //System.out.println(" after copy label=" + (char) prevArc.label + " vs " + (char) arc.label);
- fst.readNextRealArc(arc, in);
+ arc.arcIdx = low-2;
+ }
+
+ fst.readNextRealArc(arc, in);
+ result.ints[upto++] = arc.label;
+ output += arc.output;
+
+ } else {
+
+ FST.Arc<Long> prevArc = null;
+
+ while(true) {
+ //System.out.println(" cycle label=" + arc.label + " output=" + arc.output);
+
+ // This is the min output we'd hit if we follow
+ // this arc:
+ final long minArcOutput = output + arc.output;
+
+ if (minArcOutput == targetOutput) {
+ // Recurse on this arc:
+ //System.out.println(" match! break");
+ output = minArcOutput;
+ result.ints[upto++] = arc.label;
+ break;
+ } else if (minArcOutput > targetOutput) {
+ if (prevArc == null) {
+ // Output doesn't exist
+ return null;
+ } else {
+ // Recurse on previous arc:
+ arc.copyFrom(prevArc);
+ result.ints[upto++] = arc.label;
+ output += arc.output;
+ //System.out.println(" recurse prev label=" + (char) arc.label + " output=" + output);
+ break;
+ }
+ } else if (arc.isLast()) {
+ // Recurse on this arc:
+ output = minArcOutput;
+ //System.out.println(" recurse last label=" + (char) arc.label + " output=" + output);
+ result.ints[upto++] = arc.label;
+ break;
+ } else {
+ // Read next arc in this node:
+ prevArc = scratchArc;
+ prevArc.copyFrom(arc);
+ //System.out.println(" after copy label=" + (char) prevArc.label + " vs " + (char) arc.label);
+ fst.readNextRealArc(arc, in);
+ }
}
}
} else {
Modified: lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java?rev=1238832&r1=1238831&r2=1238832&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java (original)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java Tue Jan 31 22:30:49 2012
@@ -1259,7 +1259,7 @@ public class TestFSTs extends LuceneTest
protected abstract T getOutput(IntsRef input, int ord) throws IOException;
- public void run(int limit, boolean verify) throws IOException {
+ public void run(int limit, boolean verify, boolean verifyByOutput) throws IOException {
BufferedReader is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
try {
final IntsRef intsRef = new IntsRef(10);
@@ -1323,38 +1323,56 @@ public class TestFSTs extends LuceneTest
System.out.println("\nNow verify...");
while(true) {
- is.close();
- is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
-
- ord = 0;
- tStart = System.currentTimeMillis();
- while(true) {
- String w = is.readLine();
- if (w == null) {
- break;
- }
- toIntsRef(w, inputMode, intsRef);
- T expected = getOutput(intsRef, ord);
- T actual = Util.get(fst, intsRef);
- if (actual == null) {
- throw new RuntimeException("unexpected null output on input=" + w);
- }
- if (!actual.equals(expected)) {
- throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w);
+ for(int iter=0;iter<2;iter++) {
+ is.close();
+ is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
+
+ ord = 0;
+ tStart = System.currentTimeMillis();
+ while(true) {
+ String w = is.readLine();
+ if (w == null) {
+ break;
+ }
+ toIntsRef(w, inputMode, intsRef);
+ if (iter == 0) {
+ T expected = getOutput(intsRef, ord);
+ T actual = Util.get(fst, intsRef);
+ if (actual == null) {
+ throw new RuntimeException("unexpected null output on input=" + w);
+ }
+ if (!actual.equals(expected)) {
+ throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w);
+ }
+ } else {
+ // Get by output
+ final Long output = (Long) getOutput(intsRef, ord);
+ @SuppressWarnings("unchecked") final IntsRef actual = Util.getByOutput((FST<Long>) fst, output.longValue());
+ if (actual == null) {
+ throw new RuntimeException("unexpected null input from output=" + output);
+ }
+ if (!actual.equals(intsRef)) {
+ throw new RuntimeException("wrong input (got " + actual + " but expected " + intsRef + " from output=" + output);
+ }
+ }
+
+ ord++;
+ if (ord % 500000 == 0) {
+ System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "...");
+ }
+ if (ord >= limit) {
+ break;
+ }
}
- ord++;
- if (ord % 500000 == 0) {
- System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "...");
- }
- if (ord >= limit) {
+ double totSec = ((System.currentTimeMillis() - tStart)/1000.0);
+ System.out.println("Verify " + (iter == 1 ? "(by output) " : "") + "took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)");
+
+ if (!verifyByOutput) {
break;
}
}
- double totSec = ((System.currentTimeMillis() - tStart)/1000.0);
- System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)");
-
// NOTE: comment out to profile lookup...
break;
}
@@ -1438,7 +1456,7 @@ public class TestFSTs extends LuceneTest
return outputs.newPair((long) ord,
(long) _TestUtil.nextInt(rand, 1, 5000));
}
- }.run(limit, verify);
+ }.run(limit, verify, false);
} else if (storeOrds) {
// Store only ords
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
@@ -1447,7 +1465,7 @@ public class TestFSTs extends LuceneTest
public Long getOutput(IntsRef input, int ord) {
return (long) ord;
}
- }.run(limit, verify);
+ }.run(limit, verify, true);
} else if (storeDocFreqs) {
// Store only docFreq
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false);
@@ -1460,7 +1478,7 @@ public class TestFSTs extends LuceneTest
}
return (long) _TestUtil.nextInt(rand, 1, 5000);
}
- }.run(limit, verify);
+ }.run(limit, verify, false);
} else {
// Store nothing
final NoOutputs outputs = NoOutputs.getSingleton();
@@ -1470,7 +1488,7 @@ public class TestFSTs extends LuceneTest
public Object getOutput(IntsRef input, int ord) {
return NO_OUTPUT;
}
- }.run(limit, verify);
+ }.run(limit, verify, false);
}
}