You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/01/31 23:30:49 UTC

svn commit: r1238832 - in /lucene/dev/trunk/lucene/src: java/org/apache/lucene/util/fst/FST.java java/org/apache/lucene/util/fst/Util.java test/org/apache/lucene/util/fst/TestFSTs.java

Author: mikemccand
Date: Tue Jan 31 22:30:49 2012
New Revision: 1238832

URL: http://svn.apache.org/viewvc?rev=1238832&view=rev
Log:
use binary search for array arcs in FST.getByOutput

Modified:
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/FST.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/Util.java
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/FST.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/FST.java?rev=1238832&r1=1238831&r2=1238832&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/FST.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/FST.java Tue Jan 31 22:30:49 2012
@@ -69,14 +69,14 @@ public final class FST<T> {
   public static enum INPUT_TYPE {BYTE1, BYTE2, BYTE4};
   public final INPUT_TYPE inputType;
 
-  private final static int BIT_FINAL_ARC = 1 << 0;
-  private final static int BIT_LAST_ARC = 1 << 1;
+  final static int BIT_FINAL_ARC = 1 << 0;
+  final static int BIT_LAST_ARC = 1 << 1;
   final static int BIT_TARGET_NEXT = 1 << 2;
 
   // TODO: we can free up a bit if we can nuke this:
-  private final static int BIT_STOP_NODE = 1 << 3;
-  private final static int BIT_ARC_HAS_OUTPUT = 1 << 4;
-  private final static int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5;
+  final static int BIT_STOP_NODE = 1 << 3;
+  final static int BIT_ARC_HAS_OUTPUT = 1 << 4;
+  final static int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5;
 
   // Arcs are stored as fixed-size (per entry) array, so
   // that we can find an arc using binary search.  We do
@@ -228,10 +228,10 @@ public final class FST<T> {
         b.append(" targetNext");
       }
       if (flag(BIT_ARC_HAS_OUTPUT)) {
-        b.append(" hasOutput");
+        b.append(" output=" + output);
       }
       if (flag(BIT_ARC_HAS_FINAL_OUTPUT)) {
-        b.append(" hasFinalOutput");
+        b.append(" nextFinalOutput=" + nextFinalOutput);
       }
       if (bytesPerArc != 0) {
         b.append(" arcArray(idx=" + arcIdx + " of " + numArcs + ")");

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/Util.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/Util.java?rev=1238832&r1=1238831&r2=1238832&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/Util.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/Util.java Tue Jan 31 22:30:49 2012
@@ -119,6 +119,7 @@ public final class Util {
     //System.out.println("reverseLookup output=" + targetOutput);
 
     while(true) {
+      //System.out.println("loop: output=" + output + " upto=" + upto + " arc=" + arc);
       if (arc.isFinal()) {
         final long finalOutput = output + arc.nextFinalOutput;
         //System.out.println("  isFinal finalOutput=" + finalOutput);
@@ -140,47 +141,91 @@ public final class Util {
         
         fst.readFirstRealTargetArc(arc.target, arc, in);
 
-        FST.Arc<Long> prevArc = null;
+        if (arc.bytesPerArc != 0) {
 
-        // TODO: we could do binary search here if node arcs
-        // are array'd:
-        while(true) {
-          //System.out.println("    cycle label=" + arc.label + " output=" + arc.output);
-
-          // This is the min output we'd hit if we follow
-          // this arc:
-          final long minArcOutput = output + arc.output;
-
-          if (minArcOutput == targetOutput) {
-            // Recurse on this arc:
-            //System.out.println("  match!  break");
-            output = minArcOutput;
-            result.ints[upto++] = arc.label;
-            break;
-          } else if (minArcOutput > targetOutput) {
-            if (prevArc == null) {
-              // Output doesn't exist
-              return null;
+          int low = 0;
+          int high = arc.numArcs-1;
+          int mid = 0;
+          //System.out.println("bsearch: numArcs=" + arc.numArcs + " target=" + targetOutput + " output=" + output);
+          boolean exact = false;
+          while (low <= high) {
+            mid = (low + high) >>> 1;
+            in.pos = arc.posArcsStart;
+            in.skip(arc.bytesPerArc*mid);
+            final byte flags = in.readByte();
+            fst.readLabel(in);
+            final long minArcOutput;
+            if ((flags & fst.BIT_ARC_HAS_OUTPUT) != 0) {
+              final long arcOutput = fst.outputs.read(in);
+              minArcOutput = output + arcOutput;
             } else {
-              // Recurse on previous arc:
-              arc.copyFrom(prevArc);
-              result.ints[upto++] = arc.label;
-              output += arc.output;
-              //System.out.println("    recurse prev label=" + (char) arc.label + " output=" + output);
+              minArcOutput = output;
+            }
+            //System.out.println("  cycle mid=" + mid + " label=" + (char) label + " output=" + minArcOutput);
+            if (minArcOutput == targetOutput) {
+              exact = true;
               break;
+            } else if (minArcOutput < targetOutput) {
+              low = mid + 1;
+            } else {
+              high = mid - 1;
             }
-          } else if (arc.isLast()) {
-            // Recurse on this arc:
-            output = minArcOutput;
-            //System.out.println("    recurse last label=" + (char) arc.label + " output=" + output);
-            result.ints[upto++] = arc.label;
-            break;
+          }
+
+          if (high == -1) {
+            return null;
+          } else if (exact) {
+            arc.arcIdx = mid-1;
           } else {
-            // Read next arc in this node:
-            prevArc = scratchArc;
-            prevArc.copyFrom(arc);
-            //System.out.println("      after copy label=" + (char) prevArc.label + " vs " + (char) arc.label);
-            fst.readNextRealArc(arc, in);
+            arc.arcIdx = low-2;
+          }
+
+          fst.readNextRealArc(arc, in);
+          result.ints[upto++] = arc.label;
+          output += arc.output;
+
+        } else {
+
+          FST.Arc<Long> prevArc = null;
+
+          while(true) {
+            //System.out.println("    cycle label=" + arc.label + " output=" + arc.output);
+
+            // This is the min output we'd hit if we follow
+            // this arc:
+            final long minArcOutput = output + arc.output;
+
+            if (minArcOutput == targetOutput) {
+              // Recurse on this arc:
+              //System.out.println("  match!  break");
+              output = minArcOutput;
+              result.ints[upto++] = arc.label;
+              break;
+            } else if (minArcOutput > targetOutput) {
+              if (prevArc == null) {
+                // Output doesn't exist
+                return null;
+              } else {
+                // Recurse on previous arc:
+                arc.copyFrom(prevArc);
+                result.ints[upto++] = arc.label;
+                output += arc.output;
+                //System.out.println("    recurse prev label=" + (char) arc.label + " output=" + output);
+                break;
+              }
+            } else if (arc.isLast()) {
+              // Recurse on this arc:
+              output = minArcOutput;
+              //System.out.println("    recurse last label=" + (char) arc.label + " output=" + output);
+              result.ints[upto++] = arc.label;
+              break;
+            } else {
+              // Read next arc in this node:
+              prevArc = scratchArc;
+              prevArc.copyFrom(arc);
+              //System.out.println("      after copy label=" + (char) prevArc.label + " vs " + (char) arc.label);
+              fst.readNextRealArc(arc, in);
+            }
           }
         }
       } else {

Modified: lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java?rev=1238832&r1=1238831&r2=1238832&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java (original)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java Tue Jan 31 22:30:49 2012
@@ -1259,7 +1259,7 @@ public class TestFSTs extends LuceneTest
 
     protected abstract T getOutput(IntsRef input, int ord) throws IOException;
 
-    public void run(int limit, boolean verify) throws IOException {
+    public void run(int limit, boolean verify, boolean verifyByOutput) throws IOException {
       BufferedReader is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
       try {
         final IntsRef intsRef = new IntsRef(10);
@@ -1323,38 +1323,56 @@ public class TestFSTs extends LuceneTest
         System.out.println("\nNow verify...");
 
         while(true) {
-          is.close();
-          is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
-
-          ord = 0;
-          tStart = System.currentTimeMillis();
-          while(true) {
-            String w = is.readLine();
-            if (w == null) {
-              break;
-            }
-            toIntsRef(w, inputMode, intsRef);
-            T expected = getOutput(intsRef, ord);
-            T actual = Util.get(fst, intsRef);
-            if (actual == null) {
-              throw new RuntimeException("unexpected null output on input=" + w);
-            }
-            if (!actual.equals(expected)) {
-              throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w);
+          for(int iter=0;iter<2;iter++) {
+            is.close();
+            is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
+
+            ord = 0;
+            tStart = System.currentTimeMillis();
+            while(true) {
+              String w = is.readLine();
+              if (w == null) {
+                break;
+              }
+              toIntsRef(w, inputMode, intsRef);
+              if (iter == 0) {
+                T expected = getOutput(intsRef, ord);
+                T actual = Util.get(fst, intsRef);
+                if (actual == null) {
+                  throw new RuntimeException("unexpected null output on input=" + w);
+                }
+                if (!actual.equals(expected)) {
+                  throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w);
+                }
+              } else {
+                // Get by output
+                final Long output = (Long) getOutput(intsRef, ord);
+                @SuppressWarnings("unchecked") final IntsRef actual = Util.getByOutput((FST<Long>) fst, output.longValue());
+                if (actual == null) {
+                  throw new RuntimeException("unexpected null input from output=" + output);
+                }
+                if (!actual.equals(intsRef)) {
+                  throw new RuntimeException("wrong input (got " + actual + " but expected " + intsRef + " from output=" + output);
+                }
+              }
+
+              ord++;
+              if (ord % 500000 == 0) {
+                System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "...");
+              }
+              if (ord >= limit) {
+                break;
+              }
             }
 
-            ord++;
-            if (ord % 500000 == 0) {
-              System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "...");
-            }
-            if (ord >= limit) {
+            double totSec = ((System.currentTimeMillis() - tStart)/1000.0);
+            System.out.println("Verify " + (iter == 1 ? "(by output) " : "") + "took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)");
+
+            if (!verifyByOutput) {
               break;
             }
           }
 
-          double totSec = ((System.currentTimeMillis() - tStart)/1000.0);
-          System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)");
-
           // NOTE: comment out to profile lookup...
           break;
         }
@@ -1438,7 +1456,7 @@ public class TestFSTs extends LuceneTest
           return outputs.newPair((long) ord,
                                  (long) _TestUtil.nextInt(rand, 1, 5000));
         }
-      }.run(limit, verify);
+      }.run(limit, verify, false);
     } else if (storeOrds) {
       // Store only ords
       final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
@@ -1447,7 +1465,7 @@ public class TestFSTs extends LuceneTest
         public Long getOutput(IntsRef input, int ord) {
           return (long) ord;
         }
-      }.run(limit, verify);
+      }.run(limit, verify, true);
     } else if (storeDocFreqs) {
       // Store only docFreq
       final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false);
@@ -1460,7 +1478,7 @@ public class TestFSTs extends LuceneTest
           }
           return (long) _TestUtil.nextInt(rand, 1, 5000);
         }
-      }.run(limit, verify);
+      }.run(limit, verify, false);
     } else {
       // Store nothing
       final NoOutputs outputs = NoOutputs.getSingleton();
@@ -1470,7 +1488,7 @@ public class TestFSTs extends LuceneTest
         public Object getOutput(IntsRef input, int ord) {
           return NO_OUTPUT;
         }
-      }.run(limit, verify);
+      }.run(limit, verify, false);
     }
   }