You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2011/02/23 10:35:24 UTC

svn commit: r1073653 - in /lucene/dev/trunk/lucene/src: java/org/apache/lucene/util/automaton/fst/ test/org/apache/lucene/util/automaton/fst/

Author: dweiss
Date: Wed Feb 23 09:35:24 2011
New Revision: 1073653

URL: http://svn.apache.org/viewvc?rev=1073653&view=rev
Log:
LUCENE-2933: Two-stage state expansion criterion for the FST (distance from root and child count).

Modified:
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java?rev=1073653&r1=1073652&r2=1073653&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java Wed Feb 23 09:35:24 2011
@@ -83,7 +83,7 @@ public class Builder<T> {
     @SuppressWarnings("unchecked") final UnCompiledNode<T>[] f = (UnCompiledNode<T>[]) new UnCompiledNode[10];
     frontier = f;
     for(int idx=0;idx<frontier.length;idx++) {
-      frontier[idx] = new UnCompiledNode<T>(this);
+      frontier[idx] = new UnCompiledNode<T>(this, idx);
     }
   }
 
@@ -201,7 +201,7 @@ public class Builder<T> {
           // undecided on whether to prune it.  later, it
           // will be either compiled or pruned, so we must
           // allocate a new node:
-          frontier[idx] = new UnCompiledNode<T>(this);
+          frontier[idx] = new UnCompiledNode<T>(this, idx);
         }
       }
     }
@@ -292,7 +292,7 @@ public class Builder<T> {
         new UnCompiledNode[ArrayUtil.oversize(input.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
       System.arraycopy(frontier, 0, next, 0, frontier.length);
       for(int idx=frontier.length;idx<next.length;idx++) {
-        next[idx] = new UnCompiledNode<T>(this);
+        next[idx] = new UnCompiledNode<T>(this, idx);
       }
       frontier = next;
     }
@@ -424,12 +424,22 @@ public class Builder<T> {
     boolean isFinal;
     int inputCount;
 
+    /** This node's depth, starting from the automaton root. */
+    final int depth;
+
+    /**
+     * @param depth
+     *          The node's depth starting from the automaton root. Needed for
+     *          LUCENE-2934 (node expansion based on conditions other than the
+     *          fanout size).
+     */
     @SuppressWarnings("unchecked")
-    public UnCompiledNode(Builder<T> owner) {
+    public UnCompiledNode(Builder<T> owner, int depth) {
       this.owner = owner;
       arcs = (Arc<T>[]) new Arc[1];
       arcs[0] = new Arc<T>();
       output = owner.NO_OUTPUT;
+      this.depth = depth;
     }
 
     public boolean isCompiled() {
@@ -441,6 +451,9 @@ public class Builder<T> {
       isFinal = false;
       output = owner.NO_OUTPUT;
       inputCount = 0;
+
+      // We don't clear the depth here because it never changes 
+      // for nodes on the frontier (even when reused).
     }
 
     public T getLastOutput(int labelToMatch) {

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java?rev=1073653&r1=1073652&r2=1073653&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java Wed Feb 23 09:35:24 2011
@@ -25,6 +25,7 @@ import org.apache.lucene.store.IndexInpu
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.automaton.fst.Builder.UnCompiledNode;
 
 /** Represents an FST using a compact byte[] format.
  *  <p> The format is similar to what's used by Morfologik
@@ -47,11 +48,21 @@ public class FST<T> {
   // this when number of arcs is > NUM_ARCS_ARRAY:
   private final static int BIT_ARCS_AS_FIXED_ARRAY = 1 << 6;
 
-  // If the node has >= this number of arcs, the arcs are
-  // stored as a fixed array.  Fixed array consumes more RAM
-  // but enables binary search on the arcs (instead of
-  // linear scan) on lookup by arc label:
-  private final static int NUM_ARCS_FIXED_ARRAY = 10;
+  /**
+   * @see #shouldExpand(UnCompiledNode)
+   */
+  final static int FIXED_ARRAY_SHALLOW_DISTANCE = 3; // 0 => only root node.
+
+  /**
+   * @see #shouldExpand(UnCompiledNode)
+   */
+  final static int FIXED_ARRAY_NUM_ARCS_SHALLOW = 5;
+
+  /**
+   * @see #shouldExpand(UnCompiledNode)
+   */
+  final static int FIXED_ARRAY_NUM_ARCS_DEEP = 10;
+
   private int[] bytesPerArc = new int[0];
 
   // Increment version to change it
@@ -315,7 +326,7 @@ public class FST<T> {
     int startAddress = writer.posWrite;
     //System.out.println("  startAddr=" + startAddress);
 
-    final boolean doFixedArray = node.numArcs >= NUM_ARCS_FIXED_ARRAY;
+    final boolean doFixedArray = shouldExpand(node);
     final int fixedArrayStart;
     if (doFixedArray) {
       if (bytesPerArc.length < node.numArcs) {
@@ -518,6 +529,23 @@ public class FST<T> {
     return readNextArc(arc);
   }
 
+  /**
+   * Checks if <code>arc</code>'s target state is in expanded (or vector) format. 
+   * 
+   * @return Returns <code>true</code> if <code>arc</code> points to a state in an
+   * expanded array format.
+   */
+  boolean isExpandedTarget(Arc<T> follow) throws IOException {
+    if (follow.isFinal()) {
+      return false;
+    } else {
+      final BytesReader in = getBytesReader(follow.target);
+      final byte b = in.readByte();
+      
+      return (b & BIT_ARCS_AS_FIXED_ARRAY) != 0;
+    }
+  }
+
   /** In-place read; returns the arc. */
   public Arc<T> readNextArc(Arc<T> arc) throws IOException {
     if (arc.label == -1) {
@@ -712,6 +740,26 @@ public class FST<T> {
   public int getArcWithOutputCount() {
     return arcWithOutputCount;
   }
+  
+  /**
+   * Nodes will be expanded if their depth (distance from the root node) is
+   * &lt;= this value and their number of arcs is &gt;=
+   * {@link #FIXED_ARRAY_NUM_ARCS_SHALLOW}.
+   * 
+   * <p>
+   * Fixed array consumes more RAM but enables binary search on the arcs
+   * (instead of a linear scan) on lookup by arc label.
+   * 
+   * @return <code>true</code> if <code>node</code> should be stored in an
+   *         expanded (array) form.
+   * 
+   * @see #FIXED_ARRAY_NUM_ARCS_DEEP
+   * @see Builder.UnCompiledNode#depth
+   */
+  private boolean shouldExpand(UnCompiledNode<T> node) {
+    return (node.depth <= FIXED_ARRAY_SHALLOW_DISTANCE && node.numArcs >= FIXED_ARRAY_NUM_ARCS_SHALLOW) || 
+            node.numArcs >= FIXED_ARRAY_NUM_ARCS_DEEP;
+  }
 
   // Non-static: writes to FST's byte[]
   class BytesWriter extends DataOutput {

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java?rev=1073653&r1=1073652&r2=1073653&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java Wed Feb 23 09:35:24 2011
@@ -189,6 +189,8 @@ public final class Util {
    */
   public static <T> void toDot(FST<T> fst, Writer out, boolean sameRank, boolean labelStates) 
     throws IOException {    
+    final String expandedNodeColor = "blue";
+
     // This is the start arc in the automaton (from the epsilon state to the first state 
     // with outgoing transitions.
     final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
@@ -219,7 +221,9 @@ public final class Util {
     }
 
     emitDotState(out, "initial", "point", "white", "");
-    emitDotState(out, Integer.toString(startArc.target), stateShape, null, "");
+    emitDotState(out, Integer.toString(startArc.target), stateShape, 
+        fst.isExpandedTarget(startArc) ? expandedNodeColor : null, 
+        "");
     out.write("  initial -> " + startArc.target + "\n");
 
     final T NO_OUTPUT = fst.outputs.getNoOutput();
@@ -243,7 +247,9 @@ public final class Util {
           while (true) {
             // Emit the unseen state and add it to the queue for the next level.
             if (arc.target >= 0 && !seen.get(arc.target)) {
-              emitDotState(out, Integer.toString(arc.target), stateShape, null, 
+              final boolean isExpanded = fst.isExpandedTarget(arc);
+              emitDotState(out, Integer.toString(arc.target), stateShape, 
+                  isExpanded ?  expandedNodeColor : null, 
                   labelStates ? Integer.toString(arc.target) : ""); 
               seen.set(arc.target);
               nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc));
@@ -285,10 +291,10 @@ public final class Util {
       }
       sameLevelStates.clear();                
     }
-    
+
     // Emit terminating state (always there anyway).
     out.write("  -1 [style=filled, color=black, shape=circle, label=\"\"]\n\n");
-    out.write("  {rank=sink; -1 } ");
+    out.write("  {rank=sink; -1 }\n");
     
     out.write("}\n");
     out.flush();

Modified: lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java?rev=1073653&r1=1073652&r2=1073653&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java (original)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java Wed Feb 23 09:35:24 2011
@@ -56,6 +56,7 @@ import org.apache.lucene.util.LineFileDo
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util._TestUtil;
+import org.apache.lucene.util.automaton.fst.FST.Arc;
 
 public class TestFSTs extends LuceneTestCase {
 
@@ -1322,4 +1323,85 @@ public class TestFSTs extends LuceneTest
     assertEquals(b, seekResult.input);
     assertEquals(42, (long) seekResult.output);
   }
+
+  /**
+   * Test state expansion (array format) on close-to-root states. Creates
+   * synthetic input that has one expanded state on each level.
+   * 
+   * @see "https://issues.apache.org/jira/browse/LUCENE-2933" 
+   */
+  public void testExpandedCloseToRoot() throws Exception {
+    class SyntheticData {
+      FST<Object> compile(String[] lines) throws IOException {
+        final NoOutputs outputs = NoOutputs.getSingleton();
+        final Object nothing = outputs.getNoOutput();
+        final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
+
+        int line = 0;
+        final BytesRef term = new BytesRef();
+        while (line < lines.length) {
+          String w = lines[line++];
+          if (w == null) {
+            break;
+          }
+          term.copy(w);
+          b.add(term, nothing);
+        }
+        
+        return b.finish();
+      }
+      
+      void generate(ArrayList<String> out, StringBuilder b, char from, char to,
+          int depth) {
+        if (depth == 0 || from == to) {
+          String seq = b.toString() + "_" + out.size() + "_end";
+          out.add(seq);
+        } else {
+          for (char c = from; c <= to; c++) {
+            b.append(c);
+            generate(out, b, from, c == to ? to : from, depth - 1);
+            b.deleteCharAt(b.length() - 1);
+          }
+        }
+      }
+
+      public int verifyStateAndBelow(FST<Object> fst, Arc<Object> arc, int depth) 
+        throws IOException {
+        if (fst.targetHasArcs(arc)) {
+          int childCount = 0;
+          for (arc = fst.readFirstTargetArc(arc, arc);; 
+               arc = fst.readNextArc(arc), childCount++)
+          {
+            boolean expanded = fst.isExpandedTarget(arc);
+            int children = verifyStateAndBelow(fst, new FST.Arc<Object>().copyFrom(arc), depth + 1);
+
+            assertEquals(
+                expanded,
+                (depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE && 
+                    children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) ||
+                 children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP);
+            if (arc.isLast()) break;
+          }
+
+          return childCount;
+        }
+        return 0;
+      }
+    }
+
+    // Sanity check.
+    assertTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP);
+    assertTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0);
+
+    SyntheticData s = new SyntheticData();
+
+    ArrayList<String> out = new ArrayList<String>();
+    StringBuilder b = new StringBuilder();
+    s.generate(out, b, 'a', 'i', 10);
+    String[] input = out.toArray(new String[out.size()]);
+    Arrays.sort(input);
+    FST<Object> fst = s.compile(input);
+    FST.Arc<Object> arc = fst.getFirstArc(new FST.Arc<Object>());
+    s.verifyStateAndBelow(fst, arc, 1);
+  }
 }