You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ha...@apache.org on 2013/07/24 04:40:33 UTC
svn commit: r1506385 - in /lucene/dev/branches/lucene3069/lucene/core/src: java/org/apache/lucene/codecs/temp/ java/org/apache/lucene/util/automaton/ java/org/apache/lucene/util/fst/ test/org/apache/lucene/index/

Author: han
Date: Wed Jul 24 02:40:33 2013
New Revision: 1506385

URL: http://svn.apache.org/r1506385
Log:
LUCENE-3069: support intersect operations

Modified:
    lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsReader.java
    lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsWriter.java
    lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempTermOutputs.java
    lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
    lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
    lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
    lucene/dev/branches/lucene3069/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java

Modified: lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsReader.java?rev=1506385&r1=1506384&r2=1506385&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsReader.java (original)
+++ lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsReader.java Wed Jul 24 02:40:33 2013
@@ -24,6 +24,7 @@ import java.util.ArrayList;
 import java.util.BitSet;
 import java.util.Collections;
 import java.util.Comparator;
+import java.util.Stack;
 import java.util.Iterator;
 import java.util.TreeMap;
 
@@ -42,6 +43,9 @@ import org.apache.lucene.index.TermsEnum
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
@@ -58,12 +62,10 @@ public class TempFSTTermsReader extends 
   final TreeMap<String, TermsReader> fields = new TreeMap<String, TermsReader>();
   final TempPostingsReaderBase postingsReader;
   final IndexInput in;
-  boolean DEBUG = false;
-  //String tmpname;
+  //static boolean DEBUG = false;
 
   public TempFSTTermsReader(SegmentReadState state, TempPostingsReaderBase postingsReader) throws IOException {
     final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempFSTTermsWriter.TERMS_EXTENSION);
-    //tmpname = termsFileName;
 
     this.postingsReader = postingsReader;
     this.in = state.directory.openInput(termsFileName, state.context);
@@ -79,7 +81,7 @@ public class TempFSTTermsReader extends 
       for (int i = 0; i < numFields; i++) {
         int fieldNumber = in.readVInt();
         FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
-        long numTerms = in.readVLong(); 
+        long numTerms = in.readVLong();
         long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong();
         long sumDocFreq = in.readVLong();
         int docCount = in.readVInt();
@@ -97,10 +99,9 @@ public class TempFSTTermsReader extends 
   }
 
   private int readHeader(IndexInput in) throws IOException {
-    return CodecUtil.checkHeader(in, 
-                                 TempFSTTermsWriter.TERMS_CODEC_NAME,
-                                 TempFSTTermsWriter.TERMS_VERSION_START,
-                                 TempFSTTermsWriter.TERMS_VERSION_CURRENT);
+    return CodecUtil.checkHeader(in, TempFSTTermsWriter.TERMS_CODEC_NAME,
+                                     TempFSTTermsWriter.TERMS_VERSION_START,
+                                     TempFSTTermsWriter.TERMS_VERSION_CURRENT);
   }
   private void seekDir(IndexInput in) throws IOException {
     in.seek(in.length() - 8);
@@ -108,15 +109,15 @@ public class TempFSTTermsReader extends 
   }
   private void checkFieldSummary(SegmentInfo info, TermsReader field, TermsReader previous) throws IOException {
     // #docs with field must be <= #docs
-    if (field.docCount < 0 || field.docCount > info.getDocCount()) { 
+    if (field.docCount < 0 || field.docCount > info.getDocCount()) {
       throw new CorruptIndexException("invalid docCount: " + field.docCount + " maxDoc: " + info.getDocCount() + " (resource=" + in + ")");
     }
     // #postings must be >= #docs with field
-    if (field.sumDocFreq < field.docCount) {  
+    if (field.sumDocFreq < field.docCount) {
       throw new CorruptIndexException("invalid sumDocFreq: " + field.sumDocFreq + " docCount: " + field.docCount + " (resource=" + in + ")");
     }
     // #positions must be >= #postings
-    if (field.sumTotalTermFreq != -1 && field.sumTotalTermFreq < field.sumDocFreq) { 
+    if (field.sumTotalTermFreq != -1 && field.sumTotalTermFreq < field.sumDocFreq) {
       throw new CorruptIndexException("invalid sumTotalTermFreq: " + field.sumTotalTermFreq + " sumDocFreq: " + field.sumDocFreq + " (resource=" + in + ")");
     }
     if (previous != null) {
@@ -166,24 +167,14 @@ public class TempFSTTermsReader extends 
       this.docCount = docCount;
       this.longsSize = longsSize;
       this.dict = new FST<TempTermOutputs.TempMetaData>(in, new TempTermOutputs(fieldInfo, longsSize));
-      //PrintWriter pw = new PrintWriter(new File("ohohoh."+tmpname+".xxx.txt"));
-      //Util.toDot(dict, pw, false, false);
-      //pw.close();
     }
 
-    // nocommit: implement intersect
-    // nocommit: why do we need this comparator overridden again and again?
     @Override
     public Comparator<BytesRef> getComparator() {
       return BytesRef.getUTF8SortedAsUnicodeComparator();
     }
 
     @Override
-    public TermsEnum iterator(TermsEnum reuse) throws IOException {
-      return new SegmentTermsEnum();
-    }
-
-    @Override
     public boolean hasOffsets() {
       return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
     }
@@ -192,7 +183,7 @@ public class TempFSTTermsReader extends 
     public boolean hasPositions() {
       return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
     }
-    
+
     @Override
     public boolean hasPayloads() {
       return fieldInfo.hasPayloads();
@@ -218,10 +209,18 @@ public class TempFSTTermsReader extends 
       return docCount;
     }
 
-    // Iterates through terms in this field
-    final class SegmentTermsEnum extends TermsEnum {
-      final BytesRefFSTEnum<TempTermOutputs.TempMetaData> fstEnum;
+    @Override
+    public TermsEnum iterator(TermsEnum reuse) throws IOException {
+      return new SegmentTermsEnum();
+    }
+
+    @Override
+    public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
+      return new IntersectTermsEnum(compiled, startTerm);
+    }
 
+    // Only wraps common operations for PBF interact
+    abstract class BaseTermsEnum extends TermsEnum {
       /* Current term, null when enum ends or unpositioned */
       BytesRef term;
 
@@ -232,19 +231,14 @@ public class TempFSTTermsReader extends 
       TempTermOutputs.TempMetaData meta;
       ByteArrayDataInput bytesReader;
 
-      /* True when current term's metadata is decoded */
-      boolean decoded;
-
-      /* True when current enum is 'positioned' by seekExact(TermState) */
-      boolean seekPending;
+      /** Decodes metadata into customized term state */
+      abstract void decodeMetaData() throws IOException;
 
-      SegmentTermsEnum() throws IOException {
-        this.fstEnum = new BytesRefFSTEnum<TempTermOutputs.TempMetaData>(dict);
+      BaseTermsEnum() throws IOException {
         this.state = postingsReader.newTermState();
         this.bytesReader = new ByteArrayDataInput();
         this.term = null;
-        this.decoded = false;
-        this.seekPending = false;
+        // NOTE: metadata will only be initialized in child class
       }
 
       @Override
@@ -262,7 +256,7 @@ public class TempFSTTermsReader extends 
       public BytesRef term() {
         return term;
       }
-      
+
       @Override
       public int docFreq() throws IOException {
         return state.docFreq;
@@ -273,8 +267,58 @@ public class TempFSTTermsReader extends 
         return state.totalTermFreq;
       }
 
+      @Override
+      public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
+        decodeMetaData();
+        return postingsReader.docs(fieldInfo, state, liveDocs, reuse, flags);
+      }
+
+      @Override
+      public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
+        if (!hasPositions()) {
+          return null;
+        }
+        decodeMetaData();
+        return postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse, flags);
+      }
+
+      // nocommit: do we need this? for SegmentTermsEnum, we can maintain
+      // a stack to record how current term is constructed on FST, (and ord on each alphabet)
+      // so that during seek we don't have to start from the first arc.
+      // however, we'll be implementing a new fstEnum instead of wrapping current one.
+      @Override
+      public void seekExact(long ord) throws IOException {
+        throw new UnsupportedOperationException();
+      }
+
+      @Override
+      public long ord() {
+        throw new UnsupportedOperationException();
+      }
+    }
+
+
+    // Iterates through all terms in this field
+    private final class SegmentTermsEnum extends BaseTermsEnum {
+      final BytesRefFSTEnum<TempTermOutputs.TempMetaData> fstEnum;
+
+      /* True when current term's metadata is decoded */
+      boolean decoded;
+
+      /* True when current enum is 'positioned' by seekExact(TermState) */
+      boolean seekPending;
+
+      SegmentTermsEnum() throws IOException {
+        super();
+        this.fstEnum = new BytesRefFSTEnum<TempTermOutputs.TempMetaData>(dict);
+        this.decoded = false;
+        this.seekPending = false;
+        this.meta = null;
+      }
+
       // Let PBF decode metadata from long[] and byte[]
-      private void decodeMetaData() throws IOException {
+      @Override
+      void decodeMetaData() throws IOException {
         if (!decoded && !seekPending) {
           if (meta.bytes != null) {
             bytesReader.reset(meta.bytes, 0, meta.bytes.length);
@@ -285,7 +329,7 @@ public class TempFSTTermsReader extends 
       }
 
       // Update current enum according to FSTEnum
-      private void updateEnum(final InputOutput<TempTermOutputs.TempMetaData> pair) {
+      void updateEnum(final InputOutput<TempTermOutputs.TempMetaData> pair) {
         if (pair == null) {
           term = null;
         } else {
@@ -299,21 +343,6 @@ public class TempFSTTermsReader extends 
       }
 
       @Override
-      public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
-        decodeMetaData();
-        return postingsReader.docs(fieldInfo, state, liveDocs, reuse, flags);
-      }
-
-      @Override
-      public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
-        if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
-          return null;
-        }
-        decodeMetaData();
-        return postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse, flags);
-      }
-
-      @Override
       public BytesRef next() throws IOException {
         if (seekPending) {  // previously positioned, but termOutputs not fetched
           seekPending = false;
@@ -325,13 +354,13 @@ public class TempFSTTermsReader extends 
       }
 
       @Override
-      public boolean seekExact(final BytesRef target, final boolean useCache) throws IOException {
+      public boolean seekExact(BytesRef target, boolean useCache) throws IOException {
         updateEnum(fstEnum.seekExact(target));
         return term != null;
       }
 
       @Override
-      public SeekStatus seekCeil(final BytesRef target, final boolean useCache) throws IOException {
+      public SeekStatus seekCeil(BytesRef target, boolean useCache) throws IOException {
         updateEnum(fstEnum.seekCeil(target));
         if (term == null) {
           return SeekStatus.END;
@@ -340,28 +369,338 @@ public class TempFSTTermsReader extends 
         }
       }
 
-      // nocommit: this method doesn't act as 'seekExact' right?
       @Override
       public void seekExact(BytesRef target, TermState otherState) {
-        if (term == null || target.compareTo(term) != 0) {
+        if (!target.equals(term)) {
           state.copyFrom(otherState);
           term = BytesRef.deepCopyOf(target);
           seekPending = true;
         }
       }
+    }
+
+    // Iterates intersect result with automaton (cannot seek!)
+    private final class IntersectTermsEnum extends BaseTermsEnum {
+      /* True when current term's metadata is decoded */
+      boolean decoded;
+
+      /* True when there is pending term when calling next() */
+      boolean pending;
+
+      /* stack to record how current term is constructed, used to accumulate
+       * metadata or rewind term:
+       *   stack.size() == term.length + 2,
+       *                == 1 when term is null */
+      final Stack<Frame> stack;
+
+      /* term dict fst */
+      final FST<TempTermOutputs.TempMetaData> fst;
+      final FST.BytesReader fstReader;
+      final Outputs<TempTermOutputs.TempMetaData> fstOutputs;
+
+      /* query automaton to intersect with */
+      final ByteRunAutomaton fsa;
+      final BytesRef fsaCommonSuffix;
+
+      private final class Frame {
+        /* fst stats */
+        FST.Arc<TempTermOutputs.TempMetaData> fstArc;
+        long fstPos;
+
+        /* automaton stats */
+        int fsaState;
+
+        Frame() {
+          this.fstArc = new FST.Arc<TempTermOutputs.TempMetaData>();
+          this.fsaState = -1;
+        }
+
+        public String toString() {
+          return "arc=" + fstArc + " state=" + fsaState;
+        }
+      }
+
+      IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
+        super();
+        //if (DEBUG) System.out.println("Enum init, startTerm=" + startTerm);
+        this.fst = dict;
+        this.fstReader = fst.getBytesReader();
+        this.fstOutputs = dict.outputs;
+        this.fsa = compiled.runAutomaton;
+        this.fsaCommonSuffix = compiled.commonSuffixRef;
+        /*
+        PrintWriter pw1 = new PrintWriter(new File("../temp/fst.txt"));
+        Util.toDot(dict,pw1, false, false);
+        pw1.close();
+        PrintWriter pw2 = new PrintWriter(new File("../temp/fsa.txt"));
+        pw2.write(compiled.toDot());
+        pw2.close();
+        */
+        this.meta = fstOutputs.getNoOutput();
+        this.stack = new Stack<Frame>();
+
+        Frame frame;
+        frame = loadVirtualFrame(newFrame());
+        stack.push(frame);
+        frame = loadFirstFrame(newFrame());
+        pushFrame(frame);
+
+        this.decoded = false;
+        this.pending = false;
+
+        if (startTerm == null) {
+          pending = isAccept(topFrame());
+        } else {
+          doSeekCeil(startTerm);
+          pending = !startTerm.equals(term) && isValid(topFrame()) && isAccept(topFrame());
+        }
+      }
 
-      // nocommit: do we need this?
       @Override
-      public void seekExact(long ord) throws IOException {
-        throw new UnsupportedOperationException();
+      void decodeMetaData() throws IOException {
+        assert term != null;
+        if (!decoded) {
+          if (meta.bytes != null) {
+            bytesReader.reset(meta.bytes, 0, meta.bytes.length);
+          }
+          postingsReader.decodeTerm(meta.longs, bytesReader, fieldInfo, state);
+          decoded = true;
+        }
       }
 
       @Override
-      public long ord() {
-        throw new UnsupportedOperationException();
+      public SeekStatus seekCeil(BytesRef target, boolean useCache) throws IOException {
+        decoded = false;
+        term = doSeekCeil(target);
+        if (term == null) {
+          return SeekStatus.END;
+        } else {
+          return term.equals(target) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
+        }
+      }
+
+      // nocommit: make use of extra info, like commonSuffixRef
+      @Override
+      public BytesRef next() throws IOException {
+        //if (DEBUG) System.out.println("Enum next()");
+        if (pending) {
+          pending = false;
+          return term;
+        }
+        decoded = false;
+      DFS:
+        while (stack.size() > 1) {
+          Frame frame = newFrame();
+          if (loadExpandFrame(topFrame(), frame) != null) {  // has valid target
+            pushFrame(frame);
+            if (isAccept(frame)) {  // gotcha
+              return term;
+            }
+            continue;  // check next target
+          } 
+          frame = popFrame();
+          while(stack.size() > 1) {
+            if (loadNextFrame(topFrame(), frame) != null) {  // has valid sibling 
+              pushFrame(frame);
+              if (isAccept(frame)) {  // gotcha
+                return term;
+              }
+              continue DFS;   // check next target 
+            }
+            frame = popFrame();
+          }
+          return null;
+        }
+        return null;
+      }
+
+      private BytesRef doSeekCeil(BytesRef target) throws IOException {
+        //while (next() != null && term.compareTo(target) < 0) {} if (true) return term;
+        //if (DEBUG) System.out.println("Enum doSeekCeil()");
+        Frame frame= null;
+        int label, upto = 0, limit = target.length;
+        while (upto < limit) {  // to target prefix, or ceil label (rewind prefix)
+          frame = newFrame();
+          label = target.bytes[upto] & 0xff;
+          frame = loadCeilFrame(label, topFrame(), frame);
+          if (frame == null || frame.fstArc.label != label) {
+            break;
+          }
+          assert isValid(frame);  // target must be fetched from automaton
+          pushFrame(frame);
+          upto++;
+        }
+        if (upto == limit) {  // got target
+          return term;
+        }
+        if (frame != null) {  // got larger term('s prefix)
+          pushFrame(frame);
+          return isAccept(frame) ? term : next();
+        }
+        while (stack.size() > 1) {  // got target's prefix, advance to larger term
+          frame = popFrame();
+          while (stack.size() > 1 && !canRewind(frame)) {
+            frame = popFrame();
+          }
+          if (loadNextFrame(topFrame(), frame) != null) {
+            pushFrame(frame);
+            return isAccept(frame) ? term : next();
+          }
+        }
+        return null;
+      }
+
+      // nocommit: might be great if we can set flag BIT_LAST_ARC
+
+      /** Virtual frame, never pop */
+      Frame loadVirtualFrame(Frame frame) throws IOException {
+        frame.fstArc.output = fstOutputs.getNoOutput();
+        frame.fstArc.nextFinalOutput = fstOutputs.getNoOutput();
+        frame.fstPos = -1;
+        frame.fsaState = -1;
+        return frame;
+      }
+
+      /** Load frame for start arc(node) on fst */
+      Frame loadFirstFrame(Frame frame) throws IOException {
+        frame.fstArc = fst.getFirstArc(frame.fstArc);
+        frame.fstPos = fstReader.getPosition();
+        frame.fsaState = fsa.getInitialState();
+        return frame;
+      }
+
+      // nocommit: expected to use readFirstTargetArc here?
+
+      /** Load frame for target arc(node) on fst */
+      Frame loadExpandFrame(Frame top, Frame frame) throws IOException {
+        if (!canGrow(top)) {
+          return null;
+        }
+        frame.fstArc = fst.readFirstRealTargetArc(top.fstArc.target, frame.fstArc, fstReader);
+        frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label);
+        //if (DEBUG) System.out.println(" loadExpand frame="+frame);
+        if (frame.fsaState == -1) {
+          return loadNextFrame(top, frame);
+        }
+        frame.fstPos = fstReader.getPosition();
+        return frame;
+      }
+
+      // nocommit: actually, here we're looking for an valid state for fsa, 
+      //           so if numArcs is large in fst, we should try a reverse lookup?
+      //           but we don have methods like advance(label) in fst, even 
+      //           binary search hurts. 
+      
+      /** Load frame for sibling arc(node) on fst */
+      Frame loadNextFrame(Frame top, Frame frame) throws IOException {
+        if (!canRewind(frame)) {
+          return null;
+        }
+        fstReader.setPosition(frame.fstPos);
+        while (!frame.fstArc.isLast()) {
+          frame.fstArc = fst.readNextRealArc(frame.fstArc, fstReader);
+          frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label);
+          if (frame.fsaState != -1) {
+            break;
+          }
+        }
+        //if (DEBUG) System.out.println(" loadNext frame="+frame);
+        if (frame.fsaState == -1) {
+          return null;
+        }
+        frame.fstPos = fstReader.getPosition();
+        return frame;
+      }
+
+      /** Load frame for target arc(node) on fst, so that 
+       *  arc.label >= label and !fsa.reject(arc.label) */
+      Frame loadCeilFrame(int label, Frame top, Frame frame) throws IOException {
+        FST.Arc<TempTermOutputs.TempMetaData> arc = frame.fstArc;
+        arc = Util.readCeilArc(label, fst, top.fstArc, arc, fstReader);
+        if (arc == null) {
+          return null;
+        }
+        frame.fsaState = fsa.step(top.fsaState, arc.label);
+        //if (DEBUG) System.out.println(" loadCeil frame="+frame);
+        if (frame.fsaState == -1) {
+          return loadNextFrame(top, frame);
+        }
+        frame.fstPos = fstReader.getPosition();
+        return frame;
+      }
+
+      boolean isAccept(Frame frame) {  // reach a term both fst&fsa accepts
+        return fsa.isAccept(frame.fsaState) && frame.fstArc.isFinal();
+      }
+      boolean isValid(Frame frame) {   // reach a prefix both fst&fsa won't reject
+        return frame.fsaState != -1;// && frame != null;
+      }
+      boolean canGrow(Frame frame) {   // can walk forward on both fst&fsa
+        return frame.fsaState != -1 && FST.targetHasArcs(frame.fstArc);
+      }
+      boolean canRewind(Frame frame) { // can jump to sibling
+        return !frame.fstArc.isLast();
+      }
+
+      void pushFrame(Frame frame) throws IOException {
+        final FST.Arc<TempTermOutputs.TempMetaData> arc = frame.fstArc;
+        arc.output = fstOutputs.add(topFrame().fstArc.output, arc.output);
+        if (arc.isFinal()) {
+          arc.nextFinalOutput = fstOutputs.add(arc.output, arc.nextFinalOutput);
+          meta = arc.nextFinalOutput;
+        } else {
+          meta = arc.output;
+        }
+        term = grow(arc.label);
+        state.docFreq = meta.docFreq;
+        state.totalTermFreq = meta.totalTermFreq;
+        stack.push(frame);
+        //if (DEBUG) System.out.println("  term=" + term + " stack=" + stack.size());
+      }
+
+      Frame popFrame() throws IOException {
+        final Frame pop = stack.pop(), top = topFrame();
+        if (top.fstArc.isFinal()) {
+          meta = top.fstArc.nextFinalOutput;
+        } else {
+          meta = top.fstArc.output;
+        }
+        term = shrink();
+        //if (DEBUG) System.out.println("  term=" + term + " stack=" + stack.size());
+        return pop;
+      }
+
+      Frame newFrame() throws IOException {
+        return new Frame();
+      }
+
+      Frame topFrame() throws IOException {
+        return stack.peek();
+      }
+
+      BytesRef grow(int label) {
+        if (term == null) {
+          term = new BytesRef(new byte[16], 0, 0);
+        } else {
+          if (term.length == term.bytes.length) {
+            term.grow(term.length+1);
+          }
+          term.bytes[term.length++] = (byte)label;
+        }
+        return term;
+      }
+
+      BytesRef shrink() {
+        if (term.length == 0) {
+          term = null;
+        } else {
+          term.length--;
+        }
+        return term;
       }
     }
   }
+
   static<T> void walk(FST<T> fst) throws IOException {
     final ArrayList<FST.Arc<T>> queue = new ArrayList<FST.Arc<T>>();
     final BitSet seen = new BitSet();
@@ -371,9 +710,11 @@ public class TempFSTTermsReader extends 
     while (!queue.isEmpty()) {
       final FST.Arc<T> arc = queue.remove(0);
       final long node = arc.target;
+      // nocommit: hmm... for startArc, Arc.toString() is broken???
+      //           BIT_ARC_HAS_FINAL_OUTPUT never set
       //System.out.println(arc);
       if (FST.targetHasArcs(arc) && !seen.get((int) node)) {
-        //seen.set((int) node);
+        seen.set((int) node);
         fst.readFirstRealTargetArc(node, arc, reader);
         while (true) {
           queue.add(new FST.Arc<T>().copyFrom(arc));

Modified: lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsWriter.java?rev=1506385&r1=1506384&r2=1506385&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsWriter.java (original)
+++ lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempFSTTermsWriter.java Wed Jul 24 02:40:33 2013
@@ -179,7 +179,6 @@ public class TempFSTTermsWriter extends 
         metaWriter.writeTo(meta.bytes, 0);
         metaWriter.reset();
       }
-      //System.out.println("add term:<"+text.utf8ToString()+", "+meta+">");
       builder.add(Util.toIntsRef(text, scratchTerm), meta);
       numTerms++;
     }
@@ -189,7 +188,6 @@ public class TempFSTTermsWriter extends 
       // save FST dict
       if (numTerms > 0) {
         final FST<TempTermOutputs.TempMetaData> fst = builder.finish();
-        //fst.dump();
         fields.add(new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, fst));
       }
     }

Modified: lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempTermOutputs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempTermOutputs.java?rev=1506385&r1=1506384&r2=1506385&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempTermOutputs.java (original)
+++ lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/temp/TempTermOutputs.java Wed Jul 24 02:40:33 2013
@@ -205,6 +205,8 @@ public class TempTermOutputs extends Out
     return ret;
   }
 
+  // nocommit: we might refactor out an 'addSelf' later, 
+  // which improves 5~7% for fuzzy queries
   @Override
   public TempMetaData add(TempMetaData t1, TempMetaData t2) {
     if (DEBUG) System.out.print("add("+t1+", "+t2+") = ");
@@ -328,13 +330,13 @@ public class TempTermOutputs extends Out
     if (t1.bytes == null && t2.bytes == null) {
       return true;
     }
-    return t1.bytes != null && t2.bytes !=null && Arrays.equals(t1.bytes, t2.bytes);
+    return t1.bytes != null && t2.bytes != null && Arrays.equals(t1.bytes, t2.bytes);
   }
   static boolean longsEqual(final TempMetaData t1, final TempMetaData t2) {
     if (t1.longs == null && t2.longs == null) {
       return true;
     }
-    return t1.longs != null && t2.longs !=null && Arrays.equals(t1.longs, t2.longs);
+    return t1.longs != null && t2.longs != null && Arrays.equals(t1.longs, t2.longs);
   }
   static boolean allZero(final long[] l) {
     for (int i = 0; i < l.length; i++) {

Modified: lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java?rev=1506385&r1=1506384&r2=1506385&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java (original)
+++ lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java Wed Jul 24 02:40:33 2013
@@ -345,4 +345,24 @@ public class CompiledAutomaton {
       }
     }
   }
+  
+  public String toDot() {
+    StringBuilder b = new StringBuilder("digraph CompiledAutomaton {\n");
+    b.append("  rankdir = LR;\n");
+    int initial = runAutomaton.getInitialState();
+    for (int i = 0; i < sortedTransitions.length; i++) {
+      b.append("  ").append(i);
+      if (runAutomaton.isAccept(i)) b.append(" [shape=doublecircle,label=\"\"];\n");
+      else b.append(" [shape=circle,label=\"\"];\n");
+      if (i == initial) {
+        b.append("  initial [shape=plaintext,label=\"\"];\n");
+        b.append("  initial -> ").append(i).append("\n");
+      }
+      for (int j = 0; j < sortedTransitions[i].length; j++) {
+        b.append("  ").append(i);
+        sortedTransitions[i][j].appendDot(b);
+      }
+    }
+    return b.append("}\n").toString();
+  }
 }

Modified: lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/util/fst/FST.java?rev=1506385&r1=1506384&r2=1506385&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/util/fst/FST.java (original)
+++ lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/util/fst/FST.java Wed Jul 24 02:40:33 2013
@@ -231,16 +231,19 @@ public final class FST<T> {
       StringBuilder b = new StringBuilder();
       b.append("node=" + node);
       b.append(" target=" + target);
-      b.append(" label=" + label);
-      if (flag(BIT_LAST_ARC)) {
-        b.append(" last");
-      }
+      b.append(" label=0x" + Integer.toHexString(label));
       if (flag(BIT_FINAL_ARC)) {
         b.append(" final");
       }
+      if (flag(BIT_LAST_ARC)) {
+        b.append(" last");
+      }
       if (flag(BIT_TARGET_NEXT)) {
         b.append(" targetNext");
       }
+      if (flag(BIT_STOP_NODE)) {
+        b.append(" stop");
+      }
       if (flag(BIT_ARC_HAS_OUTPUT)) {
         b.append(" output=" + output);
       }

Modified: lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/util/fst/Util.java?rev=1506385&r1=1506384&r2=1506385&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/util/fst/Util.java (original)
+++ lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/util/fst/Util.java Wed Jul 24 02:40:33 2013
@@ -762,10 +762,11 @@ public final class Util {
    */
   private static String printableLabel(int label) {
     if (label >= 0x20 && label <= 0x7d) {
-      return Character.toString((char) label);
-    } else {
-      return "0x" + Integer.toHexString(label);
+      if (label != 0x22 && label != 0x5c) {  // " OR \
+        return Character.toString((char) label);
+      }
     }
+    return "0x" + Integer.toHexString(label);
   }
 
   /** Just maps each UTF16 unit (char) to the ints in an

Modified: lucene/dev/branches/lucene3069/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java?rev=1506385&r1=1506384&r2=1506385&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java (original)
+++ lucene/dev/branches/lucene3069/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java Wed Jul 24 02:40:33 2013
@@ -340,7 +340,6 @@ public class TestTermsEnum extends Lucen
             loc++;
           } while (loc < termsArray.length && !acceptTermsSet.contains(termsArray[loc]));
         }
-
         assertNull(te.next());
       }
     }
@@ -771,4 +770,116 @@ public class TestTermsEnum extends Lucen
     r.close();
     dir.close();
   }
+  public void testIntersectStartTerm() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(new LogDocMergePolicy());
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+    Document doc = new Document();
+    doc.add(newStringField("field", "abc", Field.Store.NO));
+    w.addDocument(doc);
+
+    doc = new Document();
+    doc.add(newStringField("field", "abd", Field.Store.NO));
+    w.addDocument(doc);
+
+    doc = new Document();
+    doc.add(newStringField("field", "acd", Field.Store.NO));
+    w.addDocument(doc);
+
+    doc = new Document();
+    doc.add(newStringField("field", "bcd", Field.Store.NO));
+    w.addDocument(doc);
+
+    w.forceMerge(1);
+    DirectoryReader r = w.getReader();
+    w.close();
+    AtomicReader sub = getOnlySegmentReader(r);
+    Terms terms = sub.fields().terms("field");
+
+    Automaton automaton = new RegExp(".*d", RegExp.NONE).toAutomaton();
+    CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);    
+    TermsEnum te;
+    
+    // should seek to startTerm
+    te = terms.intersect(ca, new BytesRef("aad"));
+    assertEquals("abd", te.next().utf8ToString());
+    assertEquals(1, te.docs(null, null, DocsEnum.FLAG_NONE).nextDoc());
+    assertEquals("acd", te.next().utf8ToString());
+    assertEquals(2, te.docs(null, null, DocsEnum.FLAG_NONE).nextDoc());
+    assertEquals("bcd", te.next().utf8ToString());
+    assertEquals(3, te.docs(null, null, DocsEnum.FLAG_NONE).nextDoc());
+    assertNull(te.next());
+
+    // should fail to find ceil label on second arc, rewind 
+    te = terms.intersect(ca, new BytesRef("add"));
+    assertEquals("bcd", te.next().utf8ToString());
+    assertEquals(3, te.docs(null, null, DocsEnum.FLAG_NONE).nextDoc());
+    assertNull(te.next());
+
+    // should reach end
+    te = terms.intersect(ca, new BytesRef("bcd"));
+    assertNull(te.next());
+    te = terms.intersect(ca, new BytesRef("ddd"));
+    assertNull(te.next());
+
+    r.close();
+    dir.close();
+  }
+
+  public void testIntersectEmptyString() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(new LogDocMergePolicy());
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+    Document doc = new Document();
+    doc.add(newStringField("field", "", Field.Store.NO));
+    doc.add(newStringField("field", "abc", Field.Store.NO));
+    w.addDocument(doc);
+
+    doc = new Document();
+    // add empty string to both documents, so that singletonDocID == -1.
+    // For a FST-based term dict, we'll expect to see the first arc is 
+    // flaged with HAS_FINAL_OUTPUT
+    doc.add(newStringField("field", "abc", Field.Store.NO));
+    doc.add(newStringField("field", "", Field.Store.NO));
+    w.addDocument(doc);
+
+    w.forceMerge(1);
+    DirectoryReader r = w.getReader();
+    w.close();
+    AtomicReader sub = getOnlySegmentReader(r);
+    Terms terms = sub.fields().terms("field");
+
+    Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();  // accept ALL
+    CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);    
+
+    TermsEnum te = terms.intersect(ca, null);
+    DocsEnum de;
+
+    assertEquals("", te.next().utf8ToString());
+    de = te.docs(null, null, DocsEnum.FLAG_NONE);
+    assertEquals(0, de.nextDoc());
+    assertEquals(1, de.nextDoc());
+
+    assertEquals("abc", te.next().utf8ToString());
+    de = te.docs(null, null, DocsEnum.FLAG_NONE);
+    assertEquals(0, de.nextDoc());
+    assertEquals(1, de.nextDoc());
+
+    assertNull(te.next());
+
+    // pass empty string
+    te = terms.intersect(ca, new BytesRef(""));
+
+    assertEquals("abc", te.next().utf8ToString());
+    de = te.docs(null, null, DocsEnum.FLAG_NONE);
+    assertEquals(0, de.nextDoc());
+    assertEquals(1, de.nextDoc());
+
+    assertNull(te.next());
+
+    r.close();
+    dir.close();
+  }
 }