Posted to commits@lucene.apache.org by mi...@apache.org on 2013/10/14 17:55:58 UTC

svn commit: r1531949 [2/3] - in /lucene/dev/trunk/lucene: ./ codecs/src/java/org/apache/lucene/codecs/blockterms/ codecs/src/java/org/apache/lucene/codecs/bloom/ codecs/src/java/org/apache/lucene/codecs/memory/ codecs/src/java/org/apache/lucene/codecs/...

Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java?rev=1531949&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java Mon Oct 14 15:55:57 2013
@@ -0,0 +1,1155 @@
+package org.apache.lucene.codecs;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.ByteSequenceOutputs;
+import org.apache.lucene.util.fst.BytesRefFSTEnum;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.NoOutputs;
+import org.apache.lucene.util.fst.Util;
+import org.apache.lucene.util.packed.PackedInts;
+
+/*
+  TODO:
+  
+    - Currently there is a one-to-one mapping of indexed
+      term to term block, but we could decouple the two, ie,
+      put more terms into the index than there are blocks.
+      The index would take up more RAM but then it'd be able
+      to avoid seeking more often and could make PK/FuzzyQ
+      faster if the additional indexed terms could store
+      the offset into the terms block.
+
+    - The blocks are not written in true depth-first
+      order, meaning if you just next() the file pointer will
+      sometimes jump backwards.  For example, block foo* will
+      be written before block f* because it finished before.
+      This could possibly hurt performance if the terms dict is
+      not hot, since OSs anticipate sequential file access.  We
+      could fix the writer to re-order the blocks as a 2nd
+      pass.
+
+    - Each block encodes the term suffixes packed
+      sequentially using a separate vInt per term, which is
+      1) wasteful and 2) slow (must linear scan to find a
+      particular suffix).  We should instead 1) make
+      random-access array so we can directly access the Nth
+      suffix, and 2) bulk-encode this array using bulk int[]
+      codecs; then at search time we can binary search when
+      we seek a particular term.
+*/
+
+/**
+ * Block-based terms index and dictionary writer.
+ * <p>
+ * Writes terms dict and index, block-encoding (column
+ * stride) each term's metadata for each set of terms
+ * between two index terms.
+ * <p>
+ * Files:
+ * <ul>
+ *   <li><tt>.tim</tt>: <a href="#Termdictionary">Term Dictionary</a></li>
+ *   <li><tt>.tip</tt>: <a href="#Termindex">Term Index</a></li>
+ * </ul>
+ * <p>
+ * <a name="Termdictionary" id="Termdictionary"></a>
+ * <h3>Term Dictionary</h3>
+ *
+ * <p>The .tim file contains the list of terms in each
+ * field along with per-term statistics (such as docfreq)
+ * and per-term metadata (typically pointers to the postings list
+ * for that term in the inverted index).
+ * </p>
+ *
+ * <p>The .tim file is arranged in blocks, with each block
+ * containing a variable number of entries (by default
+ * 25-48), where each entry is either a term or a reference
+ * to a sub-block.</p>
+ *
+ * <p>NOTE: The term dictionary can plug into different postings implementations:
+ * the postings writer/reader are actually responsible for encoding 
+ * and decoding the Postings Metadata and Term Metadata sections.</p>
+ *
+ * <ul>
+ *    <li>TermsDict (.tim) --&gt; Header, <i>PostingsHeader</i>, NodeBlock<sup>NumBlocks</sup>,
+ *                               FieldSummary, DirOffset</li>
+ *    <li>NodeBlock --&gt; (OuterNode | InnerNode)</li>
+ *    <li>OuterNode --&gt; EntryCount, SuffixLength, Byte<sup>SuffixLength</sup>, StatsLength, &lt; TermStats &gt;<sup>EntryCount</sup>, MetaLength, &lt;<i>TermMetadata</i>&gt;<sup>EntryCount</sup></li>
+ *    <li>InnerNode --&gt; EntryCount, SuffixLength[,Sub?], Byte<sup>SuffixLength</sup>, StatsLength, &lt; TermStats ? &gt;<sup>EntryCount</sup>, MetaLength, &lt;<i>TermMetadata ? </i>&gt;<sup>EntryCount</sup></li>
+ *    <li>TermStats --&gt; DocFreq, TotalTermFreq </li>
+ *    <li>FieldSummary --&gt; NumFields, &lt;FieldNumber, NumTerms, RootCodeLength, Byte<sup>RootCodeLength</sup>,
+ *                            SumTotalTermFreq?, SumDocFreq, DocCount&gt;<sup>NumFields</sup></li>
+ *    <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
+ *    <li>DirOffset --&gt; {@link DataOutput#writeLong Uint64}</li>
+ *    <li>EntryCount,SuffixLength,StatsLength,DocFreq,MetaLength,NumFields,
+ *        FieldNumber,RootCodeLength,DocCount --&gt; {@link DataOutput#writeVInt VInt}</li>
+ *    <li>TotalTermFreq,NumTerms,SumTotalTermFreq,SumDocFreq --&gt; 
+ *        {@link DataOutput#writeVLong VLong}</li>
+ * </ul>
+ * <p>Notes:</p>
+ * <ul>
+ *    <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
+ *        for the BlockTree implementation.</li>
+ *    <li>DirOffset is a pointer to the FieldSummary section.</li>
+ *    <li>DocFreq is the count of documents which contain the term.</li>
+ *    <li>TotalTermFreq is the total number of occurrences of the term. This is encoded
+ *        as the difference between the total number of occurrences and the DocFreq.</li>
+ *    <li>FieldNumber is the field's number from {@link FieldInfos} (.fnm).</li>
+ *    <li>NumTerms is the number of unique terms for the field.</li>
+ *    <li>RootCode points to the root block for the field.</li>
+ *    <li>SumDocFreq is the total number of postings, the number of term-document pairs across
+ *        the entire field.</li>
+ *    <li>DocCount is the number of documents that have at least one posting for this field.</li>
+ *    <li>PostingsHeader and TermMetadata are plugged into by the specific postings implementation:
+ *        these contain arbitrary per-file data (such as parameters or versioning information) 
+ *        and per-term data (such as pointers to inverted files).</li>
+ *    <li>For inner nodes of the tree, every entry steals one bit to mark whether it points
+ *        to a child node (sub-block). If so, the corresponding TermStats and TermMetadata are omitted.</li>
+ * </ul>
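+ * <p>For example, for a field that indexes term frequencies, a term with
+ * DocFreq 12 and TotalTermFreq 30 writes its TermStats as the VInt 12
+ * followed by the VLong 18 (30 - 12).</p>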
+ * <a name="Termindex" id="Termindex"></a>
+ * <h3>Term Index</h3>
+ * <p>The .tip file contains an index into the term dictionary, so that it can be 
+ * accessed randomly.  The index is also used to determine
+ * when a given term cannot exist on disk (in the .tim file), saving a disk seek.</p>
+ * <ul>
+ *   <li>TermsIndex (.tip) --&gt; Header, FSTIndex<sup>NumFields</sup>,
+ *                                &lt;IndexStartFP&gt;<sup>NumFields</sup>, DirOffset</li>
+ *   <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
+ *   <li>DirOffset --&gt; {@link DataOutput#writeLong Uint64}</li>
+ *   <li>IndexStartFP --&gt; {@link DataOutput#writeVLong VLong}</li>
+ *   <!-- TODO: better describe FST output here -->
+ *   <li>FSTIndex --&gt; {@link FST FST&lt;byte[]&gt;}</li>
+ * </ul>
+ * <p>Notes:</p>
+ * <ul>
+ *   <li>The .tip file contains a separate FST for each
+ *       field.  The FST maps a term prefix to the on-disk
+ *       block that holds all terms starting with that
+ *       prefix.  Each field's IndexStartFP points to its
+ *       FST.</li>
+ *   <li>DirOffset is a pointer to the start of the IndexStartFPs
+ *       for all fields.</li>
+ *   <li>It's possible that an on-disk block would contain
+ *       too many terms (more than the allowed maximum,
+ *       by default 48).  When this happens, the block is
+ *       sub-divided into new blocks (called "floor
+ *       blocks"), and then the output in the FST for the
+ *       block's prefix encodes the leading byte of each
+ *       sub-block, and its file pointer.</li>
+ * </ul>
+ *
+ * @see BlockTreeTermsReader
+ * @lucene.experimental
+ */
+
+public class BlockTreeTermsWriter extends FieldsConsumer implements Closeable {
+
+  /** Suggested default value for the {@code
+   *  minItemsInBlock} parameter to {@link
+   *  #BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. */
+  public final static int DEFAULT_MIN_BLOCK_SIZE = 25;
+
+  /** Suggested default value for the {@code
+   *  maxItemsInBlock} parameter to {@link
+   *  #BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. */
+  public final static int DEFAULT_MAX_BLOCK_SIZE = 48;
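+
+  // A minimal usage sketch (not part of this commit): a postings format
+  // would typically wire this writer up with the defaults above, e.g. with
+  // Lucene41PostingsWriter as the concrete PostingsWriterBase:
+  //
+  //   PostingsWriterBase postingsWriter = new Lucene41PostingsWriter(state);
+  //   FieldsConsumer consumer = new BlockTreeTermsWriter(state, postingsWriter,
+  //       BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
+  //       BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);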
+
+  //public final static boolean DEBUG = false;
+  //private final static boolean SAVE_DOT_FILES = false;
+
+  static final int OUTPUT_FLAGS_NUM_BITS = 2;
+  static final int OUTPUT_FLAGS_MASK = 0x3;
+  static final int OUTPUT_FLAG_IS_FLOOR = 0x1;
+  static final int OUTPUT_FLAG_HAS_TERMS = 0x2;
+
+  /** Extension of terms file */
+  static final String TERMS_EXTENSION = "tim";
+  final static String TERMS_CODEC_NAME = "BLOCK_TREE_TERMS_DICT";
+
+  /** Initial terms format. */
+  public static final int TERMS_VERSION_START = 0;
+  
+  /** Append-only */
+  public static final int TERMS_VERSION_APPEND_ONLY = 1;
+
+  /** Meta data as array */
+  public static final int TERMS_VERSION_META_ARRAY = 2;
+
+  /** Current terms format. */
+  public static final int TERMS_VERSION_CURRENT = TERMS_VERSION_META_ARRAY;
+
+  /** Extension of terms index file */
+  static final String TERMS_INDEX_EXTENSION = "tip";
+  final static String TERMS_INDEX_CODEC_NAME = "BLOCK_TREE_TERMS_INDEX";
+
+  /** Initial index format. */
+  public static final int TERMS_INDEX_VERSION_START = 0;
+  
+  /** Append-only */
+  public static final int TERMS_INDEX_VERSION_APPEND_ONLY = 1;
+
+  /** Meta data as array */
+  public static final int TERMS_INDEX_VERSION_META_ARRAY = 2;
+
+  /** Current index format. */
+  public static final int TERMS_INDEX_VERSION_CURRENT = TERMS_INDEX_VERSION_META_ARRAY;
+
+  private final IndexOutput out;
+  private final IndexOutput indexOut;
+  final int maxDoc;
+  final int minItemsInBlock;
+  final int maxItemsInBlock;
+
+  final PostingsWriterBase postingsWriter;
+  final FieldInfos fieldInfos;
+
+  private static class FieldMetaData {
+    public final FieldInfo fieldInfo;
+    public final BytesRef rootCode;
+    public final long numTerms;
+    public final long indexStartFP;
+    public final long sumTotalTermFreq;
+    public final long sumDocFreq;
+    public final int docCount;
+    private final int longsSize;
+
+    public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long indexStartFP, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) {
+      assert numTerms > 0;
+      this.fieldInfo = fieldInfo;
+      assert rootCode != null: "field=" + fieldInfo.name + " numTerms=" + numTerms;
+      this.rootCode = rootCode;
+      this.indexStartFP = indexStartFP;
+      this.numTerms = numTerms;
+      this.sumTotalTermFreq = sumTotalTermFreq;
+      this.sumDocFreq = sumDocFreq;
+      this.docCount = docCount;
+      this.longsSize = longsSize;
+    }
+  }
+
+  private final List<FieldMetaData> fields = new ArrayList<FieldMetaData>();
+  // private final String segment;
+
+  /** Create a new writer.  The number of items (terms or
+   *  sub-blocks) per block will aim to be between
+   *  minItemsInBlock and maxItemsInBlock, though in some
+   *  cases the blocks may be smaller than the min. */
+  public BlockTreeTermsWriter(
+                              SegmentWriteState state,
+                              PostingsWriterBase postingsWriter,
+                              int minItemsInBlock,
+                              int maxItemsInBlock)
+    throws IOException
+  {
+    if (minItemsInBlock <= 1) {
+      throw new IllegalArgumentException("minItemsInBlock must be >= 2; got " + minItemsInBlock);
+    }
+    if (maxItemsInBlock <= 0) {
+      throw new IllegalArgumentException("maxItemsInBlock must be >= 1; got " + maxItemsInBlock);
+    }
+    if (minItemsInBlock > maxItemsInBlock) {
+      throw new IllegalArgumentException("maxItemsInBlock must be >= minItemsInBlock; got maxItemsInBlock=" + maxItemsInBlock + " minItemsInBlock=" + minItemsInBlock);
+    }
+    if (2*(minItemsInBlock-1) > maxItemsInBlock) {
+      throw new IllegalArgumentException("maxItemsInBlock must be at least 2*(minItemsInBlock-1); got maxItemsInBlock=" + maxItemsInBlock + " minItemsInBlock=" + minItemsInBlock);
+    }
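+
+    // Note: the 2*(minItemsInBlock-1) bound above guarantees that an
+    // over-full block (more than maxItemsInBlock entries) can always be
+    // split into blocks holding at least minItemsInBlock entries each.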
+
+    maxDoc = state.segmentInfo.getDocCount();
+
+    final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION);
+    out = state.directory.createOutput(termsFileName, state.context);
+    boolean success = false;
+    IndexOutput indexOut = null;
+    try {
+      fieldInfos = state.fieldInfos;
+      this.minItemsInBlock = minItemsInBlock;
+      this.maxItemsInBlock = maxItemsInBlock;
+      writeHeader(out);
+
+      //DEBUG = state.segmentName.equals("_4a");
+
+      final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
+      indexOut = state.directory.createOutput(termsIndexFileName, state.context);
+      writeIndexHeader(indexOut);
+
+      this.postingsWriter = postingsWriter;
+      // segment = state.segmentName;
+
+      // System.out.println("BTW.init seg=" + state.segmentName);
+
+      postingsWriter.init(out);                          // have consumer write its format/header
+      success = true;
+    } finally {
+      if (!success) {
+        IOUtils.closeWhileHandlingException(out, indexOut);
+      }
+    }
+    this.indexOut = indexOut;
+  }
+
+  /** Writes the terms file header. */
+  private void writeHeader(IndexOutput out) throws IOException {
+    CodecUtil.writeHeader(out, TERMS_CODEC_NAME, TERMS_VERSION_CURRENT);   
+  }
+
+  /** Writes the index file header. */
+  private void writeIndexHeader(IndexOutput out) throws IOException {
+    CodecUtil.writeHeader(out, TERMS_INDEX_CODEC_NAME, TERMS_INDEX_VERSION_CURRENT); 
+  }
+
+  /** Writes the terms file trailer. */
+  private void writeTrailer(IndexOutput out, long dirStart) throws IOException {
+    out.writeLong(dirStart);    
+  }
+
+  /** Writes the index file trailer. */
+  private void writeIndexTrailer(IndexOutput indexOut, long dirStart) throws IOException {
+    indexOut.writeLong(dirStart);    
+  }
+
+  @Override
+  public void write(Fields fields) throws IOException {
+
+    boolean success = false;
+    try {
+      String lastField = null;
+      for(String field : fields) {
+        assert lastField == null || lastField.compareTo(field) < 0;
+        lastField = field;
+
+        Terms terms = fields.terms(field);
+        if (terms == null) {
+          continue;
+        }
+
+        TermsEnum termsEnum = terms.iterator(null);
+
+        TermsWriter termsWriter = new TermsWriter(fieldInfos.fieldInfo(field));
+
+        while (true) {
+          BytesRef term = termsEnum.next();
+          if (term == null) {
+            break;
+          }
+          termsWriter.write(term, termsEnum);
+        }
+
+        termsWriter.finish();
+      }
+      success = true;
+    } finally {
+      if (success) {
+        IOUtils.close(this);
+      } else {
+        IOUtils.closeWhileHandlingException(this);
+      }
+    }
+  }
+  
+  static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) {
+    assert fp < (1L << 62);
+    return (fp << 2) | (hasTerms ? OUTPUT_FLAG_HAS_TERMS : 0) | (isFloor ? OUTPUT_FLAG_IS_FLOOR : 0);
+  }
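+
+  // For example, fp=52, hasTerms=true, isFloor=false encodes to
+  // (52 << 2) | OUTPUT_FLAG_HAS_TERMS = 210; a reader recovers the file
+  // pointer with code >>> OUTPUT_FLAGS_NUM_BITS and the flags with
+  // code & OUTPUT_FLAGS_MASK.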
+
+  private static class PendingEntry {
+    public final boolean isTerm;
+
+    protected PendingEntry(boolean isTerm) {
+      this.isTerm = isTerm;
+    }
+  }
+
+  private static final class PendingTerm extends PendingEntry {
+    public final BytesRef term;
+    // stats + metadata
+    public final BlockTermState state;
+
+    public PendingTerm(BytesRef term, BlockTermState state) {
+      super(true);
+      this.term = term;
+      this.state = state;
+    }
+
+    @Override
+    public String toString() {
+      return term.utf8ToString();
+    }
+  }
+
+  private static final class PendingBlock extends PendingEntry {
+    public final BytesRef prefix;
+    public final long fp;
+    public FST<BytesRef> index;
+    public List<FST<BytesRef>> subIndices;
+    public final boolean hasTerms;
+    public final boolean isFloor;
+    public final int floorLeadByte;
+    private final IntsRef scratchIntsRef = new IntsRef();
+
+    public PendingBlock(BytesRef prefix, long fp, boolean hasTerms, boolean isFloor, int floorLeadByte, List<FST<BytesRef>> subIndices) {
+      super(false);
+      this.prefix = prefix;
+      this.fp = fp;
+      this.hasTerms = hasTerms;
+      this.isFloor = isFloor;
+      this.floorLeadByte = floorLeadByte;
+      this.subIndices = subIndices;
+    }
+
+    @Override
+    public String toString() {
+      return "BLOCK: " + prefix.utf8ToString();
+    }
+
+    public void compileIndex(List<PendingBlock> floorBlocks, RAMOutputStream scratchBytes) throws IOException {
+
+      assert (isFloor && floorBlocks != null && floorBlocks.size() != 0) || (!isFloor && floorBlocks == null): "isFloor=" + isFloor + " floorBlocks=" + floorBlocks;
+
+      assert scratchBytes.getFilePointer() == 0;
+
+      // TODO: try writing the leading vLong in MSB order
+      // (opposite of what Lucene does today), for better
+      // outputs sharing in the FST
+      scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor));
+      if (isFloor) {
+        scratchBytes.writeVInt(floorBlocks.size());
+        for (PendingBlock sub : floorBlocks) {
+          assert sub.floorLeadByte != -1;
+          //if (DEBUG) {
+          //  System.out.println("    write floorLeadByte=" + Integer.toHexString(sub.floorLeadByte&0xff));
+          //}
+          scratchBytes.writeByte((byte) sub.floorLeadByte);
+          assert sub.fp > fp;
+          scratchBytes.writeVLong((sub.fp - fp) << 1 | (sub.hasTerms ? 1 : 0));
+        }
+      }
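+
+      // E.g. (hypothetical values): a floor block at fp=1000 with one
+      // sub-block at fp=1400 whose entries are all terms writes the sub's
+      // floorLeadByte followed by vLong ((1400-1000) << 1) | 1 = 801.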
+
+      final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
+      final Builder<BytesRef> indexBuilder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1,
+                                                                   0, 0, true, false, Integer.MAX_VALUE,
+                                                                   outputs, null, false,
+                                                                   PackedInts.COMPACT, true, 15);
+      //if (DEBUG) {
+      //  System.out.println("  compile index for prefix=" + prefix);
+      //}
+      //indexBuilder.DEBUG = false;
+      final byte[] bytes = new byte[(int) scratchBytes.getFilePointer()];
+      assert bytes.length > 0;
+      scratchBytes.writeTo(bytes, 0);
+      indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length));
+      scratchBytes.reset();
+
+      // Copy over index for all sub-blocks
+
+      if (subIndices != null) {
+        for(FST<BytesRef> subIndex : subIndices) {
+          append(indexBuilder, subIndex);
+        }
+      }
+
+      if (floorBlocks != null) {
+        for (PendingBlock sub : floorBlocks) {
+          if (sub.subIndices != null) {
+            for(FST<BytesRef> subIndex : sub.subIndices) {
+              append(indexBuilder, subIndex);
+            }
+          }
+          sub.subIndices = null;
+        }
+      }
+
+      index = indexBuilder.finish();
+      subIndices = null;
+
+      /*
+      Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
+      Util.toDot(index, w, false, false);
+      System.out.println("SAVED to out.dot");
+      w.close();
+      */
+    }
+
+    // TODO: maybe we could add bulk-add method to
+    // Builder?  Takes FST and unions it w/ current
+    // FST.
+    private void append(Builder<BytesRef> builder, FST<BytesRef> subIndex) throws IOException {
+      final BytesRefFSTEnum<BytesRef> subIndexEnum = new BytesRefFSTEnum<BytesRef>(subIndex);
+      BytesRefFSTEnum.InputOutput<BytesRef> indexEnt;
+      while((indexEnt = subIndexEnum.next()) != null) {
+        //if (DEBUG) {
+        //  System.out.println("      add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output);
+        //}
+        builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
+      }
+    }
+  }
+
+  final RAMOutputStream scratchBytes = new RAMOutputStream();
+
+  class TermsWriter {
+    private final FieldInfo fieldInfo;
+    private final int longsSize;
+    private long numTerms;
+    final FixedBitSet docsSeen;
+    long sumTotalTermFreq;
+    long sumDocFreq;
+    long indexStartFP;
+
+    // Used only to partition terms into the block tree; we
+    // don't pull an FST from this builder:
+    private final NoOutputs noOutputs;
+    private final Builder<Object> blockBuilder;
+
+    // PendingTerm or PendingBlock:
+    private final List<PendingEntry> pending = new ArrayList<PendingEntry>();
+
+    // Index into pending of most recently written block
+    private int lastBlockIndex = -1;
+
+    // Re-used when segmenting a too-large block into floor
+    // blocks:
+    private int[] subBytes = new int[10];
+    private int[] subTermCounts = new int[10];
+    private int[] subTermCountSums = new int[10];
+    private int[] subSubCounts = new int[10];
+
+    // This class assigns terms to blocks "naturally", ie,
+    // according to the number of terms under a given prefix
+    // that we encounter:
+    private class FindBlocks extends Builder.FreezeTail<Object> {
+
+      @Override
+      public void freeze(final Builder.UnCompiledNode<Object>[] frontier, int prefixLenPlus1, final IntsRef lastInput) throws IOException {
+
+        //if (DEBUG) System.out.println("  freeze prefixLenPlus1=" + prefixLenPlus1);
+
+        for(int idx=lastInput.length; idx >= prefixLenPlus1; idx--) {
+          final Builder.UnCompiledNode<Object> node = frontier[idx];
+
+          long totCount = 0;
+
+          if (node.isFinal) {
+            totCount++;
+          }
+
+          for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
+            @SuppressWarnings("unchecked") final Builder.UnCompiledNode<Object> target = (Builder.UnCompiledNode<Object>) node.arcs[arcIdx].target;
+            totCount += target.inputCount;
+            target.clear();
+            node.arcs[arcIdx].target = null;
+          }
+          node.numArcs = 0;
+
+          if (totCount >= minItemsInBlock || idx == 0) {
+            // We are on a prefix node that has enough
+            // entries (terms or sub-blocks) under it to let
+            // us write a new block or multiple blocks (main
+            // block + follow on floor blocks):
+            //if (DEBUG) {
+            //  if (totCount < minItemsInBlock && idx != 0) {
+            //    System.out.println("  force block has terms");
+            //  }
+            //}
+            writeBlocks(lastInput, idx, (int) totCount);
+            node.inputCount = 1;
+          } else {
+            // stragglers!  carry count upwards
+            node.inputCount = totCount;
+          }
+          frontier[idx] = new Builder.UnCompiledNode<Object>(blockBuilder, idx);
+        }
+      }
+    }
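+
+    // Sketch of the policy above with hypothetical counts: if
+    // minItemsInBlock=25 and the prefix "ab" has 30 terms/sub-blocks
+    // beneath it when its node is frozen, freeze() calls writeBlocks for
+    // it with count=30, while a sibling prefix with only 10 entries just
+    // carries its count up to the "a" node.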
+
+    // Write the top count entries on the pending stack as
+    // one or more blocks.  If the entry count is <=
+    // maxItemsInBlock we just write a single block; else we
+    // break into a primary (initial) block and then one or
+    // more following floor blocks:
+
+    void writeBlocks(IntsRef prevTerm, int prefixLength, int count) throws IOException {
+      if (prefixLength == 0 || count <= maxItemsInBlock) {
+        // Easy case: not floor block.  Eg, prefix is "foo",
+        // and we found 30 terms/sub-blocks starting w/ that
+        // prefix, and minItemsInBlock <= 30 <=
+        // maxItemsInBlock.
+        final PendingBlock nonFloorBlock = writeBlock(prevTerm, prefixLength, prefixLength, count, count, 0, false, -1, true);
+        nonFloorBlock.compileIndex(null, scratchBytes);
+        pending.add(nonFloorBlock);
+      } else {
+        // Floor block case.  Eg, prefix is "foo" but we
+        // have 100 terms/sub-blocks starting w/ that
+        // prefix.  We segment the entries into a primary
+        // block and following floor blocks using the first
+        // label in the suffix to assign to floor blocks.
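+        //
+        // Hypothetical example: prefix "fo" with count=100 and
+        // minItemsInBlock=25 might emit a primary block for the first ~30
+        // entries, then floor blocks for the remaining entries, segmented
+        // greedily by the lead label after the prefix; compileIndex later
+        // records each floor block's lead byte and fp delta in the FST
+        // output for "fo".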
+
+        // TODO: we could store min & max suffix start byte
+        // in each block, to make floor blocks authoritative
+
+        //if (DEBUG) {
+        //  final BytesRef prefix = new BytesRef(prefixLength);
+        //  for(int m=0;m<prefixLength;m++) {
+        //    prefix.bytes[m] = (byte) prevTerm.ints[m];
+        //  }
+        //  prefix.length = prefixLength;
+        //  //System.out.println("\nWBS count=" + count + " prefix=" + prefix.utf8ToString() + " " + prefix);
+        //  System.out.println("writeBlocks: prefix=" + prefix + " " + prefix + " count=" + count + " pending.size()=" + pending.size());
+        //}
+        //System.out.println("\nwbs count=" + count);
+
+        final int savLabel = prevTerm.ints[prevTerm.offset + prefixLength];
+
+        // Count up how many items fall under
+        // each unique label after the prefix.
+        
+        // TODO: this is wasteful since the builder had
+        // already done this (partitioned these sub-terms
+        // according to their leading prefix byte)
+        
+        final List<PendingEntry> slice = pending.subList(pending.size()-count, pending.size());
+        int lastSuffixLeadLabel = -1;
+        int termCount = 0;
+        int subCount = 0;
+        int numSubs = 0;
+
+        for(PendingEntry ent : slice) {
+
+          // First byte in the suffix of this term
+          final int suffixLeadLabel;
+          if (ent.isTerm) {
+            PendingTerm term = (PendingTerm) ent;
+            if (term.term.length == prefixLength) {
+              // Suffix is 0, ie prefix 'foo' and term is
+              // 'foo' so the term has empty string suffix
+              // in this block
+              assert lastSuffixLeadLabel == -1;
+              assert numSubs == 0;
+              suffixLeadLabel = -1;
+            } else {
+              suffixLeadLabel = term.term.bytes[term.term.offset + prefixLength] & 0xff;
+            }
+          } else {
+            PendingBlock block = (PendingBlock) ent;
+            assert block.prefix.length > prefixLength;
+            suffixLeadLabel = block.prefix.bytes[block.prefix.offset + prefixLength] & 0xff;
+          }
+
+          if (suffixLeadLabel != lastSuffixLeadLabel && (termCount + subCount) != 0) {
+            if (subBytes.length == numSubs) {
+              subBytes = ArrayUtil.grow(subBytes);
+              subTermCounts = ArrayUtil.grow(subTermCounts);
+              subSubCounts = ArrayUtil.grow(subSubCounts);
+            }
+            subBytes[numSubs] = lastSuffixLeadLabel;
+            lastSuffixLeadLabel = suffixLeadLabel;
+            subTermCounts[numSubs] = termCount;
+            subSubCounts[numSubs] = subCount;
+            /*
+            if (suffixLeadLabel == -1) {
+              System.out.println("  sub " + -1 + " termCount=" + termCount + " subCount=" + subCount);
+            } else {
+              System.out.println("  sub " + Integer.toHexString(suffixLeadLabel) + " termCount=" + termCount + " subCount=" + subCount);
+            }
+            */
+            termCount = subCount = 0;
+            numSubs++;
+          }
+
+          if (ent.isTerm) {
+            termCount++;
+          } else {
+            subCount++;
+          }
+        }
+
+        if (subBytes.length == numSubs) {
+          subBytes = ArrayUtil.grow(subBytes);
+          subTermCounts = ArrayUtil.grow(subTermCounts);
+          subSubCounts = ArrayUtil.grow(subSubCounts);
+        }
+
+        subBytes[numSubs] = lastSuffixLeadLabel;
+        subTermCounts[numSubs] = termCount;
+        subSubCounts[numSubs] = subCount;
+        numSubs++;
+        /*
+        if (lastSuffixLeadLabel == -1) {
+          System.out.println("  sub " + -1 + " termCount=" + termCount + " subCount=" + subCount);
+        } else {
+          System.out.println("  sub " + Integer.toHexString(lastSuffixLeadLabel) + " termCount=" + termCount + " subCount=" + subCount);
+        }
+        */
+
+        if (subTermCountSums.length < numSubs) {
+          subTermCountSums = ArrayUtil.grow(subTermCountSums, numSubs);
+        }
+
+        // Roll up (backwards) the termCounts; postings impl
+        // needs this to know where to pull the term slice
+        // from its pending terms stack:
+        int sum = 0;
+        for(int idx=numSubs-1;idx>=0;idx--) {
+          sum += subTermCounts[idx];
+          subTermCountSums[idx] = sum;
+        }
+
+        // TODO: make a better segmenter?  It'd have to
+        // absorb the too-small end blocks backwards into
+        // the previous blocks
+
+        // Naive greedy segmentation; this is not always
+        // best (it can produce a too-small block as the
+        // last block):
+        int pendingCount = 0;
+        int startLabel = subBytes[0];
+        int curStart = count;
+        subCount = 0;
+
+        final List<PendingBlock> floorBlocks = new ArrayList<PendingBlock>();
+        PendingBlock firstBlock = null;
+
+        for(int sub=0;sub<numSubs;sub++) {
+          pendingCount += subTermCounts[sub] + subSubCounts[sub];
+          //System.out.println("  " + (subTermCounts[sub] + subSubCounts[sub]));
+          subCount++;
+
+          // Greedily make a floor block as soon as we've
+          // crossed the min count
+          if (pendingCount >= minItemsInBlock) {
+            final int curPrefixLength;
+            if (startLabel == -1) {
+              curPrefixLength = prefixLength;
+            } else {
+              curPrefixLength = 1+prefixLength;
+              // floor term:
+              prevTerm.ints[prevTerm.offset + prefixLength] = startLabel;
+            }
+            //System.out.println("  " + subCount + " subs");
+            final PendingBlock floorBlock = writeBlock(prevTerm, prefixLength, curPrefixLength, curStart, pendingCount, subTermCountSums[1+sub], true, startLabel, curStart == pendingCount);
+            if (firstBlock == null) {
+              firstBlock = floorBlock;
+            } else {
+              floorBlocks.add(floorBlock);
+            }
+            curStart -= pendingCount;
+            //System.out.println("    = " + pendingCount);
+            pendingCount = 0;
+
+            assert minItemsInBlock == 1 || subCount > 1: "minItemsInBlock=" + minItemsInBlock + " subCount=" + subCount + " sub=" + sub + " of " + numSubs + " subTermCount=" + subTermCountSums[sub] + " subSubCount=" + subSubCounts[sub] + " depth=" + prefixLength;
+            subCount = 0;
+            startLabel = subBytes[sub+1];
+
+            if (curStart == 0) {
+              break;
+            }
+
+            if (curStart <= maxItemsInBlock) {
+              // remainder is small enough to fit into a
+              // block.  NOTE that this may be too small (<
+              // minItemsInBlock); need a true segmenter
+              // here
+              assert startLabel != -1;
+              assert firstBlock != null;
+              prevTerm.ints[prevTerm.offset + prefixLength] = startLabel;
+              //System.out.println("  final " + (numSubs-sub-1) + " subs");
+              /*
+              for(sub++;sub < numSubs;sub++) {
+                System.out.println("  " + (subTermCounts[sub] + subSubCounts[sub]));
+              }
+              System.out.println("    = " + curStart);
+              if (curStart < minItemsInBlock) {
+                System.out.println("      **");
+              }
+              */
+              floorBlocks.add(writeBlock(prevTerm, prefixLength, prefixLength+1, curStart, curStart, 0, true, startLabel, true));
+              break;
+            }
+          }
+        }
+
+        prevTerm.ints[prevTerm.offset + prefixLength] = savLabel;
+
+        assert firstBlock != null;
+        firstBlock.compileIndex(floorBlocks, scratchBytes);
+
+        pending.add(firstBlock);
+        //if (DEBUG) System.out.println("  done pending.size()=" + pending.size());
+      }
+      lastBlockIndex = pending.size()-1;
+    }
+
+    // for debugging
+    @SuppressWarnings("unused")
+    private String toString(BytesRef b) {
+      try {
+        return b.utf8ToString() + " " + b;
+      } catch (Throwable t) {
+        // If BytesRef isn't actually UTF8, or it's eg a
+        // prefix of UTF8 that ends mid-unicode-char, we
+        // fallback to hex:
+        return b.toString();
+      }
+    }
+
+    // Writes all entries in the pending slice as a single
+    // block: 
+    private PendingBlock writeBlock(IntsRef prevTerm, int prefixLength, int indexPrefixLength, int startBackwards, int length,
+                                    int futureTermCount, boolean isFloor, int floorLeadByte, boolean isLastInFloor) throws IOException {
+
+      assert length > 0;
+
+      final int start = pending.size()-startBackwards;
+
+      assert start >= 0: "pending.size()=" + pending.size() + " startBackwards=" + startBackwards + " length=" + length;
+
+      final List<PendingEntry> slice = pending.subList(start, start + length);
+
+      final long startFP = out.getFilePointer();
+
+      final BytesRef prefix = new BytesRef(indexPrefixLength);
+      for(int m=0;m<indexPrefixLength;m++) {
+        prefix.bytes[m] = (byte) prevTerm.ints[m];
+      }
+      prefix.length = indexPrefixLength;
+
+      // Write block header:
+      out.writeVInt((length<<1)|(isLastInFloor ? 1:0));
+
+      // if (DEBUG) {
+      //   System.out.println("  writeBlock " + (isFloor ? "(floor) " : "") + "seg=" + segment + " pending.size()=" + pending.size() + " prefixLength=" + prefixLength + " indexPrefix=" + toString(prefix) + " entCount=" + length + " startFP=" + startFP + " futureTermCount=" + futureTermCount + (isFloor ? (" floorLeadByte=" + Integer.toHexString(floorLeadByte&0xff)) : "") + " isLastInFloor=" + isLastInFloor);
+      // }
+
+      // 1st pass: pack term suffix bytes into byte[] blob
+      // TODO: cutover to bulk int codec... simple64?
+
+      final boolean isLeafBlock;
+      if (lastBlockIndex < start) {
+        // This block definitely does not contain sub-blocks:
+        isLeafBlock = true;
+        //System.out.println("no scan true isFloor=" + isFloor);
+      } else if (!isFloor) {
+        // This block definitely does contain at least one sub-block:
+        isLeafBlock = false;
+        //System.out.println("no scan false " + lastBlockIndex + " vs start=" + start + " len=" + length);
+      } else {
+        // Must scan up-front to see if there is a sub-block
+        boolean v = true;
+        //System.out.println("scan " + lastBlockIndex + " vs start=" + start + " len=" + length);
+        for (PendingEntry ent : slice) {
+          if (!ent.isTerm) {
+            v = false;
+            break;
+          }
+        }
+        isLeafBlock = v;
+      }
+
+      final List<FST<BytesRef>> subIndices;
+
+      int termCount;
+
+      long[] longs = new long[longsSize];
+      boolean absolute = true;
+
+      if (isLeafBlock) {
+        subIndices = null;
+        for (PendingEntry ent : slice) {
+          assert ent.isTerm;
+          PendingTerm term = (PendingTerm) ent;
+          BlockTermState state = term.state;
+          final int suffix = term.term.length - prefixLength;
+          // if (DEBUG) {
+          //   BytesRef suffixBytes = new BytesRef(suffix);
+          //   System.arraycopy(term.term.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
+          //   suffixBytes.length = suffix;
+          //   System.out.println("    write term suffix=" + suffixBytes);
+          // }
+          // For leaf block we write suffix straight
+          suffixWriter.writeVInt(suffix);
+          suffixWriter.writeBytes(term.term.bytes, prefixLength, suffix);
+
+          // Write term stats, to separate byte[] blob:
+          statsWriter.writeVInt(state.docFreq);
+          if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
+            assert state.totalTermFreq >= state.docFreq: state.totalTermFreq + " vs " + state.docFreq;
+            statsWriter.writeVLong(state.totalTermFreq - state.docFreq);
+          }
+
+          // Write term meta data
+          postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute);
+          for (int pos = 0; pos < longsSize; pos++) {
+            assert longs[pos] >= 0;
+            metaWriter.writeVLong(longs[pos]);
+          }
+          bytesWriter.writeTo(metaWriter);
+          bytesWriter.reset();
+          absolute = false;
+        }
+        termCount = length;
+      } else {
+        subIndices = new ArrayList<FST<BytesRef>>();
+        termCount = 0;
+        for (PendingEntry ent : slice) {
+          if (ent.isTerm) {
+            PendingTerm term = (PendingTerm) ent;
+            BlockTermState state = term.state;
+            final int suffix = term.term.length - prefixLength;
+            // if (DEBUG) {
+            //   BytesRef suffixBytes = new BytesRef(suffix);
+            //   System.arraycopy(term.term.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
+            //   suffixBytes.length = suffix;
+            //   System.out.println("    write term suffix=" + suffixBytes);
+            // }
+            // For non-leaf block we borrow 1 bit to record
+            // if entry is term or sub-block
+            suffixWriter.writeVInt(suffix<<1);
+            suffixWriter.writeBytes(term.term.bytes, prefixLength, suffix);
+
+            // Write term stats, to separate byte[] blob:
+            statsWriter.writeVInt(state.docFreq);
+            if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
+              assert state.totalTermFreq >= state.docFreq;
+              statsWriter.writeVLong(state.totalTermFreq - state.docFreq);
+            }
+
+            // TODO: now that terms dict "sees" these longs,
+            // we can explore better column-stride encodings
+            // to encode all long[0]s for this block at
+            // once, all long[1]s, etc., e.g. using
+            // Simple64.  Alternatively, we could interleave
+            // stats + meta ... no reason to have them
+            // separate anymore:
+
+            // Write term meta data
+            postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute);
+            for (int pos = 0; pos < longsSize; pos++) {
+              assert longs[pos] >= 0;
+              metaWriter.writeVLong(longs[pos]);
+            }
+            bytesWriter.writeTo(metaWriter);
+            bytesWriter.reset();
+            absolute = false;
+
+            termCount++;
+          } else {
+            PendingBlock block = (PendingBlock) ent;
+            final int suffix = block.prefix.length - prefixLength;
+
+            assert suffix > 0;
+
+            // For non-leaf block we borrow 1 bit to record
+            // if entry is term or sub-block
+            suffixWriter.writeVInt((suffix<<1)|1);
+            suffixWriter.writeBytes(block.prefix.bytes, prefixLength, suffix);
+            assert block.fp < startFP;
+
+            // if (DEBUG) {
+            //   BytesRef suffixBytes = new BytesRef(suffix);
+            //   System.arraycopy(block.prefix.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
+            //   suffixBytes.length = suffix;
+            //   System.out.println("    write sub-block suffix=" + toString(suffixBytes) + " subFP=" + block.fp + " subCode=" + (startFP-block.fp) + " floor=" + block.isFloor);
+            // }
+
+            suffixWriter.writeVLong(startFP - block.fp);
+            subIndices.add(block.index);
+          }
+        }
+
+        assert subIndices.size() != 0;
+      }
+
+      // TODO: we could block-write the term suffix pointers;
+      // this would take more space but would enable binary
+      // search on lookup
+
+      // Write suffixes byte[] blob to terms dict output:
+      out.writeVInt((int) (suffixWriter.getFilePointer() << 1) | (isLeafBlock ? 1:0));
+      suffixWriter.writeTo(out);
+      suffixWriter.reset();
+
+      // Write term stats byte[] blob
+      out.writeVInt((int) statsWriter.getFilePointer());
+      statsWriter.writeTo(out);
+      statsWriter.reset();
+
+      // Write term meta data byte[] blob
+      out.writeVInt((int) metaWriter.getFilePointer());
+      metaWriter.writeTo(out);
+      metaWriter.reset();
+
+      // Remove slice replaced by block:
+      slice.clear();
+
+      if (lastBlockIndex >= start) {
+        if (lastBlockIndex < start+length) {
+          lastBlockIndex = start;
+        } else {
+          lastBlockIndex -= length;
+        }
+      }
+
+      // if (DEBUG) {
+      //   System.out.println("      fpEnd=" + out.getFilePointer());
+      // }
+
+      return new PendingBlock(prefix, startFP, termCount != 0, isFloor, floorLeadByte, subIndices);
+    }
+
+    TermsWriter(FieldInfo fieldInfo) {
+      this.fieldInfo = fieldInfo;
+      docsSeen = new FixedBitSet(maxDoc);
+
+      noOutputs = NoOutputs.getSingleton();
+
+      // This Builder is just used transiently to fragment
+      // terms into "good" blocks; we don't save the
+      // resulting FST:
+      blockBuilder = new Builder<Object>(FST.INPUT_TYPE.BYTE1,
+                                         0, 0, true,
+                                         true, Integer.MAX_VALUE,
+                                         noOutputs,
+                                         new FindBlocks(), false,
+                                         PackedInts.COMPACT,
+                                         true, 15);
+
+      this.longsSize = postingsWriter.setField(fieldInfo);
+    }
+    
+    private final IntsRef scratchIntsRef = new IntsRef();
+
+    /** Writes one term's worth of postings. */
+    public void write(BytesRef text, TermsEnum termsEnum) throws IOException {
+
+      BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen);
+      if (state != null) {
+        assert state.docFreq != 0;
+        assert fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY || state.totalTermFreq >= state.docFreq: "postingsWriter=" + postingsWriter;
+        sumDocFreq += state.docFreq;
+        sumTotalTermFreq += state.totalTermFreq;
+        blockBuilder.add(Util.toIntsRef(text, scratchIntsRef), noOutputs.getNoOutput());
+
+        PendingTerm term = new PendingTerm(BytesRef.deepCopyOf(text), state);
+        pending.add(term);
+        numTerms++;
+      }
+    }
+
+    // Finishes all terms in this field
+    public void finish() throws IOException {
+      if (numTerms > 0) {
+        blockBuilder.finish();
+
+        // We better have one final "root" block:
+        assert pending.size() == 1 && !pending.get(0).isTerm: "pending.size()=" + pending.size() + " pending=" + pending;
+        final PendingBlock root = (PendingBlock) pending.get(0);
+        assert root.prefix.length == 0;
+        assert root.index.getEmptyOutput() != null;
+
+        // Write FST to index
+        indexStartFP = indexOut.getFilePointer();
+        root.index.save(indexOut);
+        //System.out.println("  write FST " + indexStartFP + " field=" + fieldInfo.name);
+
+        // if (SAVE_DOT_FILES || DEBUG) {
+        //   final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
+        //   Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
+        //   Util.toDot(root.index, w, false, false);
+        //   System.out.println("SAVED to " + dotFileName);
+        //   w.close();
+        // }
+
+        fields.add(new FieldMetaData(fieldInfo,
+                                     ((PendingBlock) pending.get(0)).index.getEmptyOutput(),
+                                     numTerms,
+                                     indexStartFP,
+                                     sumTotalTermFreq,
+                                     sumDocFreq,
+                                     docsSeen.cardinality(),
+                                     longsSize));
+      } else {
+        assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY && sumTotalTermFreq == -1;
+        assert sumDocFreq == 0;
+        assert docsSeen.cardinality() == 0;
+      }
+    }
+
+    private final RAMOutputStream suffixWriter = new RAMOutputStream();
+    private final RAMOutputStream statsWriter = new RAMOutputStream();
+    private final RAMOutputStream metaWriter = new RAMOutputStream();
+    private final RAMOutputStream bytesWriter = new RAMOutputStream();
+  }
+
+  @Override
+  public void close() throws IOException {
+
+    IOException ioe = null;
+    try {
+      
+      final long dirStart = out.getFilePointer();
+      final long indexDirStart = indexOut.getFilePointer();
+
+      out.writeVInt(fields.size());
+      
+      for(FieldMetaData field : fields) {
+        //System.out.println("  field " + field.fieldInfo.name + " " + field.numTerms + " terms");
+        out.writeVInt(field.fieldInfo.number);
+        out.writeVLong(field.numTerms);
+        out.writeVInt(field.rootCode.length);
+        out.writeBytes(field.rootCode.bytes, field.rootCode.offset, field.rootCode.length);
+        if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
+          out.writeVLong(field.sumTotalTermFreq);
+        }
+        out.writeVLong(field.sumDocFreq);
+        out.writeVInt(field.docCount);
+        if (TERMS_VERSION_CURRENT >= TERMS_VERSION_META_ARRAY) {
+          out.writeVInt(field.longsSize);
+        }
+        indexOut.writeVLong(field.indexStartFP);
+      }
+      writeTrailer(out, dirStart);
+      writeIndexTrailer(indexOut, indexDirStart);
+    } catch (IOException ioe2) {
+      ioe = ioe2;
+    } finally {
+      IOUtils.closeWhileHandlingException(ioe, out, indexOut, postingsWriter);
+    }
+  }
+}

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java?rev=1531949&r1=1531948&r2=1531949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java Mon Oct 14 15:55:57 2013
@@ -48,16 +48,12 @@ public abstract class FieldsConsumer {
   /** Write all fields, terms and postings.  This is the "pull"
    *  API, allowing you to iterate more than once over the
    *  postings, somewhat analogous to using a DOM API to
-   *  traverse an XML tree.  Alternatively, if you subclass
-   *  {@link PushFieldsConsumer}, then all postings are
-   *  pushed in a single pass, somewhat analogous to using a
-   *  SAX API to traverse an XML tree.
+   *  traverse an XML tree.
    *
-   *  <p>This API is has certain restrictions vs {@link
-   *  PushFieldsConsumer}:
+   *  <p><b>Notes</b>:
    *
    *  <ul>
-   *    <li> You must compute index statistics yourself,
+   *    <li> You must compute index statistics,
    *         including each Term's docFreq and totalTermFreq,
    *         as well as the summary sumTotalTermFreq,
   *         sumDocFreq and docCount.

Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsBaseFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsBaseFormat.java?rev=1531949&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsBaseFormat.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsBaseFormat.java Mon Oct 14 15:55:57 2013
@@ -0,0 +1,55 @@
+package org.apache.lucene.codecs;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.SegmentReadState;
+
+/** 
+ * Provides a {@link PostingsReaderBase} and {@link
+ * PostingsWriterBase}.
+ *
+ * @lucene.experimental */
+
+// TODO: find a better name; this defines the API that the
+// terms dict impls use to talk to a postings impl.
+// TermsDict + PostingsReader/WriterBase == PostingsConsumer/Producer
+
+// can we clean this up and do this some other way? 
+// refactor some of these classes and use covariant return?
+public abstract class PostingsBaseFormat {
+
+  /** Unique name that's used to retrieve this codec when
+   *  reading the index */
+  public final String name;
+  
+  /** Sole constructor. */
+  protected PostingsBaseFormat(String name) {
+    this.name = name;
+  }
+
+  /** Creates the {@link PostingsReaderBase} for this
+   *  format. */
+  public abstract PostingsReaderBase postingsReaderBase(SegmentReadState state) throws IOException;
+
+  /** Creates the {@link PostingsWriterBase} for this
+   *  format. */
+  public abstract PostingsWriterBase postingsWriterBase(SegmentWriteState state) throws IOException;
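+
+  // Typical wiring (a sketch, not mandated by this API): a PostingsFormat's
+  // fieldsConsumer passes postingsWriterBase(state) to a terms dictionary
+  // such as BlockTreeTermsWriter, and its fieldsProducer does the mirror
+  // image with postingsReaderBase(state).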
+}

Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java?rev=1531949&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java Mon Oct 14 15:55:57 2013
@@ -0,0 +1,89 @@
+package org.apache.lucene.codecs;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Closeable;
+import java.io.IOException;
+
+import org.apache.lucene.index.DocsAndPositionsEnum; // javadocs
+import org.apache.lucene.index.DocsEnum; // javadocs
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+
+/**
+ * Class that plugs into term dictionaries, such as {@link
+ * BlockTreeTermsWriter}, and handles writing postings.
+ * 
+ * @see PostingsReaderBase
+ * @lucene.experimental
+ */
+// TODO: find a better name; this defines the API that the
+// terms dict impls use to talk to a postings impl.
+// TermsDict + PostingsReader/WriterBase == FieldsProducer/Consumer
+public abstract class PostingsWriterBase implements Closeable {
+
+  /** Sole constructor. (For invocation by subclass 
+   *  constructors, typically implicit.) */
+  protected PostingsWriterBase() {
+  }
+
+  /** Called once after startup, before any terms have been
+   *  added.  Implementations typically write a header to
+   *  the provided {@code termsOut}. */
+  public abstract void init(IndexOutput termsOut) throws IOException;
+
+  /** Write all postings for one term; use the provided
+   *  {@link TermsEnum} to pull a {@link DocsEnum} or {@link
+   *  DocsAndPositionsEnum}.  This method should not
+   *  re-position the {@code TermsEnum}!  It is already
+   *  positioned on the term that should be written.  This
+   *  method must set the bit in the provided {@link
+   *  FixedBitSet} for every docID written.  If no docs
+   *  were written, this method should return null, and the
+   *  terms dict will skip the term. */
+  public abstract BlockTermState writeTerm(BytesRef term, TermsEnum termsEnum, FixedBitSet docsSeen) throws IOException;
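+
+  // A writeTerm sketch (an assumption, not part of this commit), using the
+  // 4.x enum APIs and a hypothetical `state` built by the implementation:
+  //
+  //   DocsEnum docsEnum = termsEnum.docs(null, null, DocsEnum.FLAG_FREQS);
+  //   int doc, docFreq = 0;
+  //   while ((doc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
+  //     docsSeen.set(doc);   // record every docID written
+  //     docFreq++;           // ... encode postings for this doc ...
+  //   }
+  //   return docFreq == 0 ? null : state;  // null => terms dict skips term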
+
+  /**
+   * Encode metadata as long[] and byte[]. {@code absolute} controls whether 
+   * current term is delta encoded according to latest term. 
+   * Usually elements in {@code longs} are file pointers, so each one always 
+   * increases when a new term is consumed. {@code out} is used to write generic
+   * bytes, which are not monotonic.
+   *
+   * NOTE: sometimes the long[] might contain "don't care" values that are unused, e.g.
+   * the pointer to the postings list may not be defined for some terms but defined
+   * for others, if the format is designed to inline some postings data in the term
+   * dictionary. In this case, the postings writer should always use the last value,
+   * so that each element in the metadata long[] remains monotonic.
+   */
+  public abstract void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException;
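+
+  // Example with hypothetical values: if term A's postings start at file
+  // pointer 100 and term B's at 160, an implementation could emit
+  // longs[0]=100 for A (absolute=true) and longs[0]=60 for B
+  // (absolute=false), since the reader re-bases each delta on the
+  // previous term's decoded value.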
+
+  /** 
+   * Sets the current field for writing, and returns the
+   * fixed length of the long[] metadata (which is fixed per
+   * field); called whenever writing switches to another field. */
+  // TODO: better name?
+  public abstract int setField(FieldInfo fieldInfo);
+
+  @Override
+  public abstract void close() throws IOException;
+}

Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java?rev=1531949&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java Mon Oct 14 15:55:57 2013
@@ -0,0 +1,219 @@
+package org.apache.lucene.codecs;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+
+/**
+ * Extension of {@link PostingsWriterBase}, adding a push
+ * API for writing each element of the postings.  This API
+ * is somewhat analogous to an XML SAX API, while {@link
+ * PostingsWriterBase} is more like an XML DOM API.
+ * 
+ * @see PostingsReaderBase
+ * @lucene.experimental
+ */
+// TODO: find a better name; this defines the API that the
+// terms dict impls use to talk to a postings impl.
+// TermsDict + PostingsReader/WriterBase == PostingsConsumer/Producer
+public abstract class PushPostingsWriterBase extends PostingsWriterBase {
+
+  // Reused in writeTerm
+  private DocsEnum docsEnum;
+  private DocsAndPositionsEnum posEnum;
+  private int enumFlags;
+
+  /** {@link FieldInfo} of the current field being written. */
+  protected FieldInfo fieldInfo;
+
+  /** {@link IndexOptions} of the current field being
+      written. */
+  protected IndexOptions indexOptions;
+
+  /** True if the current field writes freqs. */
+  protected boolean writeFreqs;
+
+  /** True if the current field writes positions. */
+  protected boolean writePositions;
+
+  /** True if the current field writes payloads. */
+  protected boolean writePayloads;
+
+  /** True if the current field writes offsets. */
+  protected boolean writeOffsets;
+
+  /** Sole constructor. (For invocation by subclass 
+   *  constructors, typically implicit.) */
+  protected PushPostingsWriterBase() {
+  }
+
+  /** Called once after startup, before any terms have been
+   *  added.  Implementations typically write a header to
+   *  the provided {@code termsOut}. */
+  public abstract void init(IndexOutput termsOut) throws IOException;
+
+  /** Returns a newly created, empty {@link BlockTermState}. */
+  public abstract BlockTermState newTermState() throws IOException;
+
+  /** Starts a new term.  A matching call to {@link
+   *  #finishTerm(BlockTermState)} is made only if the term
+   *  has at least one document. */
+  public abstract void startTerm() throws IOException;
+
+  /** Finishes the current term.  The provided {@link
+   *  BlockTermState} contains the term's summary statistics,
+   *  and will hold the postings writer's metadata when this
+   *  method returns. */
+  public abstract void finishTerm(BlockTermState state) throws IOException;
+
+  /**
+   * Encodes metadata as long[] and byte[]. {@code absolute} controls whether
+   * the current term is delta encoded relative to the previous term.
+   * Usually the elements in {@code longs} are file pointers, so each one
+   * only increases as new terms are consumed. {@code out} is used to write
+   * generic bytes, which need not be monotonic.
+   *
+   * NOTE: sometimes the long[] might contain "don't care" values that are unused,
+   * e.g. the pointer to the postings list may be defined for some terms but not
+   * for others, if the format inlines some postings data into the term dictionary.
+   * In this case, the postings writer should carry over the last defined value, so
+   * that each element in the metadata long[] remains monotonic.
+   */
+  public abstract void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException;
+
+  /** 
+   * Sets the current field for writing, and returns the
+   * length of the long[] metadata (which is fixed per
+   * field).  Called whenever writing switches to another field. */
+  // TODO: better name?
+  @Override
+  public int setField(FieldInfo fieldInfo) {
+    this.fieldInfo = fieldInfo;
+    indexOptions = fieldInfo.getIndexOptions();
+
+    writeFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+    writePositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+    writeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;        
+    writePayloads = fieldInfo.hasPayloads();
+
+    if (writeFreqs == false) {
+      enumFlags = 0;
+    } else if (writePositions == false) {
+      enumFlags = DocsEnum.FLAG_FREQS;
+    } else if (writeOffsets == false) {
+      if (writePayloads) {
+        enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS;
+      } else {
+        enumFlags = 0;
+      }
+    } else {
+      if (writePayloads) {
+        enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS | DocsAndPositionsEnum.FLAG_OFFSETS;
+      } else {
+        enumFlags = DocsAndPositionsEnum.FLAG_OFFSETS;
+      }
+    }
+
+    return 0;
+  }
+
+  @Override
+  public final BlockTermState writeTerm(BytesRef term, TermsEnum termsEnum, FixedBitSet docsSeen) throws IOException {
+    startTerm();
+    if (writePositions == false) {
+      docsEnum = termsEnum.docs(null, docsEnum, enumFlags);
+    } else {
+      posEnum = termsEnum.docsAndPositions(null, posEnum, enumFlags);
+      docsEnum = posEnum;
+    }
+    assert docsEnum != null;
+
+    int docFreq = 0;
+    long totalTermFreq = 0;
+    while (true) {
+      int docID = docsEnum.nextDoc();
+      if (docID == DocsEnum.NO_MORE_DOCS) {
+        break;
+      }
+      docFreq++;
+      docsSeen.set(docID);
+      int freq;
+      if (writeFreqs) {
+        freq = docsEnum.freq();
+        totalTermFreq += freq;
+      } else {
+        freq = -1;
+      }
+      startDoc(docID, freq);
+
+      if (writePositions) {
+        for (int i = 0; i < freq; i++) {
+          int pos = posEnum.nextPosition();
+          BytesRef payload = writePayloads ? posEnum.getPayload() : null;
+          int startOffset;
+          int endOffset;
+          if (writeOffsets) {
+            startOffset = posEnum.startOffset();
+            endOffset = posEnum.endOffset();
+          } else {
+            startOffset = -1;
+            endOffset = -1;
+          }
+          addPosition(pos, payload, startOffset, endOffset);
+        }
+      }
+
+      finishDoc();
+    }
+
+    if (docFreq == 0) {
+      return null;
+    } else {
+      BlockTermState state = newTermState();
+      state.docFreq = docFreq;
+      state.totalTermFreq = writeFreqs ? totalTermFreq : -1;
+      finishTerm(state);
+      return state;
+    }
+  }
+
+  /** Adds a new doc to this term.
+   * <code>freq</code> will be -1 when term frequencies are omitted
+   * for the field. */
+  public abstract void startDoc(int docID, int freq) throws IOException;
+
+  /** Adds a new position & payload, and start/end offsets.  A
+   *  null payload means no payload; a non-null payload with
+   *  zero length also means no payload.  Caller may reuse
+   *  the {@link BytesRef} for the payload between calls
+   *  (method must fully consume the payload). <code>startOffset</code>
+   *  and <code>endOffset</code> will be -1 when offsets are not indexed. */
+  public abstract void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException;
+
+  /** Called after all positions & payloads for the
+   *  current doc have been added. */
+  public abstract void finishDoc() throws IOException;
+}
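
As a concrete picture of the push API: writeTerm above drives the callbacks
in the order startTerm, then per document startDoc, addPosition (freq times,
when positions are indexed) and finishDoc, and finally finishTerm.  The
do-nothing subclass sketched below (SketchPushWriter and SketchTermState are
made-up names, not part of this commit) shows the full set of methods an
implementation must provide:

    import java.io.IOException;

    import org.apache.lucene.codecs.BlockTermState;
    import org.apache.lucene.codecs.PushPostingsWriterBase;
    import org.apache.lucene.index.FieldInfo;
    import org.apache.lucene.store.DataOutput;
    import org.apache.lucene.store.IndexOutput;
    import org.apache.lucene.util.BytesRef;

    // Sketch only: the smallest PushPostingsWriterBase subclass that
    // compiles, tracing the callback contract without writing anything.
    final class SketchPushWriter extends PushPostingsWriterBase {

      // Per-term metadata holder; real impls add fields (file
      // pointers, skip offsets, ...) to their BlockTermState subclass.
      private static final class SketchTermState extends BlockTermState {
      }

      @Override
      public void init(IndexOutput termsOut) throws IOException {
        // A real implementation writes its codec header to termsOut.
      }

      @Override
      public BlockTermState newTermState() {
        return new SketchTermState();
      }

      @Override
      public void startTerm() {
        // Reset per-term state; called once before each term's docs.
      }

      @Override
      public void startDoc(int docID, int freq) {
        // freq is -1 when the field omits term frequencies.
      }

      @Override
      public void addPosition(int position, BytesRef payload,
                              int startOffset, int endOffset) {
        // Called freq times per doc when positions are indexed;
        // startOffset/endOffset are -1 when offsets are not indexed.
      }

      @Override
      public void finishDoc() {
        // All positions for the current doc have been pushed.
      }

      @Override
      public void finishTerm(BlockTermState state) {
        // writeTerm has already filled in state.docFreq and
        // state.totalTermFreq by the time this is called.
      }

      @Override
      public void encodeTerm(long[] longs, DataOutput out,
                             FieldInfo fieldInfo, BlockTermState state,
                             boolean absolute) {
        // No metadata: the inherited setField reported zero longs.
      }

      @Override
      public void close() {
      }
    }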

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java?rev=1531949&r1=1531948&r2=1531949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java Mon Oct 14 15:55:57 2013
@@ -26,7 +26,7 @@ import org.apache.lucene.codecs.FieldsCo
 import org.apache.lucene.codecs.FieldsProducer;
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.PostingsReaderBase;
-import org.apache.lucene.codecs.PostingsWriterBase;
+import org.apache.lucene.codecs.PostingsWriterBase; // javadocs
 import org.apache.lucene.index.DocsEnum; // javadocs
 import org.apache.lucene.index.FieldInfo.IndexOptions; // javadocs
 import org.apache.lucene.index.FieldInfos; // javadocs

Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsBaseFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsBaseFormat.java?rev=1531949&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsBaseFormat.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsBaseFormat.java Mon Oct 14 15:55:57 2013
@@ -0,0 +1,51 @@
+package org.apache.lucene.codecs.lucene41;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.PostingsBaseFormat;
+import org.apache.lucene.codecs.PostingsReaderBase;
+import org.apache.lucene.codecs.PostingsWriterBase;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+
+/** 
+ * Provides a {@link PostingsReaderBase} and {@link
+ * PostingsWriterBase}.
+ *
+ * @lucene.experimental */
+
+// TODO: should these also be named / looked up via SPI?
+public final class Lucene41PostingsBaseFormat extends PostingsBaseFormat {
+
+  /** Sole constructor. */
+  public Lucene41PostingsBaseFormat() {
+    super("Lucene41");
+  }
+
+  @Override
+  public PostingsReaderBase postingsReaderBase(SegmentReadState state) throws IOException {
+    return new Lucene41PostingsReader(state.directory, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix);
+  }
+
+  @Override
+  public PostingsWriterBase postingsWriterBase(SegmentWriteState state) throws IOException {
+    return new Lucene41PostingsWriter(state);
+  }
+}
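
For context, a postings format would typically pair this base format with
the block-tree terms dictionary along the following lines.  This is a
sketch (SketchTermsDictFactory is a made-up name, not part of this commit),
and it omits the cleanup a real format performs, e.g. Lucene41PostingsFormat
closes the postings writer via IOUtils.closeWhileHandlingException if the
BlockTreeTermsWriter constructor throws:

    import java.io.IOException;

    import org.apache.lucene.codecs.BlockTreeTermsWriter;
    import org.apache.lucene.codecs.FieldsConsumer;
    import org.apache.lucene.codecs.PostingsBaseFormat;
    import org.apache.lucene.codecs.PostingsWriterBase;
    import org.apache.lucene.codecs.lucene41.Lucene41PostingsBaseFormat;
    import org.apache.lucene.index.SegmentWriteState;

    // Sketch only: wiring the Lucene41 postings writer into the
    // block-tree terms dictionary via the base format.
    final class SketchTermsDictFactory {

      static FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
        PostingsBaseFormat base = new Lucene41PostingsBaseFormat();
        PostingsWriterBase postingsWriter = base.postingsWriterBase(state);
        // The terms dict owns the postings writer from here on and
        // closes it when the returned consumer is closed.
        return new BlockTreeTermsWriter(state,
                                        postingsWriter,
                                        BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
                                        BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
      }
    }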