Posted to commits@lucene.apache.org by mi...@apache.org on 2013/10/14 17:55:58 UTC
svn commit: r1531949 [2/3] - in /lucene/dev/trunk/lucene: ./
codecs/src/java/org/apache/lucene/codecs/blockterms/
codecs/src/java/org/apache/lucene/codecs/bloom/
codecs/src/java/org/apache/lucene/codecs/memory/
codecs/src/java/org/apache/lucene/codecs/...
Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java?rev=1531949&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java Mon Oct 14 15:55:57 2013
@@ -0,0 +1,1155 @@
+package org.apache.lucene.codecs;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.ByteSequenceOutputs;
+import org.apache.lucene.util.fst.BytesRefFSTEnum;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.NoOutputs;
+import org.apache.lucene.util.fst.Util;
+import org.apache.lucene.util.packed.PackedInts;
+
+/*
+ TODO:
+
+ - Currently there is a one-to-one mapping of indexed
+ term to term block, but we could decouple the two, ie,
+ put more terms into the index than there are blocks.
+ The index would take up more RAM but then it'd be able
+ to avoid seeking more often and could make PK/FuzzyQ
+ faster if the additional indexed terms could store
+ the offset into the terms block.
+
+ - The blocks are not written in true depth-first
+ order, meaning if you just next() the file pointer will
+ sometimes jump backwards. For example, block foo* will
+ be written before block f* because it finished before.
+ This could possibly hurt performance if the terms dict is
+ not hot, since OSs anticipate sequential file access. We
+ could fix the writer to re-order the blocks as a 2nd
+ pass.
+
+ - Each block encodes the term suffixes packed
+ sequentially using a separate vInt per term, which is
+ 1) wasteful and 2) slow (must linear scan to find a
+ particular suffix). We should instead 1) make
+ random-access array so we can directly access the Nth
+ suffix, and 2) bulk-encode this array using bulk int[]
+ codecs; then at search time we can binary search when
+ we seek a particular term.
+*/
+
+/**
+ * Block-based terms index and dictionary writer.
+ * <p>
+ * Writes terms dict and index, block-encoding (column
+ * stride) each term's metadata for each set of terms
+ * between two index terms.
+ * <p>
+ * Files:
+ * <ul>
+ * <li><tt>.tim</tt>: <a href="#Termdictionary">Term Dictionary</a></li>
+ * <li><tt>.tip</tt>: <a href="#Termindex">Term Index</a></li>
+ * </ul>
+ * <p>
+ * <a name="Termdictionary" id="Termdictionary"></a>
+ * <h3>Term Dictionary</h3>
+ *
+ * <p>The .tim file contains the list of terms in each
+ * field along with per-term statistics (such as docfreq)
+ * and per-term metadata (typically pointers to the postings list
+ * for that term in the inverted index).
+ * </p>
+ *
+ * <p>The .tim file is arranged in blocks, with each block containing
+ * a variable number of entries (by default 25-48); each
+ * entry is either a term or a reference to a
+ * sub-block.</p>
+ *
+ * <p>NOTE: The term dictionary can plug into different postings implementations:
+ * the postings writer/reader are actually responsible for encoding
+ * and decoding the Postings Metadata and Term Metadata sections.</p>
+ *
+ * <ul>
+ * <li>TermsDict (.tim) --> Header, <i>PostingsHeader</i>, NodeBlock<sup>NumBlocks</sup>,
+ * FieldSummary, DirOffset</li>
+ * <li>NodeBlock --> (OuterNode | InnerNode)</li>
+ * <li>OuterNode --> EntryCount, SuffixLength, Byte<sup>SuffixLength</sup>, StatsLength, < TermStats ><sup>EntryCount</sup>, MetaLength, <<i>TermMetadata</i>><sup>EntryCount</sup></li>
+ * <li>InnerNode --> EntryCount, SuffixLength[,Sub?], Byte<sup>SuffixLength</sup>, StatsLength, < TermStats ? ><sup>EntryCount</sup>, MetaLength, <<i>TermMetadata ? </i>><sup>EntryCount</sup></li>
+ * <li>TermStats --> DocFreq, TotalTermFreq </li>
+ * <li>FieldSummary --> NumFields, <FieldNumber, NumTerms, RootCodeLength, Byte<sup>RootCodeLength</sup>,
+ * SumTotalTermFreq?, SumDocFreq, DocCount><sup>NumFields</sup></li>
+ * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
+ * <li>DirOffset --> {@link DataOutput#writeLong Uint64}</li>
+ * <li>EntryCount,SuffixLength,StatsLength,DocFreq,MetaLength,NumFields,
+ * FieldNumber,RootCodeLength,DocCount --> {@link DataOutput#writeVInt VInt}</li>
+ * <li>TotalTermFreq,NumTerms,SumTotalTermFreq,SumDocFreq -->
+ * {@link DataOutput#writeVLong VLong}</li>
+ * </ul>
+ * <p>Notes:</p>
+ * <ul>
+ * <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
+ * for the BlockTree implementation.</li>
+ * <li>DirOffset is a pointer to the FieldSummary section.</li>
+ * <li>DocFreq is the count of documents which contain the term.</li>
+ * <li>TotalTermFreq is the total number of occurrences of the term. This is encoded
+ * as the difference between the total number of occurrences and the DocFreq.</li>
+ *    <li>FieldNumber is the field's number from {@link FieldInfos}. (.fnm)</li>
+ * <li>NumTerms is the number of unique terms for the field.</li>
+ * <li>RootCode points to the root block for the field.</li>
+ * <li>SumDocFreq is the total number of postings, the number of term-document pairs across
+ * the entire field.</li>
+ * <li>DocCount is the number of documents that have at least one posting for this field.</li>
+ * <li>PostingsHeader and TermMetadata are plugged into by the specific postings implementation:
+ * these contain arbitrary per-file data (such as parameters or versioning information)
+ * and per-term data (such as pointers to inverted files).</li>
+ *    <li>For inner nodes of the tree, every entry will steal one bit to mark whether it points
+ *        to child nodes (sub-blocks). If so, the corresponding TermStats and TermMetadata are omitted.</li>
+ * </ul>
+ * <a name="Termindex" id="Termindex"></a>
+ * <h3>Term Index</h3>
+ * <p>The .tip file contains an index into the term dictionary, so that it can be
+ * accessed randomly. The index is also used to determine
+ * when a given term cannot exist on disk (in the .tim file), saving a disk seek.</p>
+ * <ul>
+ * <li>TermsIndex (.tip) --> Header, FSTIndex<sup>NumFields</sup>
+ * <IndexStartFP><sup>NumFields</sup>, DirOffset</li>
+ * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
+ * <li>DirOffset --> {@link DataOutput#writeLong Uint64}</li>
+ * <li>IndexStartFP --> {@link DataOutput#writeVLong VLong}</li>
+ * <!-- TODO: better describe FST output here -->
+ * <li>FSTIndex --> {@link FST FST<byte[]>}</li>
+ * </ul>
+ * <p>Notes:</p>
+ * <ul>
+ * <li>The .tip file contains a separate FST for each
+ * field. The FST maps a term prefix to the on-disk
+ * block that holds all terms starting with that
+ * prefix. Each field's IndexStartFP points to its
+ * FST.</li>
+ * <li>DirOffset is a pointer to the start of the IndexStartFPs
+ * for all fields</li>
+ * <li>It's possible that an on-disk block would contain
+ * too many terms (more than the allowed maximum
+ * (default: 48)). When this happens, the block is
+ * sub-divided into new blocks (called "floor
+ * blocks"), and then the output in the FST for the
+ * block's prefix encodes the leading byte of each
+ *      sub-block, and its file pointer.</li>
+ * </ul>
+ *
+ * @see BlockTreeTermsReader
+ * @lucene.experimental
+ */
+
+public class BlockTreeTermsWriter extends FieldsConsumer implements Closeable {
+
+ /** Suggested default value for the {@code
+ * minItemsInBlock} parameter to {@link
+ * #BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. */
+ public final static int DEFAULT_MIN_BLOCK_SIZE = 25;
+
+ /** Suggested default value for the {@code
+ * maxItemsInBlock} parameter to {@link
+ * #BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. */
+ public final static int DEFAULT_MAX_BLOCK_SIZE = 48;
+
+ //public final static boolean DEBUG = false;
+ //private final static boolean SAVE_DOT_FILES = false;
+
+ static final int OUTPUT_FLAGS_NUM_BITS = 2;
+ static final int OUTPUT_FLAGS_MASK = 0x3;
+ static final int OUTPUT_FLAG_IS_FLOOR = 0x1;
+ static final int OUTPUT_FLAG_HAS_TERMS = 0x2;
+
+ /** Extension of terms file */
+ static final String TERMS_EXTENSION = "tim";
+ final static String TERMS_CODEC_NAME = "BLOCK_TREE_TERMS_DICT";
+
+ /** Initial terms format. */
+ public static final int TERMS_VERSION_START = 0;
+
+ /** Append-only */
+ public static final int TERMS_VERSION_APPEND_ONLY = 1;
+
+ /** Meta data as array */
+ public static final int TERMS_VERSION_META_ARRAY = 2;
+
+ /** Current terms format. */
+ public static final int TERMS_VERSION_CURRENT = TERMS_VERSION_META_ARRAY;
+
+ /** Extension of terms index file */
+ static final String TERMS_INDEX_EXTENSION = "tip";
+ final static String TERMS_INDEX_CODEC_NAME = "BLOCK_TREE_TERMS_INDEX";
+
+ /** Initial index format. */
+ public static final int TERMS_INDEX_VERSION_START = 0;
+
+ /** Append-only */
+ public static final int TERMS_INDEX_VERSION_APPEND_ONLY = 1;
+
+ /** Meta data as array */
+ public static final int TERMS_INDEX_VERSION_META_ARRAY = 2;
+
+ /** Current index format. */
+ public static final int TERMS_INDEX_VERSION_CURRENT = TERMS_INDEX_VERSION_META_ARRAY;
+
+ private final IndexOutput out;
+ private final IndexOutput indexOut;
+ final int maxDoc;
+ final int minItemsInBlock;
+ final int maxItemsInBlock;
+
+ final PostingsWriterBase postingsWriter;
+ final FieldInfos fieldInfos;
+
+ private static class FieldMetaData {
+ public final FieldInfo fieldInfo;
+ public final BytesRef rootCode;
+ public final long numTerms;
+ public final long indexStartFP;
+ public final long sumTotalTermFreq;
+ public final long sumDocFreq;
+ public final int docCount;
+ private final int longsSize;
+
+ public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long indexStartFP, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) {
+ assert numTerms > 0;
+ this.fieldInfo = fieldInfo;
+ assert rootCode != null: "field=" + fieldInfo.name + " numTerms=" + numTerms;
+ this.rootCode = rootCode;
+ this.indexStartFP = indexStartFP;
+ this.numTerms = numTerms;
+ this.sumTotalTermFreq = sumTotalTermFreq;
+ this.sumDocFreq = sumDocFreq;
+ this.docCount = docCount;
+ this.longsSize = longsSize;
+ }
+ }
+
+ private final List<FieldMetaData> fields = new ArrayList<FieldMetaData>();
+ // private final String segment;
+
+ /** Create a new writer. The number of items (terms or
+ * sub-blocks) per block will aim to be between
+   *  minItemsInBlock and maxItemsInBlock, though in some
+ * cases the blocks may be smaller than the min. */
+ public BlockTreeTermsWriter(
+ SegmentWriteState state,
+ PostingsWriterBase postingsWriter,
+ int minItemsInBlock,
+ int maxItemsInBlock)
+ throws IOException
+ {
+ if (minItemsInBlock <= 1) {
+ throw new IllegalArgumentException("minItemsInBlock must be >= 2; got " + minItemsInBlock);
+ }
+ if (maxItemsInBlock <= 0) {
+ throw new IllegalArgumentException("maxItemsInBlock must be >= 1; got " + maxItemsInBlock);
+ }
+ if (minItemsInBlock > maxItemsInBlock) {
+ throw new IllegalArgumentException("maxItemsInBlock must be >= minItemsInBlock; got maxItemsInBlock=" + maxItemsInBlock + " minItemsInBlock=" + minItemsInBlock);
+ }
+ if (2*(minItemsInBlock-1) > maxItemsInBlock) {
+ throw new IllegalArgumentException("maxItemsInBlock must be at least 2*(minItemsInBlock-1); got maxItemsInBlock=" + maxItemsInBlock + " minItemsInBlock=" + minItemsInBlock);
+ }
+
+ maxDoc = state.segmentInfo.getDocCount();
+
+ final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION);
+ out = state.directory.createOutput(termsFileName, state.context);
+ boolean success = false;
+ IndexOutput indexOut = null;
+ try {
+ fieldInfos = state.fieldInfos;
+ this.minItemsInBlock = minItemsInBlock;
+ this.maxItemsInBlock = maxItemsInBlock;
+ writeHeader(out);
+
+ //DEBUG = state.segmentName.equals("_4a");
+
+ final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
+ indexOut = state.directory.createOutput(termsIndexFileName, state.context);
+ writeIndexHeader(indexOut);
+
+ this.postingsWriter = postingsWriter;
+ // segment = state.segmentName;
+
+ // System.out.println("BTW.init seg=" + state.segmentName);
+
+ postingsWriter.init(out); // have consumer write its format/header
+ success = true;
+ } finally {
+ if (!success) {
+ IOUtils.closeWhileHandlingException(out, indexOut);
+ }
+ }
+ this.indexOut = indexOut;
+ }
+
+ /** Writes the terms file header. */
+ private void writeHeader(IndexOutput out) throws IOException {
+ CodecUtil.writeHeader(out, TERMS_CODEC_NAME, TERMS_VERSION_CURRENT);
+ }
+
+ /** Writes the index file header. */
+ private void writeIndexHeader(IndexOutput out) throws IOException {
+ CodecUtil.writeHeader(out, TERMS_INDEX_CODEC_NAME, TERMS_INDEX_VERSION_CURRENT);
+ }
+
+ /** Writes the terms file trailer. */
+ private void writeTrailer(IndexOutput out, long dirStart) throws IOException {
+ out.writeLong(dirStart);
+ }
+
+ /** Writes the index file trailer. */
+ private void writeIndexTrailer(IndexOutput indexOut, long dirStart) throws IOException {
+ indexOut.writeLong(dirStart);
+ }
+
+ @Override
+ public void write(Fields fields) throws IOException {
+
+ boolean success = false;
+ try {
+ String lastField = null;
+ for(String field : fields) {
+ assert lastField == null || lastField.compareTo(field) < 0;
+ lastField = field;
+
+ Terms terms = fields.terms(field);
+ if (terms == null) {
+ continue;
+ }
+
+ TermsEnum termsEnum = terms.iterator(null);
+
+ TermsWriter termsWriter = new TermsWriter(fieldInfos.fieldInfo(field));
+
+ while (true) {
+ BytesRef term = termsEnum.next();
+ if (term == null) {
+ break;
+ }
+ termsWriter.write(term, termsEnum);
+ }
+
+ termsWriter.finish();
+ }
+ success = true;
+ } finally {
+ if (success) {
+ IOUtils.close(this);
+ } else {
+ IOUtils.closeWhileHandlingException(this);
+ }
+ }
+ }
+
+ static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) {
+ assert fp < (1L << 62);
+ return (fp << 2) | (hasTerms ? OUTPUT_FLAG_HAS_TERMS : 0) | (isFloor ? OUTPUT_FLAG_IS_FLOOR : 0);
+ }
+
+ private static class PendingEntry {
+ public final boolean isTerm;
+
+ protected PendingEntry(boolean isTerm) {
+ this.isTerm = isTerm;
+ }
+ }
+
+ private static final class PendingTerm extends PendingEntry {
+ public final BytesRef term;
+ // stats + metadata
+ public final BlockTermState state;
+
+ public PendingTerm(BytesRef term, BlockTermState state) {
+ super(true);
+ this.term = term;
+ this.state = state;
+ }
+
+ @Override
+ public String toString() {
+ return term.utf8ToString();
+ }
+ }
+
+ private static final class PendingBlock extends PendingEntry {
+ public final BytesRef prefix;
+ public final long fp;
+ public FST<BytesRef> index;
+ public List<FST<BytesRef>> subIndices;
+ public final boolean hasTerms;
+ public final boolean isFloor;
+ public final int floorLeadByte;
+ private final IntsRef scratchIntsRef = new IntsRef();
+
+ public PendingBlock(BytesRef prefix, long fp, boolean hasTerms, boolean isFloor, int floorLeadByte, List<FST<BytesRef>> subIndices) {
+ super(false);
+ this.prefix = prefix;
+ this.fp = fp;
+ this.hasTerms = hasTerms;
+ this.isFloor = isFloor;
+ this.floorLeadByte = floorLeadByte;
+ this.subIndices = subIndices;
+ }
+
+ @Override
+ public String toString() {
+ return "BLOCK: " + prefix.utf8ToString();
+ }
+
+ public void compileIndex(List<PendingBlock> floorBlocks, RAMOutputStream scratchBytes) throws IOException {
+
+ assert (isFloor && floorBlocks != null && floorBlocks.size() != 0) || (!isFloor && floorBlocks == null): "isFloor=" + isFloor + " floorBlocks=" + floorBlocks;
+
+ assert scratchBytes.getFilePointer() == 0;
+
+ // TODO: try writing the leading vLong in MSB order
+ // (opposite of what Lucene does today), for better
+ // outputs sharing in the FST
+ scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor));
+ if (isFloor) {
+ scratchBytes.writeVInt(floorBlocks.size());
+ for (PendingBlock sub : floorBlocks) {
+ assert sub.floorLeadByte != -1;
+ //if (DEBUG) {
+ // System.out.println(" write floorLeadByte=" + Integer.toHexString(sub.floorLeadByte&0xff));
+ //}
+ scratchBytes.writeByte((byte) sub.floorLeadByte);
+ assert sub.fp > fp;
+ scratchBytes.writeVLong((sub.fp - fp) << 1 | (sub.hasTerms ? 1 : 0));
+ }
+ }
+
+ final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
+ final Builder<BytesRef> indexBuilder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1,
+ 0, 0, true, false, Integer.MAX_VALUE,
+ outputs, null, false,
+ PackedInts.COMPACT, true, 15);
+ //if (DEBUG) {
+ // System.out.println(" compile index for prefix=" + prefix);
+ //}
+ //indexBuilder.DEBUG = false;
+ final byte[] bytes = new byte[(int) scratchBytes.getFilePointer()];
+ assert bytes.length > 0;
+ scratchBytes.writeTo(bytes, 0);
+ indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length));
+ scratchBytes.reset();
+
+ // Copy over index for all sub-blocks
+
+ if (subIndices != null) {
+ for(FST<BytesRef> subIndex : subIndices) {
+ append(indexBuilder, subIndex);
+ }
+ }
+
+ if (floorBlocks != null) {
+ for (PendingBlock sub : floorBlocks) {
+ if (sub.subIndices != null) {
+ for(FST<BytesRef> subIndex : sub.subIndices) {
+ append(indexBuilder, subIndex);
+ }
+ }
+ sub.subIndices = null;
+ }
+ }
+
+ index = indexBuilder.finish();
+ subIndices = null;
+
+ /*
+ Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
+ Util.toDot(index, w, false, false);
+ System.out.println("SAVED to out.dot");
+ w.close();
+ */
+ }
+
+ // TODO: maybe we could add bulk-add method to
+ // Builder? Takes FST and unions it w/ current
+ // FST.
+ private void append(Builder<BytesRef> builder, FST<BytesRef> subIndex) throws IOException {
+ final BytesRefFSTEnum<BytesRef> subIndexEnum = new BytesRefFSTEnum<BytesRef>(subIndex);
+ BytesRefFSTEnum.InputOutput<BytesRef> indexEnt;
+ while((indexEnt = subIndexEnum.next()) != null) {
+ //if (DEBUG) {
+ // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output);
+ //}
+ builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
+ }
+ }
+ }
+
+ final RAMOutputStream scratchBytes = new RAMOutputStream();
+
+ class TermsWriter {
+ private final FieldInfo fieldInfo;
+ private final int longsSize;
+ private long numTerms;
+ final FixedBitSet docsSeen;
+ long sumTotalTermFreq;
+ long sumDocFreq;
+ long indexStartFP;
+
+ // Used only to partition terms into the block tree; we
+ // don't pull an FST from this builder:
+ private final NoOutputs noOutputs;
+ private final Builder<Object> blockBuilder;
+
+ // PendingTerm or PendingBlock:
+ private final List<PendingEntry> pending = new ArrayList<PendingEntry>();
+
+ // Index into pending of most recently written block
+ private int lastBlockIndex = -1;
+
+ // Re-used when segmenting a too-large block into floor
+ // blocks:
+ private int[] subBytes = new int[10];
+ private int[] subTermCounts = new int[10];
+ private int[] subTermCountSums = new int[10];
+ private int[] subSubCounts = new int[10];
+
+ // This class assigns terms to blocks "naturally", ie,
+ // according to the number of terms under a given prefix
+ // that we encounter:
+ private class FindBlocks extends Builder.FreezeTail<Object> {
+
+ @Override
+ public void freeze(final Builder.UnCompiledNode<Object>[] frontier, int prefixLenPlus1, final IntsRef lastInput) throws IOException {
+
+ //if (DEBUG) System.out.println(" freeze prefixLenPlus1=" + prefixLenPlus1);
+
+ for(int idx=lastInput.length; idx >= prefixLenPlus1; idx--) {
+ final Builder.UnCompiledNode<Object> node = frontier[idx];
+
+ long totCount = 0;
+
+ if (node.isFinal) {
+ totCount++;
+ }
+
+ for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
+ @SuppressWarnings("unchecked") final Builder.UnCompiledNode<Object> target = (Builder.UnCompiledNode<Object>) node.arcs[arcIdx].target;
+ totCount += target.inputCount;
+ target.clear();
+ node.arcs[arcIdx].target = null;
+ }
+ node.numArcs = 0;
+
+ if (totCount >= minItemsInBlock || idx == 0) {
+ // We are on a prefix node that has enough
+ // entries (terms or sub-blocks) under it to let
+ // us write a new block or multiple blocks (main
+ // block + follow on floor blocks):
+ //if (DEBUG) {
+ // if (totCount < minItemsInBlock && idx != 0) {
+ // System.out.println(" force block has terms");
+ // }
+ //}
+ writeBlocks(lastInput, idx, (int) totCount);
+ node.inputCount = 1;
+ } else {
+ // stragglers! carry count upwards
+ node.inputCount = totCount;
+ }
+ frontier[idx] = new Builder.UnCompiledNode<Object>(blockBuilder, idx);
+ }
+ }
+ }
+
+ // Write the top count entries on the pending stack as
+ // one or more blocks. Returns how many blocks were
+ // written. If the entry count is <= maxItemsPerBlock
+ // we just write a single block; else we break into
+ // primary (initial) block and then one or more
+ // following floor blocks:
+
+ void writeBlocks(IntsRef prevTerm, int prefixLength, int count) throws IOException {
+ if (prefixLength == 0 || count <= maxItemsInBlock) {
+ // Easy case: not floor block. Eg, prefix is "foo",
+ // and we found 30 terms/sub-blocks starting w/ that
+ // prefix, and minItemsInBlock <= 30 <=
+ // maxItemsInBlock.
+ final PendingBlock nonFloorBlock = writeBlock(prevTerm, prefixLength, prefixLength, count, count, 0, false, -1, true);
+ nonFloorBlock.compileIndex(null, scratchBytes);
+ pending.add(nonFloorBlock);
+ } else {
+ // Floor block case. Eg, prefix is "foo" but we
+ // have 100 terms/sub-blocks starting w/ that
+ // prefix. We segment the entries into a primary
+ // block and following floor blocks using the first
+ // label in the suffix to assign to floor blocks.
+
+ // TODO: we could store min & max suffix start byte
+ // in each block, to make floor blocks authoritative
+
+ //if (DEBUG) {
+ // final BytesRef prefix = new BytesRef(prefixLength);
+ // for(int m=0;m<prefixLength;m++) {
+ // prefix.bytes[m] = (byte) prevTerm.ints[m];
+ // }
+ // prefix.length = prefixLength;
+ // //System.out.println("\nWBS count=" + count + " prefix=" + prefix.utf8ToString() + " " + prefix);
+ // System.out.println("writeBlocks: prefix=" + prefix + " " + prefix + " count=" + count + " pending.size()=" + pending.size());
+ //}
+ //System.out.println("\nwbs count=" + count);
+
+ final int savLabel = prevTerm.ints[prevTerm.offset + prefixLength];
+
+ // Count up how many items fall under
+ // each unique label after the prefix.
+
+ // TODO: this is wasteful since the builder had
+ // already done this (partitioned these sub-terms
+ // according to their leading prefix byte)
+
+ final List<PendingEntry> slice = pending.subList(pending.size()-count, pending.size());
+ int lastSuffixLeadLabel = -1;
+ int termCount = 0;
+ int subCount = 0;
+ int numSubs = 0;
+
+ for(PendingEntry ent : slice) {
+
+ // First byte in the suffix of this term
+ final int suffixLeadLabel;
+ if (ent.isTerm) {
+ PendingTerm term = (PendingTerm) ent;
+ if (term.term.length == prefixLength) {
+ // Suffix is 0, ie prefix 'foo' and term is
+ // 'foo' so the term has empty string suffix
+ // in this block
+ assert lastSuffixLeadLabel == -1;
+ assert numSubs == 0;
+ suffixLeadLabel = -1;
+ } else {
+ suffixLeadLabel = term.term.bytes[term.term.offset + prefixLength] & 0xff;
+ }
+ } else {
+ PendingBlock block = (PendingBlock) ent;
+ assert block.prefix.length > prefixLength;
+ suffixLeadLabel = block.prefix.bytes[block.prefix.offset + prefixLength] & 0xff;
+ }
+
+ if (suffixLeadLabel != lastSuffixLeadLabel && (termCount + subCount) != 0) {
+ if (subBytes.length == numSubs) {
+ subBytes = ArrayUtil.grow(subBytes);
+ subTermCounts = ArrayUtil.grow(subTermCounts);
+ subSubCounts = ArrayUtil.grow(subSubCounts);
+ }
+ subBytes[numSubs] = lastSuffixLeadLabel;
+ lastSuffixLeadLabel = suffixLeadLabel;
+ subTermCounts[numSubs] = termCount;
+ subSubCounts[numSubs] = subCount;
+ /*
+ if (suffixLeadLabel == -1) {
+ System.out.println(" sub " + -1 + " termCount=" + termCount + " subCount=" + subCount);
+ } else {
+ System.out.println(" sub " + Integer.toHexString(suffixLeadLabel) + " termCount=" + termCount + " subCount=" + subCount);
+ }
+ */
+ termCount = subCount = 0;
+ numSubs++;
+ }
+
+ if (ent.isTerm) {
+ termCount++;
+ } else {
+ subCount++;
+ }
+ }
+
+ if (subBytes.length == numSubs) {
+ subBytes = ArrayUtil.grow(subBytes);
+ subTermCounts = ArrayUtil.grow(subTermCounts);
+ subSubCounts = ArrayUtil.grow(subSubCounts);
+ }
+
+ subBytes[numSubs] = lastSuffixLeadLabel;
+ subTermCounts[numSubs] = termCount;
+ subSubCounts[numSubs] = subCount;
+ numSubs++;
+ /*
+ if (lastSuffixLeadLabel == -1) {
+ System.out.println(" sub " + -1 + " termCount=" + termCount + " subCount=" + subCount);
+ } else {
+ System.out.println(" sub " + Integer.toHexString(lastSuffixLeadLabel) + " termCount=" + termCount + " subCount=" + subCount);
+ }
+ */
+
+ if (subTermCountSums.length < numSubs) {
+ subTermCountSums = ArrayUtil.grow(subTermCountSums, numSubs);
+ }
+
+ // Roll up (backwards) the termCounts; postings impl
+ // needs this to know where to pull the term slice
+ // from its pending terms stack:
+ int sum = 0;
+ for(int idx=numSubs-1;idx>=0;idx--) {
+ sum += subTermCounts[idx];
+ subTermCountSums[idx] = sum;
+ }
+
+ // TODO: make a better segmenter? It'd have to
+ // absorb the too-small end blocks backwards into
+ // the previous blocks
+
+ // Naive greedy segmentation; this is not always
+ // best (it can produce a too-small block as the
+ // last block):
+ int pendingCount = 0;
+ int startLabel = subBytes[0];
+ int curStart = count;
+ subCount = 0;
+
+ final List<PendingBlock> floorBlocks = new ArrayList<PendingBlock>();
+ PendingBlock firstBlock = null;
+
+ for(int sub=0;sub<numSubs;sub++) {
+ pendingCount += subTermCounts[sub] + subSubCounts[sub];
+ //System.out.println(" " + (subTermCounts[sub] + subSubCounts[sub]));
+ subCount++;
+
+ // Greedily make a floor block as soon as we've
+ // crossed the min count
+ if (pendingCount >= minItemsInBlock) {
+ final int curPrefixLength;
+ if (startLabel == -1) {
+ curPrefixLength = prefixLength;
+ } else {
+ curPrefixLength = 1+prefixLength;
+ // floor term:
+ prevTerm.ints[prevTerm.offset + prefixLength] = startLabel;
+ }
+ //System.out.println(" " + subCount + " subs");
+ final PendingBlock floorBlock = writeBlock(prevTerm, prefixLength, curPrefixLength, curStart, pendingCount, subTermCountSums[1+sub], true, startLabel, curStart == pendingCount);
+ if (firstBlock == null) {
+ firstBlock = floorBlock;
+ } else {
+ floorBlocks.add(floorBlock);
+ }
+ curStart -= pendingCount;
+ //System.out.println(" = " + pendingCount);
+ pendingCount = 0;
+
+ assert minItemsInBlock == 1 || subCount > 1: "minItemsInBlock=" + minItemsInBlock + " subCount=" + subCount + " sub=" + sub + " of " + numSubs + " subTermCount=" + subTermCountSums[sub] + " subSubCount=" + subSubCounts[sub] + " depth=" + prefixLength;
+ subCount = 0;
+ startLabel = subBytes[sub+1];
+
+ if (curStart == 0) {
+ break;
+ }
+
+ if (curStart <= maxItemsInBlock) {
+ // remainder is small enough to fit into a
+ // block. NOTE that this may be too small (<
+ // minItemsInBlock); need a true segmenter
+ // here
+ assert startLabel != -1;
+ assert firstBlock != null;
+ prevTerm.ints[prevTerm.offset + prefixLength] = startLabel;
+ //System.out.println(" final " + (numSubs-sub-1) + " subs");
+ /*
+ for(sub++;sub < numSubs;sub++) {
+ System.out.println(" " + (subTermCounts[sub] + subSubCounts[sub]));
+ }
+ System.out.println(" = " + curStart);
+ if (curStart < minItemsInBlock) {
+ System.out.println(" **");
+ }
+ */
+ floorBlocks.add(writeBlock(prevTerm, prefixLength, prefixLength+1, curStart, curStart, 0, true, startLabel, true));
+ break;
+ }
+ }
+ }
+
+ prevTerm.ints[prevTerm.offset + prefixLength] = savLabel;
+
+ assert firstBlock != null;
+ firstBlock.compileIndex(floorBlocks, scratchBytes);
+
+ pending.add(firstBlock);
+ //if (DEBUG) System.out.println(" done pending.size()=" + pending.size());
+ }
+ lastBlockIndex = pending.size()-1;
+ }
+
+ // for debugging
+ @SuppressWarnings("unused")
+ private String toString(BytesRef b) {
+ try {
+ return b.utf8ToString() + " " + b;
+ } catch (Throwable t) {
+ // If BytesRef isn't actually UTF8, or it's eg a
+ // prefix of UTF8 that ends mid-unicode-char, we
+ // fallback to hex:
+ return b.toString();
+ }
+ }
+
+ // Writes all entries in the pending slice as a single
+ // block:
+ private PendingBlock writeBlock(IntsRef prevTerm, int prefixLength, int indexPrefixLength, int startBackwards, int length,
+ int futureTermCount, boolean isFloor, int floorLeadByte, boolean isLastInFloor) throws IOException {
+
+ assert length > 0;
+
+ final int start = pending.size()-startBackwards;
+
+ assert start >= 0: "pending.size()=" + pending.size() + " startBackwards=" + startBackwards + " length=" + length;
+
+ final List<PendingEntry> slice = pending.subList(start, start + length);
+
+ final long startFP = out.getFilePointer();
+
+ final BytesRef prefix = new BytesRef(indexPrefixLength);
+ for(int m=0;m<indexPrefixLength;m++) {
+ prefix.bytes[m] = (byte) prevTerm.ints[m];
+ }
+ prefix.length = indexPrefixLength;
+
+ // Write block header:
+ out.writeVInt((length<<1)|(isLastInFloor ? 1:0));
+
+ // if (DEBUG) {
+ // System.out.println(" writeBlock " + (isFloor ? "(floor) " : "") + "seg=" + segment + " pending.size()=" + pending.size() + " prefixLength=" + prefixLength + " indexPrefix=" + toString(prefix) + " entCount=" + length + " startFP=" + startFP + " futureTermCount=" + futureTermCount + (isFloor ? (" floorLeadByte=" + Integer.toHexString(floorLeadByte&0xff)) : "") + " isLastInFloor=" + isLastInFloor);
+ // }
+
+ // 1st pass: pack term suffix bytes into byte[] blob
+ // TODO: cutover to bulk int codec... simple64?
+
+ final boolean isLeafBlock;
+ if (lastBlockIndex < start) {
+ // This block definitely does not contain sub-blocks:
+ isLeafBlock = true;
+ //System.out.println("no scan true isFloor=" + isFloor);
+ } else if (!isFloor) {
+ // This block definitely does contain at least one sub-block:
+ isLeafBlock = false;
+ //System.out.println("no scan false " + lastBlockIndex + " vs start=" + start + " len=" + length);
+ } else {
+ // Must scan up-front to see if there is a sub-block
+ boolean v = true;
+ //System.out.println("scan " + lastBlockIndex + " vs start=" + start + " len=" + length);
+ for (PendingEntry ent : slice) {
+ if (!ent.isTerm) {
+ v = false;
+ break;
+ }
+ }
+ isLeafBlock = v;
+ }
+
+ final List<FST<BytesRef>> subIndices;
+
+ int termCount;
+
+ long[] longs = new long[longsSize];
+ boolean absolute = true;
+
+ if (isLeafBlock) {
+ subIndices = null;
+ for (PendingEntry ent : slice) {
+ assert ent.isTerm;
+ PendingTerm term = (PendingTerm) ent;
+ BlockTermState state = term.state;
+ final int suffix = term.term.length - prefixLength;
+ // if (DEBUG) {
+ // BytesRef suffixBytes = new BytesRef(suffix);
+ // System.arraycopy(term.term.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
+ // suffixBytes.length = suffix;
+ // System.out.println(" write term suffix=" + suffixBytes);
+ // }
+ // For leaf block we write suffix straight
+ suffixWriter.writeVInt(suffix);
+ suffixWriter.writeBytes(term.term.bytes, prefixLength, suffix);
+
+ // Write term stats, to separate byte[] blob:
+ statsWriter.writeVInt(state.docFreq);
+ if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
+ assert state.totalTermFreq >= state.docFreq: state.totalTermFreq + " vs " + state.docFreq;
+ statsWriter.writeVLong(state.totalTermFreq - state.docFreq);
+ }
+
+ // Write term meta data
+ postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute);
+ for (int pos = 0; pos < longsSize; pos++) {
+ assert longs[pos] >= 0;
+ metaWriter.writeVLong(longs[pos]);
+ }
+ bytesWriter.writeTo(metaWriter);
+ bytesWriter.reset();
+ absolute = false;
+ }
+ termCount = length;
+ } else {
+ subIndices = new ArrayList<FST<BytesRef>>();
+ termCount = 0;
+ for (PendingEntry ent : slice) {
+ if (ent.isTerm) {
+ PendingTerm term = (PendingTerm) ent;
+ BlockTermState state = term.state;
+ final int suffix = term.term.length - prefixLength;
+ // if (DEBUG) {
+ // BytesRef suffixBytes = new BytesRef(suffix);
+ // System.arraycopy(term.term.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
+ // suffixBytes.length = suffix;
+ // System.out.println(" write term suffix=" + suffixBytes);
+ // }
+ // For non-leaf block we borrow 1 bit to record
+ // if entry is term or sub-block
+ suffixWriter.writeVInt(suffix<<1);
+ suffixWriter.writeBytes(term.term.bytes, prefixLength, suffix);
+
+ // Write term stats, to separate byte[] blob:
+ statsWriter.writeVInt(state.docFreq);
+ if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
+ assert state.totalTermFreq >= state.docFreq;
+ statsWriter.writeVLong(state.totalTermFreq - state.docFreq);
+ }
+
+ // TODO: now that terms dict "sees" these longs,
+ // we can explore better column-stride encodings
+ // to encode all long[0]s for this block at
+ // once, all long[1]s, etc., e.g. using
+ // Simple64. Alternatively, we could interleave
+ // stats + meta ... no reason to have them
+ // separate anymore:
+
+ // Write term meta data
+ postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute);
+ for (int pos = 0; pos < longsSize; pos++) {
+ assert longs[pos] >= 0;
+ metaWriter.writeVLong(longs[pos]);
+ }
+ bytesWriter.writeTo(metaWriter);
+ bytesWriter.reset();
+ absolute = false;
+
+ termCount++;
+ } else {
+ PendingBlock block = (PendingBlock) ent;
+ final int suffix = block.prefix.length - prefixLength;
+
+ assert suffix > 0;
+
+ // For non-leaf block we borrow 1 bit to record
+ // if entry is term or sub-block
+ suffixWriter.writeVInt((suffix<<1)|1);
+ suffixWriter.writeBytes(block.prefix.bytes, prefixLength, suffix);
+ assert block.fp < startFP;
+
+ // if (DEBUG) {
+ // BytesRef suffixBytes = new BytesRef(suffix);
+ // System.arraycopy(block.prefix.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
+ // suffixBytes.length = suffix;
+ // System.out.println(" write sub-block suffix=" + toString(suffixBytes) + " subFP=" + block.fp + " subCode=" + (startFP-block.fp) + " floor=" + block.isFloor);
+ // }
+
+ suffixWriter.writeVLong(startFP - block.fp);
+ subIndices.add(block.index);
+ }
+ }
+
+ assert subIndices.size() != 0;
+ }
+
+ // TODO: we could block-write the term suffix pointers;
+ // this would take more space but would enable binary
+ // search on lookup
+
+ // Write suffixes byte[] blob to terms dict output:
+ out.writeVInt((int) (suffixWriter.getFilePointer() << 1) | (isLeafBlock ? 1:0));
+ suffixWriter.writeTo(out);
+ suffixWriter.reset();
+
+ // Write term stats byte[] blob
+ out.writeVInt((int) statsWriter.getFilePointer());
+ statsWriter.writeTo(out);
+ statsWriter.reset();
+
+ // Write term meta data byte[] blob
+ out.writeVInt((int) metaWriter.getFilePointer());
+ metaWriter.writeTo(out);
+ metaWriter.reset();
+
+ // Remove slice replaced by block:
+ slice.clear();
+
+ if (lastBlockIndex >= start) {
+ if (lastBlockIndex < start+length) {
+ lastBlockIndex = start;
+ } else {
+ lastBlockIndex -= length;
+ }
+ }
+
+ // if (DEBUG) {
+ // System.out.println(" fpEnd=" + out.getFilePointer());
+ // }
+
+ return new PendingBlock(prefix, startFP, termCount != 0, isFloor, floorLeadByte, subIndices);
+ }
+
+ TermsWriter(FieldInfo fieldInfo) {
+ this.fieldInfo = fieldInfo;
+ docsSeen = new FixedBitSet(maxDoc);
+
+ noOutputs = NoOutputs.getSingleton();
+
+ // This Builder is just used transiently to fragment
+ // terms into "good" blocks; we don't save the
+ // resulting FST:
+ blockBuilder = new Builder<Object>(FST.INPUT_TYPE.BYTE1,
+ 0, 0, true,
+ true, Integer.MAX_VALUE,
+ noOutputs,
+ new FindBlocks(), false,
+ PackedInts.COMPACT,
+ true, 15);
+
+ this.longsSize = postingsWriter.setField(fieldInfo);
+ }
+
+ private final IntsRef scratchIntsRef = new IntsRef();
+
+ /** Writes one term's worth of postings. */
+ public void write(BytesRef text, TermsEnum termsEnum) throws IOException {
+
+ BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen);
+ if (state != null) {
+ assert state.docFreq != 0;
+ assert fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY || state.totalTermFreq >= state.docFreq: "postingsWriter=" + postingsWriter;
+ sumDocFreq += state.docFreq;
+ sumTotalTermFreq += state.totalTermFreq;
+ blockBuilder.add(Util.toIntsRef(text, scratchIntsRef), noOutputs.getNoOutput());
+
+ PendingTerm term = new PendingTerm(BytesRef.deepCopyOf(text), state);
+ pending.add(term);
+ numTerms++;
+ }
+ }
+
+ // Finishes all terms in this field
+ public void finish() throws IOException {
+ if (numTerms > 0) {
+ blockBuilder.finish();
+
+ // We better have one final "root" block:
+ assert pending.size() == 1 && !pending.get(0).isTerm: "pending.size()=" + pending.size() + " pending=" + pending;
+ final PendingBlock root = (PendingBlock) pending.get(0);
+ assert root.prefix.length == 0;
+ assert root.index.getEmptyOutput() != null;
+
+ // Write FST to index
+ indexStartFP = indexOut.getFilePointer();
+ root.index.save(indexOut);
+ //System.out.println(" write FST " + indexStartFP + " field=" + fieldInfo.name);
+
+ // if (SAVE_DOT_FILES || DEBUG) {
+ // final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
+ // Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
+ // Util.toDot(root.index, w, false, false);
+ // System.out.println("SAVED to " + dotFileName);
+ // w.close();
+ // }
+
+ fields.add(new FieldMetaData(fieldInfo,
+ ((PendingBlock) pending.get(0)).index.getEmptyOutput(),
+ numTerms,
+ indexStartFP,
+ sumTotalTermFreq,
+ sumDocFreq,
+ docsSeen.cardinality(),
+ longsSize));
+ } else {
+ assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY && sumTotalTermFreq == -1;
+ assert sumDocFreq == 0;
+ assert docsSeen.cardinality() == 0;
+ }
+ }
+
+ private final RAMOutputStream suffixWriter = new RAMOutputStream();
+ private final RAMOutputStream statsWriter = new RAMOutputStream();
+ private final RAMOutputStream metaWriter = new RAMOutputStream();
+ private final RAMOutputStream bytesWriter = new RAMOutputStream();
+ }
+
+ @Override
+ public void close() throws IOException {
+
+ IOException ioe = null;
+ try {
+
+ final long dirStart = out.getFilePointer();
+ final long indexDirStart = indexOut.getFilePointer();
+
+ out.writeVInt(fields.size());
+
+ for(FieldMetaData field : fields) {
+ //System.out.println(" field " + field.fieldInfo.name + " " + field.numTerms + " terms");
+ out.writeVInt(field.fieldInfo.number);
+ out.writeVLong(field.numTerms);
+ out.writeVInt(field.rootCode.length);
+ out.writeBytes(field.rootCode.bytes, field.rootCode.offset, field.rootCode.length);
+ if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
+ out.writeVLong(field.sumTotalTermFreq);
+ }
+ out.writeVLong(field.sumDocFreq);
+ out.writeVInt(field.docCount);
+ if (TERMS_VERSION_CURRENT >= TERMS_VERSION_META_ARRAY) {
+ out.writeVInt(field.longsSize);
+ }
+ indexOut.writeVLong(field.indexStartFP);
+ }
+ writeTrailer(out, dirStart);
+ writeIndexTrailer(indexOut, indexDirStart);
+ } catch (IOException ioe2) {
+ ioe = ioe2;
+ } finally {
+ IOUtils.closeWhileHandlingException(ioe, out, indexOut, postingsWriter);
+ }
+ }
+}
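
For reference, the FST output that compileIndex() stores for each block is the long
produced by encodeOutput() above: the block's file pointer shifted left by
OUTPUT_FLAGS_NUM_BITS, with OUTPUT_FLAG_HAS_TERMS and OUTPUT_FLAG_IS_FLOOR in the low
two bits. A small sketch of packing and unpacking that value (illustration only, not
part of the committed file; fp, hasTerms and isFloor are assumed to be in scope):

    // encoding (what encodeOutput(fp, hasTerms, isFloor) produces; fp must fit in 62 bits):
    long code = (fp << 2) | (hasTerms ? 0x2 : 0) | (isFloor ? 0x1 : 0);

    // decoding (how a reader recovers the three pieces from the FST output):
    long blockFP          = code >>> 2;          // OUTPUT_FLAGS_NUM_BITS == 2
    boolean blockHasTerms = (code & 0x2) != 0;   // OUTPUT_FLAG_HAS_TERMS
    boolean blockIsFloor  = (code & 0x1) != 0;   // OUTPUT_FLAG_IS_FLOOR
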
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java?rev=1531949&r1=1531948&r2=1531949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java Mon Oct 14 15:55:57 2013
@@ -48,16 +48,12 @@ public abstract class FieldsConsumer {
   /** Write all fields, terms and postings.  This is the "pull"
* API, allowing you to iterate more than once over the
* postings, somewhat analogous to using a DOM API to
- * traverse an XML tree. Alternatively, if you subclass
- * {@link PushFieldsConsumer}, then all postings are
- * pushed in a single pass, somewhat analogous to using a
- * SAX API to traverse an XML tree.
+ * traverse an XML tree.
*
- * <p>This API is has certain restrictions vs {@link
- * PushFieldsConsumer}:
+ * <p><b>Notes</b>:
*
* <ul>
- * <li> You must compute index statistics yourself,
+ * <li> You must compute index statistics,
* including each Term's docFreq and totalTermFreq,
* as well as the summary sumTotalTermFreq,
* sumTotalDocFreq and docCount.
Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsBaseFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsBaseFormat.java?rev=1531949&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsBaseFormat.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsBaseFormat.java Mon Oct 14 15:55:57 2013
@@ -0,0 +1,55 @@
+package org.apache.lucene.codecs;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.SegmentReadState;
+
+/**
+ * Provides a {@link PostingsReaderBase} and {@link
+ * PostingsWriterBase}.
+ *
+ * @lucene.experimental */
+
+// TODO: find a better name; this defines the API that the
+// terms dict impls use to talk to a postings impl.
+// TermsDict + PostingsReader/WriterBase == PostingsConsumer/Producer
+
+// can we clean this up and do this some other way?
+// refactor some of these classes and use covariant return?
+public abstract class PostingsBaseFormat {
+
+ /** Unique name that's used to retrieve this codec when
+ * reading the index */
+ public final String name;
+
+ /** Sole constructor. */
+ protected PostingsBaseFormat(String name) {
+ this.name = name;
+ }
+
+ /** Creates the {@link PostingsReaderBase} for this
+ * format. */
+ public abstract PostingsReaderBase postingsReaderBase(SegmentReadState state) throws IOException;
+
+ /** Creates the {@link PostingsWriterBase} for this
+ * format. */
+ public abstract PostingsWriterBase postingsWriterBase(SegmentWriteState state) throws IOException;
+}
Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java?rev=1531949&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java Mon Oct 14 15:55:57 2013
@@ -0,0 +1,89 @@
+package org.apache.lucene.codecs;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Closeable;
+import java.io.IOException;
+
+import org.apache.lucene.index.DocsAndPositionsEnum; // javadocs
+import org.apache.lucene.index.DocsEnum; // javadocs
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+
+/**
+ * Class that plugs into term dictionaries, such as {@link
+ * BlockTreeTermsWriter}, and handles writing postings.
+ *
+ * @see PostingsReaderBase
+ * @lucene.experimental
+ */
+// TODO: find a better name; this defines the API that the
+// terms dict impls use to talk to a postings impl.
+// TermsDict + PostingsReader/WriterBase == FieldsProducer/Consumer
+public abstract class PostingsWriterBase implements Closeable {
+
+ /** Sole constructor. (For invocation by subclass
+ * constructors, typically implicit.) */
+ protected PostingsWriterBase() {
+ }
+
+ /** Called once after startup, before any terms have been
+ * added. Implementations typically write a header to
+ * the provided {@code termsOut}. */
+ public abstract void init(IndexOutput termsOut) throws IOException;
+
+ /** Write all postings for one term; use the provided
+ * {@link TermsEnum} to pull a {@link DocsEnum} or {@link
+ * DocsAndPositionsEnum}. This method should not
+ * re-position the {@code TermsEnum}! It is already
+ * positioned on the term that should be written. This
+ * method must set the bit in the provided {@link
+ * FixedBitSet} for every docID written. If no docs
+ * were written, this method should return null, and the
+ * terms dict will skip the term. */
+ public abstract BlockTermState writeTerm(BytesRef term, TermsEnum termsEnum, FixedBitSet docsSeen) throws IOException;
+
+ /**
+   * Encode metadata as long[] and byte[]. {@code absolute} controls whether
+   * the current term is delta encoded relative to the latest term.
+   * Usually elements in {@code longs} are file pointers, so each one always
+   * increases when a new term is consumed. {@code out} is used to write generic
+   * bytes, which are not monotonic.
+   *
+   * NOTE: sometimes long[] might contain "don't care" values that are unused, e.g.
+   * the pointer to the postings list may not be defined for some terms but is defined
+   * for others, when the format inlines some postings data in the term dictionary.
+   * In this case, the postings writer should always reuse the last value, so that each
+   * element in the metadata long[] remains monotonic.
+ */
+ public abstract void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException;
+
+ /**
+ * Sets the current field for writing, and returns the
+ * fixed length of long[] metadata (which is fixed per
+ * field), called when the writing switches to another field. */
+ // TODO: better name?
+ public abstract int setField(FieldInfo fieldInfo);
+
+ @Override
+ public abstract void close() throws IOException;
+}
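
To make the encodeTerm() contract above concrete, here is a small sketch of a
delta-encoding implementation (illustration only, not part of the committed file).
The term-state subclass, its postingsStartFP field and the lastPostingsStartFP
bookkeeping are hypothetical; only the PostingsWriterBase signature is taken from the
code above, and setField() is assumed to have reported a metadata length of 1, so
longs has exactly one slot:

    static final class MyTermState extends BlockTermState {
      long postingsStartFP;   // hypothetical: where this term's postings begin on disk
    }

    private long lastPostingsStartFP;   // last value encoded; reset at block boundaries

    @Override
    public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo,
                           BlockTermState _state, boolean absolute) throws IOException {
      MyTermState state = (MyTermState) _state;
      if (absolute) {
        lastPostingsStartFP = 0;        // first term of a block: encode the absolute pointer
      }
      // the delta is >= 0 because file pointers only grow as terms are consumed,
      // which is what the assert in BlockTreeTermsWriter.writeBlock() relies on
      longs[0] = state.postingsStartFP - lastPostingsStartFP;
      lastPostingsStartFP = state.postingsStartFP;
      // non-monotonic per-term bytes (flags, inlined data, ...) would go to 'out' instead
    }
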
Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java?rev=1531949&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java Mon Oct 14 15:55:57 2013
@@ -0,0 +1,219 @@
+package org.apache.lucene.codecs;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+
+/**
+ * Extension of {@link PostingsWriterBase}, adding a push
+ * API for writing each element of the postings. This API
+ * is somewhat analogous to an XML SAX API, while {@link
+ * PostingsWriterBase} is more like an XML DOM API.
+ *
+ * @see PostingsReaderBase
+ * @lucene.experimental
+ */
+// TODO: find a better name; this defines the API that the
+// terms dict impls use to talk to a postings impl.
+// TermsDict + PostingsReader/WriterBase == PostingsConsumer/Producer
+public abstract class PushPostingsWriterBase extends PostingsWriterBase {
+
+ // Reused in writeTerm
+ private DocsEnum docsEnum;
+ private DocsAndPositionsEnum posEnum;
+ private int enumFlags;
+
+ /** {@link FieldInfo} of current field being written. */
+ protected FieldInfo fieldInfo;
+
+ /** {@link IndexOptions} of current field being
+ written */
+ protected IndexOptions indexOptions;
+
+ /** True if the current field writes freqs. */
+ protected boolean writeFreqs;
+
+ /** True if the current field writes positions. */
+ protected boolean writePositions;
+
+ /** True if the current field writes payloads. */
+ protected boolean writePayloads;
+
+ /** True if the current field writes offsets. */
+ protected boolean writeOffsets;
+
+ /** Sole constructor. (For invocation by subclass
+ * constructors, typically implicit.) */
+ protected PushPostingsWriterBase() {
+ }
+
+ /** Called once after startup, before any terms have been
+ * added. Implementations typically write a header to
+ * the provided {@code termsOut}. */
+ public abstract void init(IndexOutput termsOut) throws IOException;
+
+ /** Return a newly created empty TermState */
+ public abstract BlockTermState newTermState() throws IOException;
+
+ /** Start a new term. Note that a matching call to {@link
+   *  #finishTerm(BlockTermState)} is made only if the term has at least one
+ * document. */
+ public abstract void startTerm() throws IOException;
+
+ /** Finishes the current term. The provided {@link
+ * BlockTermState} contains the term's summary statistics,
+   *  and will hold metadata from PBF when returned. */
+ public abstract void finishTerm(BlockTermState state) throws IOException;
+
+ /**
+   * Encode metadata as long[] and byte[]. {@code absolute} controls whether
+   * the current term is delta encoded relative to the latest term.
+   * Usually elements in {@code longs} are file pointers, so each one always
+   * increases when a new term is consumed. {@code out} is used to write generic
+   * bytes, which are not monotonic.
+   *
+   * NOTE: sometimes long[] might contain "don't care" values that are unused, e.g.
+   * the pointer to the postings list may not be defined for some terms but is defined
+   * for others, when the format inlines some postings data in the term dictionary.
+   * In this case, the postings writer should always reuse the last value, so that each
+   * element in the metadata long[] remains monotonic.
+ */
+ public abstract void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException;
+
+ /**
+ * Sets the current field for writing, and returns the
+ * fixed length of long[] metadata (which is fixed per
+ * field), called when the writing switches to another field. */
+ // TODO: better name?
+ public int setField(FieldInfo fieldInfo) {
+ this.fieldInfo = fieldInfo;
+ indexOptions = fieldInfo.getIndexOptions();
+
+ writeFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+ writePositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+ writeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+ writePayloads = fieldInfo.hasPayloads();
+
+ if (writeFreqs == false) {
+ enumFlags = 0;
+ } else if (writePositions == false) {
+ enumFlags = DocsEnum.FLAG_FREQS;
+ } else if (writeOffsets == false) {
+ if (writePayloads) {
+ enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS;
+ } else {
+ enumFlags = 0;
+ }
+ } else {
+ if (writePayloads) {
+ enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS | DocsAndPositionsEnum.FLAG_OFFSETS;
+ } else {
+ enumFlags = DocsAndPositionsEnum.FLAG_OFFSETS;
+ }
+ }
+
+ return 0;
+ }
+
+ @Override
+ public final BlockTermState writeTerm(BytesRef term, TermsEnum termsEnum, FixedBitSet docsSeen) throws IOException {
+ startTerm();
+ if (writePositions == false) {
+ docsEnum = termsEnum.docs(null, docsEnum, enumFlags);
+ } else {
+ posEnum = termsEnum.docsAndPositions(null, posEnum, enumFlags);
+ docsEnum = posEnum;
+ }
+ assert docsEnum != null;
+
+ int docFreq = 0;
+ long totalTermFreq = 0;
+ while (true) {
+ int docID = docsEnum.nextDoc();
+ if (docID == DocsEnum.NO_MORE_DOCS) {
+ break;
+ }
+ docFreq++;
+ docsSeen.set(docID);
+ int freq;
+ if (writeFreqs) {
+ freq = docsEnum.freq();
+ totalTermFreq += freq;
+ } else {
+ freq = -1;
+ }
+ startDoc(docID, freq);
+
+ if (writePositions) {
+ for(int i=0;i<freq;i++) {
+ int pos = posEnum.nextPosition();
+ BytesRef payload = writePayloads ? posEnum.getPayload() : null;
+ int startOffset;
+ int endOffset;
+ if (writeOffsets) {
+ startOffset = posEnum.startOffset();
+ endOffset = posEnum.endOffset();
+ } else {
+ startOffset = -1;
+ endOffset = -1;
+ }
+ addPosition(pos, payload, startOffset, endOffset);
+ }
+ }
+
+ finishDoc();
+ }
+
+ if (docFreq == 0) {
+ return null;
+ } else {
+ BlockTermState state = newTermState();
+ state.docFreq = docFreq;
+ state.totalTermFreq = writeFreqs ? totalTermFreq : -1;
+ finishTerm(state);
+ return state;
+ }
+ }
+
+ /** Adds a new doc in this term.
+ * <code>freq</code> will be -1 when term frequencies are omitted
+ * for the field. */
+ public abstract void startDoc(int docID, int freq) throws IOException;
+
+ /** Add a new position & payload, and start/end offset. A
+ * null payload means no payload; a non-null payload with
+ * zero length also means no payload. Caller may reuse
+ * the {@link BytesRef} for the payload between calls
+ * (method must fully consume the payload). <code>startOffset</code>
+ * and <code>endOffset</code> will be -1 when offsets are not indexed. */
+ public abstract void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException;
+
+ /** Called when we are done adding positions & payloads
+ * for each doc. */
+ public abstract void finishDoc() throws IOException;
+}
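
A minimal sketch of a PushPostingsWriterBase subclass, to make the push API above
concrete (illustration only, not part of the committed files). The class name, the
codec header string and the "record statistics only, persist nothing" behaviour are
made up; the overridden signatures are the ones declared above:

    import java.io.IOException;

    import org.apache.lucene.codecs.BlockTermState;
    import org.apache.lucene.codecs.CodecUtil;
    import org.apache.lucene.codecs.PushPostingsWriterBase;
    import org.apache.lucene.index.FieldInfo;
    import org.apache.lucene.store.DataOutput;
    import org.apache.lucene.store.IndexOutput;
    import org.apache.lucene.util.BytesRef;

    final class CountingPostingsWriter extends PushPostingsWriterBase {

      // BlockTermState's constructor is protected, so use a trivial subclass
      private static final class CountingTermState extends BlockTermState {}

      @Override
      public void init(IndexOutput termsOut) throws IOException {
        CodecUtil.writeHeader(termsOut, "CountingPostings", 0);  // hypothetical codec name/version
      }

      @Override
      public BlockTermState newTermState() {
        return new CountingTermState();
      }

      @Override
      public void startTerm() {
        // no postings file to position
      }

      @Override
      public void startDoc(int docID, int freq) {
        // docFreq/totalTermFreq are accumulated by writeTerm() in the base class
      }

      @Override
      public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) {
        // positions, payloads and offsets are dropped in this sketch
      }

      @Override
      public void finishDoc() {
      }

      @Override
      public void finishTerm(BlockTermState state) {
        // state.docFreq and state.totalTermFreq were filled in by writeTerm()
      }

      @Override
      public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo,
                             BlockTermState state, boolean absolute) {
        // setField() (not overridden) returns 0, so there is no long[] metadata to encode
      }

      @Override
      public void close() {
      }
    }

A terms dictionary such as BlockTreeTermsWriter would drive this class through
writeTerm(), which pulls docs and positions from the TermsEnum and calls
startDoc/addPosition/finishDoc as shown above.
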
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java?rev=1531949&r1=1531948&r2=1531949&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java Mon Oct 14 15:55:57 2013
@@ -26,7 +26,7 @@ import org.apache.lucene.codecs.FieldsCo
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
-import org.apache.lucene.codecs.PostingsWriterBase;
+import org.apache.lucene.codecs.PostingsWriterBase; // javadocs
import org.apache.lucene.index.DocsEnum; // javadocs
import org.apache.lucene.index.FieldInfo.IndexOptions; // javadocs
import org.apache.lucene.index.FieldInfos; // javadocs
Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsBaseFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsBaseFormat.java?rev=1531949&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsBaseFormat.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsBaseFormat.java Mon Oct 14 15:55:57 2013
@@ -0,0 +1,51 @@
+package org.apache.lucene.codecs.lucene41;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.PostingsBaseFormat;
+import org.apache.lucene.codecs.PostingsReaderBase;
+import org.apache.lucene.codecs.PostingsWriterBase;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+
+/**
+ * Provides a {@link PostingsReaderBase} and {@link
+ * PostingsWriterBase}.
+ *
+ * @lucene.experimental */
+
+// TODO: should these also be named / looked up via SPI?
+public final class Lucene41PostingsBaseFormat extends PostingsBaseFormat {
+
+ /** Sole constructor. */
+ public Lucene41PostingsBaseFormat() {
+ super("Lucene41");
+ }
+
+ @Override
+ public PostingsReaderBase postingsReaderBase(SegmentReadState state) throws IOException {
+ return new Lucene41PostingsReader(state.directory, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix);
+ }
+
+ @Override
+ public PostingsWriterBase postingsWriterBase(SegmentWriteState state) throws IOException {
+ return new Lucene41PostingsWriter(state);
+ }
+}
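
Putting the pieces together, here is a small sketch of how a postings format could
wire the Lucene41 postings writer into the block-tree terms dictionary (illustration
only, not part of this commit). It assumes a SegmentWriteState 'state' and a Fields
instance 'fields' are already in hand, and omits imports:

    PostingsWriterBase postingsWriter = new Lucene41PostingsBaseFormat().postingsWriterBase(state);
    FieldsConsumer termsDict = null;
    boolean success = false;
    try {
      termsDict = new BlockTreeTermsWriter(state, postingsWriter,
          BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,    // 25
          BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);   // 48
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(postingsWriter);
      }
    }

    // write() pulls every field, term and posting from 'fields' and closes the terms
    // dictionary (and with it the postings writer) when it is done:
    termsDict.write(fields);

The default block sizes satisfy the constructor's constraint that maxItemsInBlock be
at least 2*(minItemsInBlock-1): with the defaults, 2*(25-1) = 48.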