You are viewing a plain text version of this content. The canonical link for it is here.
Posted to blur-commits@incubator.apache.org by am...@apache.org on 2016/08/30 01:57:57 UTC
[12/13] git commit: Removing thread local variables.
http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/991fb043/blur-store/src/main/java/org/apache/lucene/codecs/BlockTreeTermsReader.java
----------------------------------------------------------------------
diff --git a/blur-store/src/main/java/org/apache/lucene/codecs/BlockTreeTermsReader.java b/blur-store/src/main/java/org/apache/lucene/codecs/BlockTreeTermsReader.java
new file mode 100644
index 0000000..4ef65ff
--- /dev/null
+++ b/blur-store/src/main/java/org/apache/lucene/codecs/BlockTreeTermsReader.java
@@ -0,0 +1,2939 @@
+package org.apache.lucene.codecs;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.Locale;
+import java.util.TreeMap;
+
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
+import org.apache.lucene.util.automaton.RunAutomaton;
+import org.apache.lucene.util.automaton.Transition;
+import org.apache.lucene.util.fst.ByteSequenceOutputs;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.Outputs;
+import org.apache.lucene.util.fst.Util;
+
+/** A block-based terms index and dictionary that assigns
+ * terms to variable length blocks according to how they
+ * share prefixes. The terms index is a prefix trie
+ * whose leaves are term blocks. The advantage of this
+ * approach is that seekExact is often able to
+ * determine a term cannot exist without doing any IO, and
+ * intersection with Automata is very fast. Note that this
+ * terms dictionary has it's own fixed terms index (ie, it
+ * does not support a pluggable terms index
+ * implementation).
+ *
+ * <p><b>NOTE</b>: this terms dictionary does not support
+ * index divisor when opening an IndexReader. Instead, you
+ * can change the min/maxItemsPerBlock during indexing.</p>
+ *
+ * <p>The data structure used by this implementation is very
+ * similar to a burst trie
+ * (http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.3499),
+ * but with added logic to break up too-large blocks of all
+ * terms sharing a given prefix into smaller ones.</p>
+ *
+ * <p>Use {@link org.apache.lucene.index.CheckIndex} with the <code>-verbose</code>
+ * option to see summary statistics on the blocks in the
+ * dictionary.
+ *
+ * See {@link BlockTreeTermsWriter}.
+ *
+ * @lucene.experimental
+ */
+
+public class BlockTreeTermsReader extends FieldsProducer {
+
+ // Open input to the main terms dict file (_X.tib)
+ private final IndexInput in;
+
+ //private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;
+
+ // Reads the terms dict entries, to gather state to
+ // produce DocsEnum on demand
+ private final PostingsReaderBase postingsReader;
+
+ private final TreeMap<String,FieldReader> fields = new TreeMap<String,FieldReader>();
+
+ /** File offset where the directory starts in the terms file. */
+ private long dirOffset;
+
+ /** File offset where the directory starts in the index file. */
+ private long indexDirOffset;
+
+ private String segment;
+
+ private final int version;
+
+ private final IndexInput indexIn;
+
  /** Sole constructor.
   *
   * <p>Opens the terms dictionary file and, unless {@code indexDivisor == -1},
   * the terms index file; validates that both headers carry the same version;
   * lets the {@code postingsReader} initialize itself against the terms file;
   * then reads the per-field metadata into {@link #fields}. On any failure the
   * partially-opened inputs are closed via {@code close()}.
   *
   * @param indexDivisor -1 means "do not open/load the terms index"
   *        (in that case {@link #indexIn} stays null)
   * @throws IOException if a file is missing or fails its corruption checks
   */
  public BlockTreeTermsReader(Directory dir, FieldInfos fieldInfos, SegmentInfo info,
                              PostingsReaderBase postingsReader, IOContext ioContext,
                              String segmentSuffix, int indexDivisor)
    throws IOException {

    this.postingsReader = postingsReader;

    this.segment = info.name;
    in = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, BlockTreeTermsWriter.TERMS_EXTENSION),
                       ioContext);

    boolean success = false;

    try {
      // readHeader also captures dirOffset for pre-append-only versions:
      version = readHeader(in);
      if (indexDivisor != -1) {
        indexIn = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, BlockTreeTermsWriter.TERMS_INDEX_EXTENSION),
                                ioContext);
        int indexVersion = readIndexHeader(indexIn);
        // Terms file and index file must have been written by the same version:
        if (indexVersion != version) {
          throw new CorruptIndexException("mixmatched version files: " + in + "=" + version + "," + indexIn + "=" + indexVersion);
        }
      } else {
        indexIn = null;
      }

      // Have PostingsReader init itself
      postingsReader.init(in);

      // Read per-field details
      seekDir(in, dirOffset);
      if (indexDivisor != -1) {
        seekDir(indexIn, indexDirOffset);
      }

      final int numFields = in.readVInt();
      if (numFields < 0) {
        throw new CorruptIndexException("invalid numFields: " + numFields + " (resource=" + in + ")");
      }

      for(int i=0;i<numFields;i++) {
        final int field = in.readVInt();
        final long numTerms = in.readVLong();
        assert numTerms >= 0;
        // rootCode encodes the file pointer (and flags) of the field's root block:
        final int numBytes = in.readVInt();
        final BytesRef rootCode = new BytesRef(new byte[numBytes]);
        in.readBytes(rootCode.bytes, 0, numBytes);
        rootCode.length = numBytes;
        final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
        assert fieldInfo != null: "field=" + field;
        // sumTotalTermFreq is not written for DOCS_ONLY fields (freqs not indexed):
        final long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong();
        final long sumDocFreq = in.readVLong();
        final int docCount = in.readVInt();
        if (docCount < 0 || docCount > info.getDocCount()) { // #docs with field must be <= #docs
          throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + info.getDocCount() + " (resource=" + in + ")");
        }
        if (sumDocFreq < docCount) {  // #postings must be >= #docs with field
          throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount + " (resource=" + in + ")");
        }
        if (sumTotalTermFreq != -1 && sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings
          throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq + " (resource=" + in + ")");
        }
        final long indexStartFP = indexDivisor != -1 ? indexIn.readVLong() : 0;
        FieldReader previous = fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount, indexStartFP, indexIn));
        if (previous != null) {
          throw new CorruptIndexException("duplicate field: " + fieldInfo.name + " (resource=" + in + ")");
        }
      }

      success = true;
    } finally {
      if (!success) {
        // this.close() will close in:
        IOUtils.closeWhileHandlingException(this);
      }
    }
  }
+
+ /** Reads terms file header. */
+ protected int readHeader(IndexInput input) throws IOException {
+ int version = CodecUtil.checkHeader(input, BlockTreeTermsWriter.TERMS_CODEC_NAME,
+ BlockTreeTermsWriter.TERMS_VERSION_START,
+ BlockTreeTermsWriter.TERMS_VERSION_CURRENT);
+ if (version < BlockTreeTermsWriter.TERMS_VERSION_APPEND_ONLY) {
+ dirOffset = input.readLong();
+ }
+ return version;
+ }
+
+ /** Reads index file header. */
+ protected int readIndexHeader(IndexInput input) throws IOException {
+ int version = CodecUtil.checkHeader(input, BlockTreeTermsWriter.TERMS_INDEX_CODEC_NAME,
+ BlockTreeTermsWriter.TERMS_INDEX_VERSION_START,
+ BlockTreeTermsWriter.TERMS_INDEX_VERSION_CURRENT);
+ if (version < BlockTreeTermsWriter.TERMS_INDEX_VERSION_APPEND_ONLY) {
+ indexDirOffset = input.readLong();
+ }
+ return version;
+ }
+
+ /** Seek {@code input} to the directory offset. */
+ protected void seekDir(IndexInput input, long dirOffset)
+ throws IOException {
+ if (version >= BlockTreeTermsWriter.TERMS_INDEX_VERSION_APPEND_ONLY) {
+ input.seek(input.length() - 8);
+ dirOffset = input.readLong();
+ }
+ input.seek(dirOffset);
+ }
+
+ // for debugging
+ // private static String toHex(int v) {
+ // return "0x" + Integer.toHexString(v);
+ // }
+
  @Override
  public void close() throws IOException {
    try {
      // Closes the index input, terms input and postings reader;
      // IOUtils.close attempts all of them even if one throws.
      IOUtils.close(indexIn, in, postingsReader);
    } finally {
      // Clear so refs to terms index is GCable even if
      // app hangs onto us:
      fields.clear();
    }
  }
+
  @Override
  public Iterator<String> iterator() {
    // Field names in sorted (TreeMap) order; the unmodifiable view
    // prevents callers from removing fields through the iterator.
    return Collections.unmodifiableSet(fields.keySet()).iterator();
  }
+
  @Override
  public Terms terms(String field) throws IOException {
    // Returns null when the field was not present in this segment.
    assert field != null;
    return fields.get(field);
  }
+
  /** Number of fields in this segment's terms dictionary. */
  @Override
  public int size() {
    return fields.size();
  }
+
+ // for debugging
+ String brToString(BytesRef b) {
+ if (b == null) {
+ return "null";
+ } else {
+ try {
+ return b.utf8ToString() + " " + b;
+ } catch (Throwable t) {
+ // If BytesRef isn't actually UTF8, or it's eg a
+ // prefix of UTF8 that ends mid-unicode-char, we
+ // fallback to hex:
+ return b.toString();
+ }
+ }
+ }
+
  /**
   * BlockTree statistics for a single field
   * returned by {@link FieldReader#computeStats()}.
   *
   * <p>Populated by walking every block of the field with
   * {@code SegmentTermsEnum.computeBlockStats()}; {@link #startBlock}
   * and {@link #endBlock} are invoked once per block, {@link #term}
   * once per term, and {@link #finish} validates the counters at the
   * end of the walk.</p>
   */
  public static class Stats {
    /** How many nodes in the index FST. */
    public long indexNodeCount;

    /** How many arcs in the index FST. */
    public long indexArcCount;

    /** Byte size of the index. */
    public long indexNumBytes;

    /** Total number of terms in the field. */
    public long totalTermCount;

    /** Total number of bytes (sum of term lengths) across all terms in the field. */
    public long totalTermBytes;

    /** The number of normal (non-floor) blocks in the terms file. */
    public int nonFloorBlockCount;

    /** The number of floor blocks (meta-blocks larger than the
     *  allowed {@code maxItemsPerBlock}) in the terms file. */
    public int floorBlockCount;

    /** The number of sub-blocks within the floor blocks. */
    public int floorSubBlockCount;

    /** The number of "internal" blocks (that have both
     *  terms and sub-blocks). */
    public int mixedBlockCount;

    /** The number of "leaf" blocks (blocks that have only
     *  terms). */
    public int termsOnlyBlockCount;

    /** The number of "internal" blocks that do not contain
     *  terms (have only sub-blocks). */
    public int subBlocksOnlyBlockCount;

    /** Total number of blocks. */
    public int totalBlockCount;

    /** Number of blocks at each prefix depth. */
    public int[] blockCountByPrefixLen = new int[10];
    // Sanity counters: every startBlock must be balanced by an endBlock.
    private int startBlockCount;
    private int endBlockCount;

    /** Total number of bytes used to store term suffixes. */
    public long totalBlockSuffixBytes;

    /** Total number of bytes used to store term stats (not
     *  including what the {@link PostingsBaseFormat}
     *  stores. */
    public long totalBlockStatsBytes;

    /** Total bytes stored by the {@link PostingsBaseFormat},
     *  plus the other few vInts stored in the frame. */
    public long totalBlockOtherBytes;

    /** Segment name. */
    public final String segment;

    /** Field name. */
    public final String field;

    Stats(String segment, String field) {
      this.segment = segment;
      this.field = field;
    }

    /** Called when the walk enters a block; classifies it as
     *  floor/non-floor and accounts its suffix and stats bytes. */
    void startBlock(FieldReader.SegmentTermsEnum.Frame frame, boolean isFloor) {
      totalBlockCount++;
      if (isFloor) {
        // fp == fpOrig only for the first sub-block of a floor block:
        if (frame.fp == frame.fpOrig) {
          floorBlockCount++;
        }
        floorSubBlockCount++;
      } else {
        nonFloorBlockCount++;
      }

      if (blockCountByPrefixLen.length <= frame.prefix) {
        blockCountByPrefixLen = ArrayUtil.grow(blockCountByPrefixLen, 1+frame.prefix);
      }
      blockCountByPrefixLen[frame.prefix]++;
      startBlockCount++;
      totalBlockSuffixBytes += frame.suffixesReader.length();
      totalBlockStatsBytes += frame.statsReader.length();
    }

    /** Called when the walk leaves a block; classifies it by whether it
     *  held terms, sub-blocks or both, and accounts the remaining
     *  ("other") bytes of the block. */
    void endBlock(FieldReader.SegmentTermsEnum.Frame frame) {
      final int termCount = frame.isLeafBlock ? frame.entCount : frame.state.termBlockOrd;
      final int subBlockCount = frame.entCount - termCount;
      totalTermCount += termCount;
      if (termCount != 0 && subBlockCount != 0) {
        mixedBlockCount++;
      } else if (termCount != 0) {
        termsOnlyBlockCount++;
      } else if (subBlockCount != 0) {
        subBlocksOnlyBlockCount++;
      } else {
        // A block must contain at least one entry:
        throw new IllegalStateException();
      }
      endBlockCount++;
      // Bytes in the block not attributable to suffixes or stats:
      final long otherBytes = frame.fpEnd - frame.fp - frame.suffixesReader.length() - frame.statsReader.length();
      assert otherBytes > 0 : "otherBytes=" + otherBytes + " frame.fp=" + frame.fp + " frame.fpEnd=" + frame.fpEnd;
      totalBlockOtherBytes += otherBytes;
    }

    /** Called once per term visited. */
    void term(BytesRef term) {
      totalTermBytes += term.length;
    }

    /** Validates the counters after the walk completes. */
    void finish() {
      assert startBlockCount == endBlockCount: "startBlockCount=" + startBlockCount + " endBlockCount=" + endBlockCount;
      assert totalBlockCount == floorSubBlockCount + nonFloorBlockCount: "floorSubBlockCount=" + floorSubBlockCount + " nonFloorBlockCount=" + nonFloorBlockCount + " totalBlockCount=" + totalBlockCount;
      assert totalBlockCount == mixedBlockCount + termsOnlyBlockCount + subBlocksOnlyBlockCount: "totalBlockCount=" + totalBlockCount + " mixedBlockCount=" + mixedBlockCount + " subBlocksOnlyBlockCount=" + subBlocksOnlyBlockCount + " termsOnlyBlockCount=" + termsOnlyBlockCount;
    }

    /** Multi-line human-readable summary of all counters. */
    @Override
    public String toString() {
      final ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
      PrintStream out;
      try {
        out = new PrintStream(bos, false, "UTF-8");
      } catch (UnsupportedEncodingException bogus) {
        // UTF-8 is guaranteed by the JLS; this cannot happen.
        throw new RuntimeException(bogus);
      }

      out.println("  index FST:");
      out.println("    " + indexNodeCount + " nodes");
      out.println("    " + indexArcCount + " arcs");
      out.println("    " + indexNumBytes + " bytes");
      out.println("  terms:");
      out.println("    " + totalTermCount + " terms");
      out.println("    " + totalTermBytes + " bytes" + (totalTermCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalTermBytes)/totalTermCount) + " bytes/term)" : ""));
      out.println("  blocks:");
      out.println("    " + totalBlockCount + " blocks");
      out.println("    " + termsOnlyBlockCount + " terms-only blocks");
      out.println("    " + subBlocksOnlyBlockCount + " sub-block-only blocks");
      out.println("    " + mixedBlockCount + " mixed blocks");
      out.println("    " + floorBlockCount + " floor blocks");
      out.println("    " + (totalBlockCount-floorSubBlockCount) + " non-floor blocks");
      out.println("    " + floorSubBlockCount + " floor sub-blocks");
      out.println("    " + totalBlockSuffixBytes + " term suffix bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockSuffixBytes)/totalBlockCount) + " suffix-bytes/block)" : ""));
      out.println("    " + totalBlockStatsBytes + " term stats bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockStatsBytes)/totalBlockCount) + " stats-bytes/block)" : ""));
      out.println("    " + totalBlockOtherBytes + " other bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockOtherBytes)/totalBlockCount) + " other-bytes/block)" : ""));
      if (totalBlockCount != 0) {
        out.println("    by prefix length:");
        int total = 0;
        for(int prefix=0;prefix<blockCountByPrefixLen.length;prefix++) {
          final int blockCount = blockCountByPrefixLen[prefix];
          total += blockCount;
          if (blockCount != 0) {
            out.println("      " + String.format(Locale.ROOT, "%2d", prefix) + ": " + blockCount);
          }
        }
        assert totalBlockCount == total;
      }

      try {
        return bos.toString("UTF-8");
      } catch (UnsupportedEncodingException bogus) {
        throw new RuntimeException(bogus);
      }
    }
  }
+
+ final Outputs<BytesRef> fstOutputs = ByteSequenceOutputs.getSingleton();
+ final BytesRef NO_OUTPUT = fstOutputs.getNoOutput();
+
+ /** BlockTree's implementation of {@link Terms}. */
+ public final class FieldReader extends Terms {
+ final long numTerms;
+ final FieldInfo fieldInfo;
+ final long sumTotalTermFreq;
+ final long sumDocFreq;
+ final int docCount;
+ final long indexStartFP;
+ final long rootBlockFP;
+ final BytesRef rootCode;
+ private final FST<BytesRef> index;
+
+ //private boolean DEBUG;
+
    /**
     * Sole constructor; captures the per-field statistics read by the
     * outer reader and, when {@code indexIn} is non-null, loads the
     * field's FST terms index from a clone of that input.
     *
     * @param rootCode     encoded file pointer (plus flags) of the field's root block
     * @param indexStartFP file pointer of this field's FST in the index file
     * @param indexIn      terms index input, or null when the index was not opened
     */
    FieldReader(FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, long indexStartFP, IndexInput indexIn) throws IOException {
      assert numTerms > 0;
      this.fieldInfo = fieldInfo;
      //DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id");
      this.numTerms = numTerms;
      this.sumTotalTermFreq = sumTotalTermFreq;
      this.sumDocFreq = sumDocFreq;
      this.docCount = docCount;
      this.indexStartFP = indexStartFP;
      this.rootCode = rootCode;
      // if (DEBUG) {
      //   System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode=" + rootCode + " divisor=" + indexDivisor);
      // }

      // rootCode's vLong packs the root block's file pointer above the
      // low OUTPUT_FLAGS_NUM_BITS flag bits; shift the flags away:
      rootBlockFP = (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong() >>> BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS;

      if (indexIn != null) {
        // Clone so this FieldReader owns its own position in the index file:
        final IndexInput clone = indexIn.clone();
        //System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name);
        clone.seek(indexStartFP);
        index = new FST<BytesRef>(clone, ByteSequenceOutputs.getSingleton());

        /*
        if (false) {
          final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
          Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
          Util.toDot(index, w, false, false);
          System.out.println("FST INDEX: SAVED to " + dotFileName);
          w.close();
        }
        */
      } else {
        index = null;
      }
    }
+
    /** For debugging -- used by CheckIndex too.
     *  Walks every block of this field and returns the collected
     *  {@link Stats}. */
    // TODO: maybe push this into Terms?
    public Stats computeStats() throws IOException {
      return new SegmentTermsEnum().computeBlockStats();
    }
+
    /** Terms are stored in UTF-8 byte order. */
    @Override
    public Comparator<BytesRef> getComparator() {
      return BytesRef.getUTF8SortedAsUnicodeComparator();
    }
+
    /** True if this field indexed character offsets into postings. */
    @Override
    public boolean hasOffsets() {
      return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    }
+
    /** True if this field indexed positions into postings. */
    @Override
    public boolean hasPositions() {
      return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    }
+
    /** True if this field stored per-position payloads. */
    @Override
    public boolean hasPayloads() {
      return fieldInfo.hasPayloads();
    }
+
    /** Returns a fresh terms enum over this field; {@code reuse} is
     *  ignored by this implementation. */
    @Override
    public TermsEnum iterator(TermsEnum reuse) throws IOException {
      return new SegmentTermsEnum();
    }
+
    /** Number of terms in this field. */
    @Override
    public long size() {
      return numTerms;
    }
+
    /** Sum of totalTermFreq across all terms, or -1 for DOCS_ONLY fields. */
    @Override
    public long getSumTotalTermFreq() {
      return sumTotalTermFreq;
    }
+
    /** Sum of docFreq across all terms of this field. */
    @Override
    public long getSumDocFreq() {
      return sumDocFreq;
    }
+
    /** Number of documents that have at least one term for this field. */
    @Override
    public int getDocCount() {
      return docCount;
    }
+
+ @Override
+ public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
+ if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
+ throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
+ }
+ return new IntersectEnum(compiled, startTerm);
+ }
+
+ // NOTE: cannot seek!
+ private final class IntersectEnum extends TermsEnum {
+ private final IndexInput in;
+
+ private Frame[] stack;
+
+ @SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc<BytesRef>[] arcs = new FST.Arc[5];
+
+ private final RunAutomaton runAutomaton;
+ private final CompiledAutomaton compiledAutomaton;
+
+ private Frame currentFrame;
+
+ private final BytesRef term = new BytesRef();
+
+ private final FST.BytesReader fstReader;
+
      // TODO: can we share this with the frame in STE?
      /**
       * One frame of the intersect stack: holds the decoded state of a
       * single on-disk block (its entries, stats and floor data) plus
       * the automaton state on entry to that block.
       */
      private final class Frame {
        // Depth of this frame on the stack (0 = root block).
        final int ord;
        // Current file pointer of the block being read.
        long fp;
        // File pointer of the block as first entered (differs from fp
        // after skipping floor sub-blocks).
        long fpOrig;
        // File pointer just past this block's data (only set when
        // !isLastInFloor; see load()).
        long fpEnd;
        // Absolute fp of the last sub-block entry decoded.
        long lastSubFP;

        // State in automaton
        int state;

        // How many terms' metadata have been decoded so far (lazy).
        int metaDataUpto;

        byte[] suffixBytes = new byte[128];
        final ByteArrayDataInput suffixesReader = new ByteArrayDataInput();

        byte[] statBytes = new byte[64];
        final ByteArrayDataInput statsReader = new ByteArrayDataInput();

        byte[] floorData = new byte[32];
        final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();

        // Length of prefix shared by all terms in this block
        int prefix;

        // Number of entries (term or sub-block) in this block
        int entCount;

        // Which term we will next read
        int nextEnt;

        // True if this block is either not a floor block,
        // or, it's the last sub-block of a floor block
        boolean isLastInFloor;

        // True if all entries are terms
        boolean isLeafBlock;

        // Floor-block bookkeeping: how many follow-on sub-blocks
        // remain, and the first label of the next one (256 = none).
        int numFollowFloorBlocks;
        int nextFloorLabel;

        // Automaton transitions leaving this frame's entry state,
        // sorted; used to skip floor sub-blocks that cannot match.
        Transition[] transitions;
        int curTransitionMax;
        int transitionIndex;

        // Index (FST) arc that led to this block.
        FST.Arc<BytesRef> arc;

        final BlockTermState termState;

        // Cumulative output so far
        BytesRef outputPrefix;

        // Position/length of the current entry's suffix within suffixBytes.
        private int startBytePos;
        private int suffix;

        public Frame(int ord) throws IOException {
          this.ord = ord;
          termState = postingsReader.newTermState();
          // -1 marks totalTermFreq as "not available" (DOCS_ONLY fields):
          termState.totalTermFreq = -1;
        }

        /** Advances to the next floor sub-block whose label range can
         *  intersect the current transition, then loads it. */
        void loadNextFloorBlock() throws IOException {
          assert numFollowFloorBlocks > 0;
          //if (DEBUG) System.out.println("    loadNextFoorBlock trans=" + transitions[transitionIndex]);

          do {
            // Floor data stores deltas from fpOrig, shifted left by 1:
            fp = fpOrig + (floorDataReader.readVLong() >>> 1);
            numFollowFloorBlocks--;
            // if (DEBUG) System.out.println("    skip floor block2!  nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[transitionIndex].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks);
            if (numFollowFloorBlocks != 0) {
              nextFloorLabel = floorDataReader.readByte() & 0xff;
            } else {
              // Sentinel: no further sub-blocks.
              nextFloorLabel = 256;
            }
            // if (DEBUG) System.out.println("    nextFloorLabel=" + (char) nextFloorLabel);
          } while (numFollowFloorBlocks != 0 && nextFloorLabel <= transitions[transitionIndex].getMin());

          load(null);
        }

        /** Records the automaton state on entry and resets the
         *  transition cursor for this frame. */
        public void setState(int state) {
          this.state = state;
          transitionIndex = 0;
          transitions = compiledAutomaton.sortedTransitions[state];
          if (transitions.length != 0) {
            curTransitionMax = transitions[0].getMax();
          } else {
            curTransitionMax = -1;
          }
        }

        /** Reads this block's header, suffixes and stats from the terms
         *  file at {@code fp}. When {@code frameIndexData} (the FST
         *  output that led here) is non-null and flags a floor block,
         *  first skips floor sub-blocks the automaton cannot match. */
        void load(BytesRef frameIndexData) throws IOException {

          // if (DEBUG) System.out.println("    load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? transitions[0] : "n/a" + " state=" + state));

          if (frameIndexData != null && transitions.length != 0) {
            // Floor frame
            if (floorData.length < frameIndexData.length) {
              this.floorData = new byte[ArrayUtil.oversize(frameIndexData.length, 1)];
            }
            System.arraycopy(frameIndexData.bytes, frameIndexData.offset, floorData, 0, frameIndexData.length);
            floorDataReader.reset(floorData, 0, frameIndexData.length);
            // Skip first long -- has redundant fp, hasTerms
            // flag, isFloor flag
            final long code = floorDataReader.readVLong();
            if ((code & BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0) {
              numFollowFloorBlocks = floorDataReader.readVInt();
              nextFloorLabel = floorDataReader.readByte() & 0xff;
              // if (DEBUG) System.out.println("    numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + nextFloorLabel);

              // If current state is accept, we must process
              // first block in case it has empty suffix:
              if (!runAutomaton.isAccept(state)) {
                // Maybe skip floor blocks:
                while (numFollowFloorBlocks != 0 && nextFloorLabel <= transitions[0].getMin()) {
                  fp = fpOrig + (floorDataReader.readVLong() >>> 1);
                  numFollowFloorBlocks--;
                  // if (DEBUG) System.out.println("      skip floor block!  nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[0].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks);
                  if (numFollowFloorBlocks != 0) {
                    nextFloorLabel = floorDataReader.readByte() & 0xff;
                  } else {
                    nextFloorLabel = 256;
                  }
                }
              }
            }
          }

          in.seek(fp);
          // Block header vInt: entry count in the high bits, "last in
          // floor" flag in the low bit:
          int code = in.readVInt();
          entCount = code >>> 1;
          assert entCount > 0;
          isLastInFloor = (code & 1) != 0;

          // term suffixes:
          code = in.readVInt();
          isLeafBlock = (code & 1) != 0;
          int numBytes = code >>> 1;
          // if (DEBUG) System.out.println("      entCount=" + entCount + " lastInFloor?=" + isLastInFloor + " leafBlock?=" + isLeafBlock + " numSuffixBytes=" + numBytes);
          if (suffixBytes.length < numBytes) {
            suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
          }
          in.readBytes(suffixBytes, 0, numBytes);
          suffixesReader.reset(suffixBytes, 0, numBytes);

          // stats
          numBytes = in.readVInt();
          if (statBytes.length < numBytes) {
            statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
          }
          in.readBytes(statBytes, 0, numBytes);
          statsReader.reset(statBytes, 0, numBytes);
          metaDataUpto = 0;

          termState.termBlockOrd = 0;
          nextEnt = 0;

          postingsReader.readTermsBlock(in, fieldInfo, termState);

          if (!isLastInFloor) {
            // Sub-blocks of a single floor block are always
            // written one after another -- tail recurse:
            fpEnd = in.getFilePointer();
          }
        }

        // TODO: maybe add scanToLabel; should give perf boost

        /** Decodes the next entry; returns true if it's a sub-block. */
        public boolean next() {
          return isLeafBlock ? nextLeaf() : nextNonLeaf();
        }

        // Decodes next entry; returns true if it's a sub-block
        public boolean nextLeaf() {
          //if (DEBUG) System.out.println("  frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
          assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
          nextEnt++;
          // Leaf entries are just <suffixLen, suffixBytes>; always a term:
          suffix = suffixesReader.readVInt();
          startBytePos = suffixesReader.getPosition();
          suffixesReader.skipBytes(suffix);
          return false;
        }

        public boolean nextNonLeaf() {
          //if (DEBUG) System.out.println("  frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
          assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
          nextEnt++;
          // Low bit of the code distinguishes term (0) from sub-block (1):
          final int code = suffixesReader.readVInt();
          suffix = code >>> 1;
          startBytePos = suffixesReader.getPosition();
          suffixesReader.skipBytes(suffix);
          if ((code & 1) == 0) {
            // A normal term
            termState.termBlockOrd++;
            return false;
          } else {
            // A sub-block; make sub-FP absolute:
            lastSubFP = fp - suffixesReader.readVLong();
            return true;
          }
        }

        /** Ordinal of the current term within this block. */
        public int getTermBlockOrd() {
          return isLeafBlock ? nextEnt : termState.termBlockOrd;
        }

        /** Decodes term metadata (docFreq/totalTermFreq/postings
         *  pointers) up to the current term; decoding is lazy and
         *  sequential so earlier terms are caught up first. */
        public void decodeMetaData() throws IOException {

          // lazily catch up on metadata decode:
          final int limit = getTermBlockOrd();
          assert limit > 0;

          // We must set/incr state.termCount because
          // postings impl can look at this
          termState.termBlockOrd = metaDataUpto;

          // TODO: better API would be "jump straight to term=N"???
          while (metaDataUpto < limit) {

            // TODO: we could make "tiers" of metadata, ie,
            // decode docFreq/totalTF but don't decode postings
            // metadata; this way caller could get
            // docFreq/totalTF w/o paying decode cost for
            // postings

            // TODO: if docFreq were bulk decoded we could
            // just skipN here:
            termState.docFreq = statsReader.readVInt();
            //if (DEBUG) System.out.println("    dF=" + state.docFreq);
            if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
              // totalTermFreq is delta-coded against docFreq:
              termState.totalTermFreq = termState.docFreq + statsReader.readVLong();
              //if (DEBUG) System.out.println("    totTF=" + state.totalTermFreq);
            }

            postingsReader.nextTerm(fieldInfo, termState);
            metaDataUpto++;
            termState.termBlockOrd++;
          }
        }
      }
+
+ private BytesRef savedStartTerm;
+
      // TODO: in some cases we can filter by length?  eg
      // regexp foo*bar must be at least length 6 bytes
      /**
       * Sole constructor; positions the enum at the root block and, when
       * {@code startTerm} is non-null, seek-floors to it.
       *
       * <p>NOTE(review): {@code index.getFirstArc} below is called
       * unconditionally, so this presumably requires the terms index to
       * have been loaded ({@code indexDivisor != -1}); confirm callers
       * never intersect without an index.</p>
       */
      public IntersectEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
        // if (DEBUG) {
        //   System.out.println("\nintEnum.init seg=" + segment + " commonSuffix=" + brToString(compiled.commonSuffixRef));
        // }
        runAutomaton = compiled.runAutomaton;
        compiledAutomaton = compiled;
        // Clone so this enum owns its own position in the terms file:
        in = BlockTreeTermsReader.this.in.clone();
        stack = new Frame[5];
        for(int idx=0;idx<stack.length;idx++) {
          stack[idx] = new Frame(idx);
        }
        for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) {
          arcs[arcIdx] = new FST.Arc<BytesRef>();
        }

        if (index == null) {
          fstReader = null;
        } else {
          fstReader = index.getBytesReader();
        }

        // TODO: if the automaton is "smallish" we really
        // should use the terms index to seek at least to
        // the initial term and likely to subsequent terms
        // (or, maybe just fallback to ATE for such cases).
        // Else the seek cost of loading the frames will be
        // too costly.

        final FST.Arc<BytesRef> arc = index.getFirstArc(arcs[0]);
        // Empty string prefix must have an output in the index!
        assert arc.isFinal();

        // Special pushFrame since it's the first one:
        final Frame f = stack[0];
        f.fp = f.fpOrig = rootBlockFP;
        f.prefix = 0;
        f.setState(runAutomaton.getInitialState());
        f.arc = arc;
        f.outputPrefix = arc.output;
        f.load(rootCode);

        // for assert:
        assert setSavedStartTerm(startTerm);

        currentFrame = f;
        if (startTerm != null) {
          seekToStartTerm(startTerm);
        }
      }
+
+ // only for assert:
+ private boolean setSavedStartTerm(BytesRef startTerm) {
+ savedStartTerm = startTerm == null ? null : BytesRef.deepCopyOf(startTerm);
+ return true;
+ }
+
      /** Returns a cloned snapshot of the current term's state; metadata
       *  is decoded lazily, so catch up first. */
      @Override
      public TermState termState() throws IOException {
        currentFrame.decodeMetaData();
        return currentFrame.termState.clone();
      }
+
+ private Frame getFrame(int ord) throws IOException {
+ if (ord >= stack.length) {
+ final Frame[] next = new Frame[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+ System.arraycopy(stack, 0, next, 0, stack.length);
+ for(int stackOrd=stack.length;stackOrd<next.length;stackOrd++) {
+ next[stackOrd] = new Frame(stackOrd);
+ }
+ stack = next;
+ }
+ assert stack[ord].ord == ord;
+ return stack[ord];
+ }
+
+ private FST.Arc<BytesRef> getArc(int ord) {
+ if (ord >= arcs.length) {
+ @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc<BytesRef>[] next =
+ new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+ System.arraycopy(arcs, 0, next, 0, arcs.length);
+ for(int arcOrd=arcs.length;arcOrd<next.length;arcOrd++) {
+ next[arcOrd] = new FST.Arc<BytesRef>();
+ }
+ arcs = next;
+ }
+ return arcs[ord];
+ }
+
      /** Pushes a new frame for the sub-block the current frame just
       *  decoded ({@code lastSubFP}), entering automaton state
       *  {@code state}, and walks the index FST along the new prefix so
       *  the frame has floor data available. */
      private Frame pushFrame(int state) throws IOException {
        final Frame f = getFrame(currentFrame == null ? 0 : 1+currentFrame.ord);

        f.fp = f.fpOrig = currentFrame.lastSubFP;
        f.prefix = currentFrame.prefix + currentFrame.suffix;
        // if (DEBUG) System.out.println("    pushFrame state=" + state + " prefix=" + f.prefix);
        f.setState(state);

        // Walk the arc through the index -- we only
        // "bother" with this so we can get the floor data
        // from the index and skip floor blocks when
        // possible:
        FST.Arc<BytesRef> arc = currentFrame.arc;
        int idx = currentFrame.prefix;
        assert currentFrame.suffix > 0;
        BytesRef output = currentFrame.outputPrefix;
        while (idx < f.prefix) {
          final int target = term.bytes[idx] & 0xff;
          // TODO: we could be more efficient for the next()
          // case by using current arc as starting point,
          // passed to findTargetArc
          arc = index.findTargetArc(target, arc, getArc(1+idx), fstReader);
          assert arc != null;
          // Accumulate the FST output along the path:
          output = fstOutputs.add(output, arc.output);
          idx++;
        }

        f.arc = arc;
        f.outputPrefix = output;
        assert arc.isFinal();
        f.load(fstOutputs.add(output, arc.nextFinalOutput));
        return f;
      }
+
    /** Returns the current term; the returned ref is reused across calls. */
    @Override
    public BytesRef term() {
      return term;
    }
+
    /** Returns the doc freq of the current term, decoding metadata lazily. */
    @Override
    public int docFreq() throws IOException {
      //if (DEBUG) System.out.println("BTIR.docFreq");
      currentFrame.decodeMetaData();
      //if (DEBUG) System.out.println("  return " + currentFrame.termState.docFreq);
      return currentFrame.termState.docFreq;
    }
+
    /** Returns the total term freq of the current term, decoding metadata lazily. */
    @Override
    public long totalTermFreq() throws IOException {
      currentFrame.decodeMetaData();
      return currentFrame.termState.totalTermFreq;
    }
+
    /** Returns a DocsEnum for the current term via the postings reader. */
    @Override
    public DocsEnum docs(Bits skipDocs, DocsEnum reuse, int flags) throws IOException {
      currentFrame.decodeMetaData();
      return postingsReader.docs(fieldInfo, currentFrame.termState, skipDocs, reuse, flags);
    }
+
    /**
     * Returns a positions enum for the current term, or null if this field
     * was not indexed with positions.
     */
    @Override
    public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
      if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
        // Positions were not indexed:
        return null;
      }

      currentFrame.decodeMetaData();
      return postingsReader.docsAndPositions(fieldInfo, currentFrame.termState, skipDocs, reuse, flags);
    }
+
+ private int getState() {
+ int state = currentFrame.state;
+ for(int idx=0;idx<currentFrame.suffix;idx++) {
+ state = runAutomaton.step(state, currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff);
+ assert state != -1;
+ }
+ return state;
+ }
+
    // NOTE: specialized to only doing the first-time
    // seek, but we could generalize it to allow
    // arbitrary seekExact/Ceil. Note that this is a
    // seekFloor!
    /**
     * Positions the enum so the first call to next() returns the first term
     * after {@code target} (floor semantics: on overshoot we back up one
     * entry by restoring the saved frame cursor state).
     */
    private void seekToStartTerm(BytesRef target) throws IOException {
      //if (DEBUG) System.out.println("seek to startTerm=" + target.utf8ToString());
      assert currentFrame.ord == 0;
      if (term.length < target.length) {
        term.bytes = ArrayUtil.grow(term.bytes, target.length);
      }
      FST.Arc<BytesRef> arc = arcs[0];
      assert arc == currentFrame.arc;

      for(int idx=0;idx<=target.length;idx++) {

        while (true) {
          // Snapshot the frame cursor so we can rewind one entry on overshoot:
          final int savePos = currentFrame.suffixesReader.getPosition();
          final int saveStartBytePos = currentFrame.startBytePos;
          final int saveSuffix = currentFrame.suffix;
          final long saveLastSubFP = currentFrame.lastSubFP;
          final int saveTermBlockOrd = currentFrame.termState.termBlockOrd;

          final boolean isSubBlock = currentFrame.next();

          //if (DEBUG) System.out.println("    cycle ent=" + currentFrame.nextEnt + " (of " + currentFrame.entCount + ") prefix=" + currentFrame.prefix + " suffix=" + currentFrame.suffix + " isBlock=" + isSubBlock + " firstLabel=" + (currentFrame.suffix == 0 ? "" : (currentFrame.suffixBytes[currentFrame.startBytePos])&0xff));
          term.length = currentFrame.prefix + currentFrame.suffix;
          if (term.bytes.length < term.length) {
            term.bytes = ArrayUtil.grow(term.bytes, term.length);
          }
          System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);

          if (isSubBlock && StringHelper.startsWith(target, term)) {
            // Recurse
            //if (DEBUG) System.out.println("      recurse!");
            currentFrame = pushFrame(getState());
            break;
          } else {
            final int cmp = term.compareTo(target);
            if (cmp < 0) {
              if (currentFrame.nextEnt == currentFrame.entCount) {
                if (!currentFrame.isLastInFloor) {
                  //if (DEBUG) System.out.println("  load floorBlock");
                  currentFrame.loadNextFloorBlock();
                  continue;
                } else {
                  //if (DEBUG) System.out.println("  return term=" + brToString(term));
                  return;
                }
              }
              continue;
            } else if (cmp == 0) {
              //if (DEBUG) System.out.println("  return term=" + brToString(term));
              return;
            } else {
              // Fallback to prior entry: the semantics of
              // this method is that the first call to
              // next() will return the term after the
              // requested term
              currentFrame.nextEnt--;
              currentFrame.lastSubFP = saveLastSubFP;
              currentFrame.startBytePos = saveStartBytePos;
              currentFrame.suffix = saveSuffix;
              currentFrame.suffixesReader.setPosition(savePos);
              currentFrame.termState.termBlockOrd = saveTermBlockOrd;
              System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);
              term.length = currentFrame.prefix + currentFrame.suffix;
              // If the last entry was a block we don't
              // need to bother recursing and pushing to
              // the last term under it because the first
              // next() will simply skip the frame anyway
              return;
            }
          }
        }
      }

      // Unreachable: the loop always returns or recurses before idx exceeds target.length.
      assert false;
    }
+
    /**
     * Advances to the next term accepted by the automaton, or returns null
     * when the intersection is exhausted.  Walks the block tree depth-first,
     * pruning by transition labels and the compiled automaton's common suffix
     * before running the automaton over each candidate suffix.
     */
    @Override
    public BytesRef next() throws IOException {

      // if (DEBUG) {
      //   System.out.println("\nintEnum.next seg=" + segment);
      //   System.out.println("  frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix);
      // }

      nextTerm:
      while(true) {
        // Pop finished frames
        while (currentFrame.nextEnt == currentFrame.entCount) {
          if (!currentFrame.isLastInFloor) {
            //if (DEBUG) System.out.println("    next-floor-block");
            currentFrame.loadNextFloorBlock();
            //if (DEBUG) System.out.println("\n  frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix);
          } else {
            //if (DEBUG) System.out.println("  pop frame");
            if (currentFrame.ord == 0) {
              // Root frame exhausted: intersection is done.
              return null;
            }
            final long lastFP = currentFrame.fpOrig;
            currentFrame = stack[currentFrame.ord-1];
            assert currentFrame.lastSubFP == lastFP;
            //if (DEBUG) System.out.println("\n  frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix);
          }
        }

        final boolean isSubBlock = currentFrame.next();
        // if (DEBUG) {
        //   final BytesRef suffixRef = new BytesRef();
        //   suffixRef.bytes = currentFrame.suffixBytes;
        //   suffixRef.offset = currentFrame.startBytePos;
        //   suffixRef.length = currentFrame.suffix;
        //   System.out.println("    " + (isSubBlock ? "sub-block" : "term") + " " + currentFrame.nextEnt + " (of " + currentFrame.entCount + ") suffix=" + brToString(suffixRef));
        // }

        if (currentFrame.suffix != 0) {
          // Skip entries whose first label is beyond the current transition range:
          final int label = currentFrame.suffixBytes[currentFrame.startBytePos] & 0xff;
          while (label > currentFrame.curTransitionMax) {
            if (currentFrame.transitionIndex >= currentFrame.transitions.length-1) {
              // Stop processing this frame -- no further
              // matches are possible because we've moved
              // beyond what the max transition will allow
              //if (DEBUG) System.out.println("      break: trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]));

              // sneaky! forces a pop above
              currentFrame.isLastInFloor = true;
              currentFrame.nextEnt = currentFrame.entCount;
              continue nextTerm;
            }
            currentFrame.transitionIndex++;
            currentFrame.curTransitionMax = currentFrame.transitions[currentFrame.transitionIndex].getMax();
            //if (DEBUG) System.out.println("      next trans=" + currentFrame.transitions[currentFrame.transitionIndex]);
          }
        }

        // First test the common suffix, if set:
        if (compiledAutomaton.commonSuffixRef != null && !isSubBlock) {
          final int termLen = currentFrame.prefix + currentFrame.suffix;
          if (termLen < compiledAutomaton.commonSuffixRef.length) {
            // No match
            // if (DEBUG) {
            //   System.out.println("      skip: common suffix length");
            // }
            continue nextTerm;
          }

          final byte[] suffixBytes = currentFrame.suffixBytes;
          final byte[] commonSuffixBytes = compiledAutomaton.commonSuffixRef.bytes;

          final int lenInPrefix = compiledAutomaton.commonSuffixRef.length - currentFrame.suffix;
          assert compiledAutomaton.commonSuffixRef.offset == 0;
          int suffixBytesPos;
          int commonSuffixBytesPos = 0;

          if (lenInPrefix > 0) {
            // A prefix of the common suffix overlaps with
            // the suffix of the block prefix so we first
            // test whether the prefix part matches:
            final byte[] termBytes = term.bytes;
            int termBytesPos = currentFrame.prefix - lenInPrefix;
            assert termBytesPos >= 0;
            final int termBytesPosEnd = currentFrame.prefix;
            while (termBytesPos < termBytesPosEnd) {
              if (termBytes[termBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
                // if (DEBUG) {
                //   System.out.println("      skip: common suffix mismatch (in prefix)");
                // }
                continue nextTerm;
              }
            }
            suffixBytesPos = currentFrame.startBytePos;
          } else {
            suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - compiledAutomaton.commonSuffixRef.length;
          }

          // Test overlapping suffix part:
          final int commonSuffixBytesPosEnd = compiledAutomaton.commonSuffixRef.length;
          while (commonSuffixBytesPos < commonSuffixBytesPosEnd) {
            if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
              // if (DEBUG) {
              //   System.out.println("      skip: common suffix mismatch");
              // }
              continue nextTerm;
            }
          }
        }

        // TODO: maybe we should do the same linear test
        // that AutomatonTermsEnum does, so that if we
        // reach a part of the automaton where .* is
        // "temporarily" accepted, we just blindly .next()
        // until the limit

        // See if the term prefix matches the automaton:
        int state = currentFrame.state;
        for (int idx=0;idx<currentFrame.suffix;idx++) {
          state = runAutomaton.step(state,  currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff);
          if (state == -1) {
            // No match
            //System.out.println("    no s=" + state);
            continue nextTerm;
          } else {
            //System.out.println("    c s=" + state);
          }
        }

        if (isSubBlock) {
          // Match!  Recurse:
          //if (DEBUG) System.out.println("      sub-block match to state=" + state + "; recurse fp=" + currentFrame.lastSubFP);
          copyTerm();
          currentFrame = pushFrame(state);
          //if (DEBUG) System.out.println("\n  frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix);
        } else if (runAutomaton.isAccept(state)) {
          copyTerm();
          //if (DEBUG) System.out.println("      term match to state=" + state + "; return term=" + brToString(term));
          assert savedStartTerm == null || term.compareTo(savedStartTerm) > 0: "saveStartTerm=" + savedStartTerm.utf8ToString() + " term=" + term.utf8ToString();
          return term;
        } else {
          //System.out.println("    no s=" + state);
        }
      }
    }
+
+ private void copyTerm() {
+ //System.out.println(" copyTerm cur.prefix=" + currentFrame.prefix + " cur.suffix=" + currentFrame.suffix + " first=" + (char) currentFrame.suffixBytes[currentFrame.startBytePos]);
+ final int len = currentFrame.prefix + currentFrame.suffix;
+ if (term.bytes.length < len) {
+ term.bytes = ArrayUtil.grow(term.bytes, len);
+ }
+ System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);
+ term.length = len;
+ }
+
    /** Terms are ordered by UTF-8 byte order. */
    @Override
    public Comparator<BytesRef> getComparator() {
      return BytesRef.getUTF8SortedAsUnicodeComparator();
    }
+
    // This enum is advanced only via next(); arbitrary seeks are not supported.
    @Override
    public boolean seekExact(BytesRef text, boolean useCache) {
      throw new UnsupportedOperationException();
    }
+
    // Ord-based seeking is not supported by this enum.
    @Override
    public void seekExact(long ord) {
      throw new UnsupportedOperationException();
    }
+
    // Term ordinals are not tracked by this enum.
    @Override
    public long ord() {
      throw new UnsupportedOperationException();
    }
+
    // This enum is advanced only via next(); arbitrary seeks are not supported.
    @Override
    public SeekStatus seekCeil(BytesRef text, boolean useCache) {
      throw new UnsupportedOperationException();
    }
+ }
+
+ // Iterates through terms in this field
+ private final class SegmentTermsEnum extends TermsEnum {
    // Lazily cloned from the reader's IndexInput (see initIndexInput) so each
    // enum has its own file position.
    private IndexInput in;

    private Frame[] stack;
    // Holds seek-by-TermState / cached-seek state; has ord == -1.
    private final Frame staticFrame;
    private Frame currentFrame;
    // Whether the bytes in `term` currently name an existing term (vs. a seek target).
    private boolean termExists;

    private int targetBeforeCurrentLength;

    private final ByteArrayDataInput scratchReader = new ByteArrayDataInput();

    // What prefix of the current term was present in the index:
    private int validIndexPrefix;

    // assert only:
    private boolean eof;

    final BytesRef term = new BytesRef();
    private final FST.BytesReader fstReader;

    // Scratch arcs, one per prefix depth; grown on demand by getArc().
    @SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc<BytesRef>[] arcs =
        new FST.Arc[1];
+
+ public SegmentTermsEnum() throws IOException {
+ //if (DEBUG) System.out.println("BTTR.init seg=" + segment);
+ stack = new Frame[0];
+
+ // Used to hold seek by TermState, or cached seek
+ staticFrame = new Frame(-1);
+
+ if (index == null) {
+ fstReader = null;
+ } else {
+ fstReader = index.getBytesReader();
+ }
+
+ // Init w/ root block; don't use index since it may
+ // not (and need not) have been loaded
+ for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) {
+ arcs[arcIdx] = new FST.Arc<BytesRef>();
+ }
+
+ currentFrame = staticFrame;
+ final FST.Arc<BytesRef> arc;
+ if (index != null) {
+ arc = index.getFirstArc(arcs[0]);
+ // Empty string prefix must have an output in the index!
+ assert arc.isFinal();
+ } else {
+ arc = null;
+ }
+ currentFrame = staticFrame;
+ //currentFrame = pushFrame(arc, rootCode, 0);
+ //currentFrame.loadBlock();
+ validIndexPrefix = 0;
+ // if (DEBUG) {
+ // System.out.println("init frame state " + currentFrame.ord);
+ // printSeekState();
+ // }
+
+ //System.out.println();
+ // computeBlockStats().print(System.out);
+ }
+
    // Not private to avoid synthetic access$NNN methods
    /** Lazily clones the reader's IndexInput the first time it is needed. */
    void initIndexInput() {
      if (this.in == null) {
        this.in = BlockTreeTermsReader.this.in.clone();
      }
    }
+
    /** Runs next() through the entire terms dict,
     *  computing aggregate statistics.
     *  After returning, the enum is repositioned to before the first term
     *  (root frame re-pushed, {@code term} cleared). */
    public Stats computeBlockStats() throws IOException {

      Stats stats = new Stats(segment, fieldInfo.name);
      if (index != null) {
        stats.indexNodeCount = index.getNodeCount();
        stats.indexArcCount = index.getArcCount();
        stats.indexNumBytes = index.sizeInBytes();
      }

      currentFrame = staticFrame;
      FST.Arc<BytesRef> arc;
      if (index != null) {
        arc = index.getFirstArc(arcs[0]);
        // Empty string prefix must have an output in the index!
        assert arc.isFinal();
      } else {
        arc = null;
      }

      // Empty string prefix must have an output in the
      // index!
      currentFrame = pushFrame(arc, rootCode, 0);
      currentFrame.fpOrig = currentFrame.fp;
      currentFrame.loadBlock();
      validIndexPrefix = 0;

      stats.startBlock(currentFrame, !currentFrame.isLastInFloor);

      allTerms:
      while (true) {

        // Pop finished blocks
        while (currentFrame.nextEnt == currentFrame.entCount) {
          stats.endBlock(currentFrame);
          if (!currentFrame.isLastInFloor) {
            currentFrame.loadNextFloorBlock();
            stats.startBlock(currentFrame, true);
          } else {
            if (currentFrame.ord == 0) {
              break allTerms;
            }
            final long lastFP = currentFrame.fpOrig;
            currentFrame = stack[currentFrame.ord-1];
            assert lastFP == currentFrame.lastSubFP;
            // if (DEBUG) {
            //   System.out.println("  reset validIndexPrefix=" + validIndexPrefix);
            // }
          }
        }

        while(true) {
          if (currentFrame.next()) {
            // Push to new block:
            currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length);
            currentFrame.fpOrig = currentFrame.fp;
            // This is a "next" frame -- even if it's
            // floor'd we must pretend it isn't so we don't
            // try to scan to the right floor frame:
            currentFrame.isFloor = false;
            //currentFrame.hasTerms = true;
            currentFrame.loadBlock();
            stats.startBlock(currentFrame, !currentFrame.isLastInFloor);
          } else {
            stats.term(term);
            break;
          }
        }
      }

      stats.finish();

      // Put root frame back:
      currentFrame = staticFrame;
      if (index != null) {
        arc = index.getFirstArc(arcs[0]);
        // Empty string prefix must have an output in the index!
        assert arc.isFinal();
      } else {
        arc = null;
      }
      currentFrame = pushFrame(arc, rootCode, 0);
      currentFrame.rewind();
      currentFrame.loadBlock();
      validIndexPrefix = 0;
      term.length = 0;

      return stats;
    }
+
+ private Frame getFrame(int ord) throws IOException {
+ if (ord >= stack.length) {
+ final Frame[] next = new Frame[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+ System.arraycopy(stack, 0, next, 0, stack.length);
+ for(int stackOrd=stack.length;stackOrd<next.length;stackOrd++) {
+ next[stackOrd] = new Frame(stackOrd);
+ }
+ stack = next;
+ }
+ assert stack[ord].ord == ord;
+ return stack[ord];
+ }
+
+ private FST.Arc<BytesRef> getArc(int ord) {
+ if (ord >= arcs.length) {
+ @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc<BytesRef>[] next =
+ new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+ System.arraycopy(arcs, 0, next, 0, arcs.length);
+ for(int arcOrd=arcs.length;arcOrd<next.length;arcOrd++) {
+ next[arcOrd] = new FST.Arc<BytesRef>();
+ }
+ arcs = next;
+ }
+ return arcs[ord];
+ }
+
    /** Terms are ordered by UTF-8 byte order. */
    @Override
    public Comparator<BytesRef> getComparator() {
      return BytesRef.getUTF8SortedAsUnicodeComparator();
    }
+
+ // Pushes a frame we seek'd to
+ Frame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length) throws IOException {
+ scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
+ final long code = scratchReader.readVLong();
+ final long fpSeek = code >>> BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS;
+ final Frame f = getFrame(1+currentFrame.ord);
+ f.hasTerms = (code & BlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0;
+ f.hasTermsOrig = f.hasTerms;
+ f.isFloor = (code & BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0;
+ if (f.isFloor) {
+ f.setFloorData(scratchReader, frameData);
+ }
+ pushFrame(arc, fpSeek, length);
+
+ return f;
+ }
+
    // Pushes next'd frame or seek'd frame; we later
    // lazy-load the frame only when needed
    /**
     * Pushes a frame for the block at file pointer {@code fp} with prefix
     * {@code length}.  If the slot already holds this block (same fpOrig and
     * a started scan) the frame is re-used, rewinding only when the prior
     * scan went past the shared-prefix point.
     */
    Frame pushFrame(FST.Arc<BytesRef> arc, long fp, int length) throws IOException {
      final Frame f = getFrame(1+currentFrame.ord);
      f.arc = arc;
      if (f.fpOrig == fp && f.nextEnt != -1) {
        //if (DEBUG) System.out.println("      push reused frame ord=" + f.ord + " fp=" + f.fp + " isFloor?=" + f.isFloor + " hasTerms=" + f.hasTerms + " pref=" + term + " nextEnt=" + f.nextEnt + " targetBeforeCurrentLength=" + targetBeforeCurrentLength + " term.length=" + term.length + " vs prefix=" + f.prefix);
        if (f.prefix > targetBeforeCurrentLength) {
          f.rewind();
        } else {
          // if (DEBUG) {
          //   System.out.println("        skip rewind!");
          // }
        }
        assert length == f.prefix;
      } else {
        // Fresh (or different) block: reset the frame's cursor state.
        f.nextEnt = -1;
        f.prefix = length;
        f.state.termBlockOrd = 0;
        f.fpOrig = f.fp = fp;
        f.lastSubFP = -1;
        // if (DEBUG) {
        //   final int sav = term.length;
        //   term.length = length;
        //   System.out.println("      push new frame ord=" + f.ord + " fp=" + f.fp + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " pref=" + brToString(term));
        //   term.length = sav;
        // }
      }

      return f;
    }
+
    // asserts only
    /** Clears the eof flag; always returns true so it can live inside an assert. */
    private boolean clearEOF() {
      eof = false;
      return true;
    }
+
    // asserts only
    /** Sets the eof flag; always returns true so it can live inside an assert. */
    private boolean setEOF() {
      eof = true;
      return true;
    }
+
    /**
     * Seeks exactly to {@code target}, returning true if the term exists.
     * Re-uses as much of the current seek state (FST arcs + frame stack) as
     * shares a prefix with the target, then walks the remaining target bytes
     * through the index.  {@code useCache} is accepted for API compatibility
     * but never read in this implementation.
     */
    @Override
    public boolean seekExact(final BytesRef target, final boolean useCache) throws IOException {

      if (index == null) {
        throw new IllegalStateException("terms index was not loaded");
      }

      if (term.bytes.length <= target.length) {
        term.bytes = ArrayUtil.grow(term.bytes, 1+target.length);
      }

      assert clearEOF();

      // if (DEBUG) {
      //   System.out.println("\nBTTR.seekExact seg=" + segment + " target=" + fieldInfo.name + ":" + brToString(target) + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix=" + validIndexPrefix);
      //   printSeekState();
      // }

      FST.Arc<BytesRef> arc;
      int targetUpto;
      BytesRef output;

      targetBeforeCurrentLength = currentFrame.ord;

      if (currentFrame != staticFrame) {

        // We are already seek'd; find the common
        // prefix of new seek term vs current term and
        // re-use the corresponding seek state.  For
        // example, if app first seeks to foobar, then
        // seeks to foobaz, we can re-use the seek state
        // for the first 5 bytes.

        // if (DEBUG) {
        //   System.out.println("  re-use current seek state validIndexPrefix=" + validIndexPrefix);
        // }

        arc = arcs[0];
        assert arc.isFinal();
        output = arc.output;
        targetUpto = 0;

        Frame lastFrame = stack[0];
        assert validIndexPrefix <= term.length;

        final int targetLimit = Math.min(target.length, validIndexPrefix);

        int cmp = 0;

        // TODO: reverse vLong byte order for better FST
        // prefix output sharing

        // First compare up to valid seek frames:
        while (targetUpto < targetLimit) {
          cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
          // if (DEBUG) {
          //   System.out.println("    cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output);
          // }
          if (cmp != 0) {
            break;
          }
          arc = arcs[1+targetUpto];
          //if (arc.label != (target.bytes[target.offset + targetUpto] & 0xFF)) {
          //System.out.println("FAIL: arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF));
          //}
          assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
          if (arc.output != NO_OUTPUT) {
            output = fstOutputs.add(output, arc.output);
          }
          if (arc.isFinal()) {
            lastFrame = stack[1+lastFrame.ord];
          }
          targetUpto++;
        }

        if (cmp == 0) {
          final int targetUptoMid = targetUpto;

          // Second compare the rest of the term, but
          // don't save arc/output/frame; we only do this
          // to find out if the target term is before,
          // equal or after the current term
          final int targetLimit2 = Math.min(target.length, term.length);
          while (targetUpto < targetLimit2) {
            cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
            // if (DEBUG) {
            //   System.out.println("    cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")");
            // }
            if (cmp != 0) {
              break;
            }
            targetUpto++;
          }

          if (cmp == 0) {
            // Shared prefix exhausted; shorter term sorts first.
            cmp = term.length - target.length;
          }
          targetUpto = targetUptoMid;
        }

        if (cmp < 0) {
          // Common case: target term is after current
          // term, ie, app is seeking multiple terms
          // in sorted order
          // if (DEBUG) {
          //   System.out.println("  target is after current (shares prefixLen=" + targetUpto + "); frame.ord=" + lastFrame.ord);
          // }
          currentFrame = lastFrame;

        } else if (cmp > 0) {
          // Uncommon case: target term
          // is before current term; this means we can
          // keep the currentFrame but we must rewind it
          // (so we scan from the start)
          targetBeforeCurrentLength = 0;
          // if (DEBUG) {
          //   System.out.println("  target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord);
          // }
          currentFrame = lastFrame;
          currentFrame.rewind();
        } else {
          // Target is exactly the same as current term
          assert term.length == target.length;
          if (termExists) {
            // if (DEBUG) {
            //   System.out.println("  target is same as current; return true");
            // }
            return true;
          } else {
            // if (DEBUG) {
            //   System.out.println("  target is same as current but term doesn't exist");
            // }
          }
          //validIndexPrefix = currentFrame.depth;
          //term.length = target.length;
          //return termExists;
        }

      } else {

        // No prior seek state: start from the root frame.
        targetBeforeCurrentLength = -1;
        arc = index.getFirstArc(arcs[0]);

        // Empty string prefix must have an output (block) in the index!
        assert arc.isFinal();
        assert arc.output != null;

        // if (DEBUG) {
        //   System.out.println("    no seek state; push root frame");
        // }

        output = arc.output;

        currentFrame = staticFrame;

        //term.length = 0;
        targetUpto = 0;
        currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), 0);
      }

      // if (DEBUG) {
      //   System.out.println("  start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord=" + currentFrame.ord + " targetBeforeCurrentLength=" + targetBeforeCurrentLength);
      // }

      while (targetUpto < target.length) {

        final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;

        final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader);

        if (nextArc == null) {

          // Index is exhausted
          // if (DEBUG) {
          //   System.out.println("    index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel));
          // }

          validIndexPrefix = currentFrame.prefix;
          //validIndexPrefix = targetUpto;

          currentFrame.scanToFloorFrame(target);

          if (!currentFrame.hasTerms) {
            termExists = false;
            term.bytes[targetUpto] = (byte) targetLabel;
            term.length = 1+targetUpto;
            // if (DEBUG) {
            //   System.out.println("  FAST NOT_FOUND term=" + brToString(term));
            // }
            return false;
          }

          currentFrame.loadBlock();

          final SeekStatus result = currentFrame.scanToTerm(target, true);
          if (result == SeekStatus.FOUND) {
            // if (DEBUG) {
            //   System.out.println("  return FOUND term=" + term.utf8ToString() + " " + term);
            // }
            return true;
          } else {
            // if (DEBUG) {
            //   System.out.println("  got " + result + "; return NOT_FOUND term=" + brToString(term));
            // }
            return false;
          }
        } else {
          // Follow this arc
          arc = nextArc;
          term.bytes[targetUpto] = (byte) targetLabel;
          // Aggregate output as we go:
          assert arc.output != null;
          if (arc.output != NO_OUTPUT) {
            output = fstOutputs.add(output, arc.output);
          }

          // if (DEBUG) {
          //   System.out.println("    index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
          // }
          targetUpto++;

          if (arc.isFinal()) {
            //if (DEBUG) System.out.println("    arc is final!");
            currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), targetUpto);
            //if (DEBUG) System.out.println("    curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms);
          }
        }
      }

      //validIndexPrefix = targetUpto;
      validIndexPrefix = currentFrame.prefix;

      currentFrame.scanToFloorFrame(target);

      // Target term is entirely contained in the index:
      if (!currentFrame.hasTerms) {
        termExists = false;
        term.length = targetUpto;
        // if (DEBUG) {
        //   System.out.println("  FAST NOT_FOUND term=" + brToString(term));
        // }
        return false;
      }

      currentFrame.loadBlock();

      final SeekStatus result = currentFrame.scanToTerm(target, true);
      if (result == SeekStatus.FOUND) {
        // if (DEBUG) {
        //   System.out.println("  return FOUND term=" + term.utf8ToString() + " " + term);
        // }
        return true;
      } else {
        // if (DEBUG) {
        //   System.out.println("  got result " + result + "; return NOT_FOUND term=" + term.utf8ToString());
        // }

        return false;
      }
    }
+
+ @Override
+ public SeekStatus seekCeil(final BytesRef target, final boolean useCache) throws IOException {
+ if (index == null) {
+ throw new IllegalStateException("terms index was not loaded");
+ }
+
+ if (term.bytes.length <= target.length) {
+ term.bytes = ArrayUtil.grow(term.bytes, 1+target.length);
+ }
+
+ assert clearEOF();
+
+ //if (DEBUG) {
+ //System.out.println("\nBTTR.seekCeil seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix= " + validIndexPrefix);
+ //printSeekState();
+ //}
+
+ FST.Arc<BytesRef> arc;
+ int targetUpto;
+ BytesRef output;
+
+ targetBeforeCurrentLength = currentFrame.ord;
+
+ if (currentFrame != staticFrame) {
+
+ // We are already seek'd; find the common
+ // prefix of new seek term vs current term and
+ // re-use the corresponding seek state. For
+ // example, if app first seeks to foobar, then
+ // seeks to foobaz, we can re-use the seek state
+ // for the first 5 bytes.
+
+ //if (DEBUG) {
+ //System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix);
+ //}
+
+ arc = arcs[0];
+ assert arc.isFinal();
+ output = arc.output;
+ targetUpto = 0;
+
+ Frame lastFrame = stack[0];
+ assert validIndexPrefix <= term.length;
+
+ final int targetLimit = Math.min(target.length, validIndexPrefix);
+
+ int cmp = 0;
+
+ // TOOD: we should write our vLong backwards (MSB
+ // first) to get better sharing from the FST
+
+ // First compare up to valid seek frames:
+ while (targetUpto < targetLimit) {
+ cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
+ //if (DEBUG) {
+ //System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output);
+ //}
+ if (cmp != 0) {
+ break;
+ }
+ arc = arcs[1+targetUpto];
+ assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
+ // TOOD: we could save the outputs in local
+ // byte[][] instead of making new objs ever
+ // seek; but, often the FST doesn't have any
+ // shared bytes (but this could change if we
+ // reverse vLong byte order)
+ if (arc.output != NO_OUTPUT) {
+ output = fstOutputs.add(output, arc.output);
+ }
+ if (arc.isFinal()) {
+ lastFrame = stack[1+lastFrame.ord];
+ }
+ targetUpto++;
+ }
+
+
+ if (cmp == 0) {
+ final int targetUptoMid = targetUpto;
+ // Second compare the rest of the term, but
+ // don't save arc/output/frame:
+ final int targetLimit2 = Math.min(target.length, term.length);
+ while (targetUpto < targetLimit2) {
+ cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
+ //if (DEBUG) {
+ //System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")");
+ //}
+ if (cmp != 0) {
+ break;
+ }
+ targetUpto++;
+ }
+
+ if (cmp == 0) {
+ cmp = term.length - target.length;
+ }
+ targetUpto = targetUptoMid;
+ }
+
+ if (cmp < 0) {
+ // Common case: target term is after current
+ // term, ie, app is seeking multiple terms
+ // in sorted order
+ //if (DEBUG) {
+ //System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); clear frame.scanned ord=" + lastFrame.ord);
+ //}
+ currentFrame = lastFrame;
+
+ } else if (cmp > 0) {
+ // Uncommon case: target term
+ // is before current term; this means we can
+ // keep the currentFrame but we must rewind it
+ // (so we scan from the start)
+ targetBeforeCurrentLength = 0;
+ //if (DEBUG) {
+ //System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord);
+ //}
+ currentFrame = lastFrame;
+ currentFrame.rewind();
+ } else {
+ // Target is exactly the same as current term
+ assert term.length == target.length;
+ if (termExists) {
+ //if (DEBUG) {
+ //System.out.println(" target is same as current; return FOUND");
+ //}
+ return SeekStatus.FOUND;
+ } else {
+ //if (DEBUG) {
+ //System.out.println(" target is same as current but term doesn't exist");
+ //}
+ }
+ }
+
+ } else {
+
+ targetBeforeCurrentLength = -1;
+ arc = index.getFirstArc(arcs[0]);
+
+ // Empty string prefix must have an output (block) in the index!
+ assert arc.isFinal();
+ assert arc.output != null;
+
+ //if (DEBUG) {
+ //System.out.println(" no seek state; push root frame");
+ //}
+
+ output = arc.output;
+
+ currentFrame = staticFrame;
+
+ //term.length = 0;
+ targetUpto = 0;
+ currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), 0);
+ }
+
+ //if (DEBUG) {
+ //System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord+1=" + currentFrame.ord + " targetBeforeCurrentLength=" + targetBeforeCurrentLength);
+ //}
+
+ while (targetUpto < target.length) {
+
+ final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;
+
+ final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader);
+
+ if (nextArc == null) {
+
+ // Index is exhausted
+ // if (DEBUG) {
+ // System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel));
+ // }
+
+ validIndexPrefix = currentFrame.prefix;
+ //validIndexPrefix = targetUpto;
+
+ currentFrame.scanToFloorFrame(target);
+
+ currentFrame.loadBlock();
+
+ final SeekStatus result = currentFrame.scanToTerm(target, false);
+ if (result == SeekStatus.END) {
+ term.copyBytes(target);
+ termExists = false;
+
+ if (next() != null) {
+ //if (DEBUG) {
+ //System.out.println(" return NOT_FOUND term=" + brToString(term) + " " + term);
+ //}
+ return SeekStatus.NOT_FOUND;
+ } else {
+ //if (DEBUG) {
+ //System.out.println(" return END");
+ //}
+ return SeekStatus.END;
+ }
+ } else {
+ //if (DEBUG) {
+ //System.out.println(" return " + result + " term=" + brToString(term) + " " + term);
+ //}
+ return result;
+ }
+ } else {
+ // Follow this arc
+ term.bytes[targetUpto] = (byte) targetLabel;
+ arc = nextArc;
+ // Aggregate output as we go:
+ assert arc.output != null;
+ if (arc.output != NO_OUTPUT) {
+ output = fstOutputs.add(output, arc.output);
+ }
+
+ //if (DEBUG) {
+ //System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
+ //}
+ targetUpto++;
+
+ if (arc.isFinal()) {
+ //if (DEBUG) System.out.println(" arc is final!");
+ currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), targetUpto);
+ //if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms);
+ }
+ }
+ }
+
+ //validIndexPrefix = targetUpto;
+ validIndexPrefix = currentFrame.prefix;
+
+ currentFrame.scanToFloorFrame(target);
+
+ currentFrame.loadBlock();
+
+ final SeekStatus result = currentFrame.scanToTerm(target, false);
+
+ if (result == SeekStatus.END) {
+ term.copyBytes(target);
+ termExists = false;
+ if (next() != null) {
+ //if (DEBUG) {
+ //System.out.println(" return NOT_FOUND term=" + term.utf8ToString() + " " + term);
+ //}
+ return SeekStatus.NOT_FOUND;
+ } else {
+ //if (DEBUG) {
+ //System.out.println(" return END");
+ //}
+ return SeekStatus.END;
+ }
+ } else {
+ return result;
+ }
+ }
+
/**
 * Debugging aid (hence unused): prints the stack of frames recorded by the
 * last seek, and cross-checks each seek frame against the term index (FST).
 * Throws {@code RuntimeException} when the saved seek state disagrees with
 * the index, which would indicate corrupt internal enum state.
 */
@SuppressWarnings("unused")
private void printSeekState(PrintStream out) throws IOException {
  if (currentFrame == staticFrame) {
    // staticFrame is the sentinel: no seek has been performed yet.
    out.println(" no prior seek");
  } else {
    out.println(" prior seek state:");
    int ord = 0;
    // Frames up to validIndexPrefix were positioned by seek and carry an FST
    // arc to validate; deeper frames were entered by next() and are not checked.
    boolean isSeekFrame = true;
    while(true) {
      Frame f = getFrame(ord);
      assert f != null;
      final BytesRef prefix = new BytesRef(term.bytes, 0, f.prefix);
      if (f.nextEnt == -1) {
        // Block for this frame has not been loaded/scanned yet.
        out.println(" frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<<BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? BlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd());
      } else {
        out.println(" frame " + (isSeekFrame ? "(seek, loaded)" : "(next, loaded)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + " nextEnt=" + f.nextEnt + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<<BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? BlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0)) + " lastSubFP=" + f.lastSubFP + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd());
      }
      if (index != null) {
        assert !isSeekFrame || f.arc != null: "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc;
        // The arc that led to this frame must match the corresponding term byte.
        if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.bytes[f.prefix-1]&0xFF)) {
          out.println(" broken seek state: arc.label=" + (char) f.arc.label + " vs term byte=" + (char) (term.bytes[f.prefix-1]&0xFF));
          throw new RuntimeException("seek state is broken");
        }
        // The frame's prefix must be a final path in the index; for non-floor
        // seek frames its FST output must encode this frame's fp and flags.
        BytesRef output = Util.get(index, prefix);
        if (output == null) {
          out.println(" broken seek state: prefix is not final in index");
          throw new RuntimeException("seek state is broken");
        } else if (isSeekFrame && !f.isFloor) {
          final ByteArrayDataInput reader = new ByteArrayDataInput(output.bytes, output.offset, output.length);
          final long codeOrig = reader.readVLong();
          final long code = (f.fp << BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) | (f.hasTerms ? BlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) | (f.isFloor ? BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0);
          if (codeOrig != code) {
            out.println(" broken seek state: output code=" + codeOrig + " doesn't match frame code=" + code);
            throw new RuntimeException("seek state is broken");
          }
        }
      }
      if (f == currentFrame) {
        break;
      }
      if (f.prefix == validIndexPrefix) {
        // Beyond this depth the frames were produced by next(), not seek.
        isSeekFrame = false;
      }
      ord++;
    }
  }
}
+
/**
 * Decodes only the term bytes of the next term.  If the caller then asks for
 * metadata (docFreq, totalTermFreq) or pulls a D/&amp;PEnum, we then (lazily)
 * decode all metadata up to the current term.
 *
 * @return the next term, or {@code null} when the enum is exhausted
 */
@Override
public BytesRef next() throws IOException {

  if (in == null) {
    // Fresh TermsEnum; seek to first term:
    final FST.Arc<BytesRef> arc;
    if (index != null) {
      arc = index.getFirstArc(arcs[0]);
      // Empty string prefix must have an output in the index!
      assert arc.isFinal();
    } else {
      arc = null;
    }
    currentFrame = pushFrame(arc, rootCode, 0);
    currentFrame.loadBlock();
  }

  // Remember how deep the stack was so a later seek can tell whether the
  // target falls before the current position.
  targetBeforeCurrentLength = currentFrame.ord;

  assert !eof;
  //if (DEBUG) {
  //System.out.println("\nBTTR.next seg=" + segment + " term=" + brToString(term) + " termExists?=" + termExists + " field=" + fieldInfo.name + " termBlockOrd=" + currentFrame.state.termBlockOrd + " validIndexPrefix=" + validIndexPrefix);
  //printSeekState();
  //}

  if (currentFrame == staticFrame) {
    // If seek was previously called and the term was
    // cached, or seek(TermState) was called, usually
    // caller is just going to pull a D/&PEnum or get
    // docFreq, etc. But, if they then call next(),
    // this method catches up all internal state so next()
    // works properly:
    //if (DEBUG) System.out.println(" re-seek to pending term=" + term.utf8ToString() + " " + term);
    final boolean result = seekExact(term, false);
    assert result;
  }

  // Pop finished blocks: unwind the frame stack until we find a frame with
  // entries still to consume (or hit the root, meaning we're done).
  while (currentFrame.nextEnt == currentFrame.entCount) {
    if (!currentFrame.isLastInFloor) {
      // Same prefix continues in the next floor block; advance within it.
      currentFrame.loadNextFloorBlock();
    } else {
      //if (DEBUG) System.out.println(" pop frame");
      if (currentFrame.ord == 0) {
        // Root frame exhausted: the whole field has been iterated.
        //if (DEBUG) System.out.println(" return null");
        assert setEOF();
        term.length = 0;
        validIndexPrefix = 0;
        currentFrame.rewind();
        termExists = false;
        return null;
      }
      final long lastFP = currentFrame.fpOrig;
      currentFrame = stack[currentFrame.ord-1];

      if (currentFrame.nextEnt == -1 || currentFrame.lastSubFP != lastFP) {
        // We popped into a frame that's not loaded
        // yet or not scan'd to the right entry
        currentFrame.scanToFloorFrame(term);
        currentFrame.loadBlock();
        currentFrame.scanToSubBlock(lastFP);
      }

      // Note that the seek state (last seek) has been
      // invalidated beyond this depth
      validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefix);
      //if (DEBUG) {
      //System.out.println(" reset validIndexPrefix=" + validIndexPrefix);
      //}
    }
  }

  // Advance: Frame.next() returns true when the entry is a sub-block, in
  // which case we descend into it; a term entry ends the loop.
  while(true) {
    if (currentFrame.next()) {
      // Push to new block:
      //if (DEBUG) System.out.println(" push frame");
      currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length);
      // This is a "next" frame -- even if it's
      // floor'd we must pretend it isn't so we don't
      // try to scan to the right floor frame:
      currentFrame.isFloor = false;
      //currentFrame.hasTerms = true;
      currentFrame.loadBlock();
    } else {
      //if (DEBUG) System.out.println(" return term=" + term.utf8ToString() + " " + term + " currentFrame.ord=" + currentFrame.ord);
      return term;
    }
  }
}
+
+ @Override
+ public BytesRef term() {
+ assert !eof;
+ return term;
+ }
+
+ @Override
+ public int docFreq() throws IOException {
+ assert !eof;
+ //if (DEBUG) System.out.println("BTR.docFreq");
+ currentFrame.decodeMetaData();
+ //if (DEBUG) System.out.println(" return " + currentFrame.state.docFreq);
+ return currentFrame.state.docFreq;
+ }
+
+ @Override
+ public long totalTermFreq() throws IOException {
+ assert !eof;
+ currentFrame.decodeMetaData();
+ return currentFrame.state.totalTermFreq;
+ }
+
+ @Override
+ public DocsEnum docs(Bits skipDocs, DocsEnum reuse, int flags) throws IOException {
+ assert !eof;
+ //if (DEBUG) {
+ //System.out.println("BTTR.docs seg=" + segment);
+
<TRUNCATED>