Posted to commits@lucene.apache.org by mi...@apache.org on 2014/07/20 14:08:33 UTC
svn commit: r1612080 [3/3] - in /lucene/dev/trunk/lucene: ./
codecs/src/java/org/apache/lucene/codecs/blocktreeords/
codecs/src/resources/META-INF/services/
codecs/src/test/org/apache/lucene/codecs/blocktreeords/
core/src/java/org/apache/lucene/codecs/...
Added: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnumFrame.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnumFrame.java?rev=1612080&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnumFrame.java (added)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnumFrame.java Sun Jul 20 12:08:32 2014
@@ -0,0 +1,851 @@
+package org.apache.lucene.codecs.blocktreeords;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.BlockTermState;
+import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.TermsEnum.SeekStatus;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.fst.FST;
+
+final class OrdsSegmentTermsEnumFrame {
+ // Our index in stack[]:
+ final int ord;
+ // final boolean DEBUG = true;
+
+ boolean hasTerms;
+ boolean hasTermsOrig;
+ boolean isFloor;
+
+ // static boolean DEBUG = OrdsSegmentTermsEnum.DEBUG;
+
+ FST.Arc<Output> arc;
+
+ // File pointer where this block was loaded from
+ long fp;
+ long fpOrig;
+ long fpEnd;
+
+ byte[] suffixBytes = new byte[128];
+ final ByteArrayDataInput suffixesReader = new ByteArrayDataInput();
+
+ byte[] statBytes = new byte[64];
+ final ByteArrayDataInput statsReader = new ByteArrayDataInput();
+
+ byte[] floorData = new byte[32];
+ final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
+
+ // Length of prefix shared by all terms in this block
+ int prefix;
+
+ // Number of entries (term or sub-block) in this block
+ int entCount;
+
+ // Which term we will next read, or -1 if the block
+ // isn't loaded yet
+ int nextEnt;
+
+ // Starting termOrd for this frame, used to reset termOrd in rewind()
+ long termOrdOrig;
+
+ // 1 + ordinal of the current term
+ long termOrd;
+
+ // True if this block is either not a floor block,
+ // or, it's the last sub-block of a floor block
+ boolean isLastInFloor;
+
+ // True if all entries are terms
+ boolean isLeafBlock;
+
+ long lastSubFP;
+
+ // Starting byte of next floor block:
+ int nextFloorLabel;
+
+ // Starting termOrd of next floor block:
+ long nextFloorTermOrd;
+
+ int numFollowFloorBlocks;
+
+ // Next term for which to decode metaData; we decode metaData
+ // lazily so that scanning to find the matching term stays
+ // fast, and only if a match is found and the app wants the
+ // stats or docs/positions enums do we decode the
+ // metaData
+ int metaDataUpto;
+
+ final BlockTermState state;
+
+ // metadata buffer, holding monotonic values
+ public long[] longs;
+ // metadata buffer, holding general values
+ public byte[] bytes;
+ ByteArrayDataInput bytesReader;
+
+ private final OrdsSegmentTermsEnum ste;
+
+ public OrdsSegmentTermsEnumFrame(OrdsSegmentTermsEnum ste, int ord) throws IOException {
+ this.ste = ste;
+ this.ord = ord;
+ this.state = ste.fr.parent.postingsReader.newTermState();
+ this.state.totalTermFreq = -1;
+ this.longs = new long[ste.fr.longsSize];
+ }
+
+ public void setFloorData(ByteArrayDataInput in, BytesRef source) {
+ final int numBytes = source.length - (in.getPosition() - source.offset);
+ assert numBytes > 0;
+ if (numBytes > floorData.length) {
+ floorData = new byte[ArrayUtil.oversize(numBytes, 1)];
+ }
+ System.arraycopy(source.bytes, source.offset+in.getPosition(), floorData, 0, numBytes);
+ floorDataReader.reset(floorData, 0, numBytes);
+ numFollowFloorBlocks = floorDataReader.readVInt();
+ nextFloorLabel = floorDataReader.readByte() & 0xff;
+ nextFloorTermOrd = termOrdOrig + floorDataReader.readVLong();
+ // System.out.println(" setFloorData ord=" + ord + " nextFloorTermOrd=" + nextFloorTermOrd + " shift=" + (nextFloorTermOrd-termOrdOrig));
+
+ //if (DEBUG) {
+ //System.out.println(" setFloorData fpOrig=" + fpOrig + " bytes=" + new BytesRef(source.bytes, source.offset + in.getPosition(), numBytes) + " numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + toHex(nextFloorLabel));
+ //}
+ }
+
+ public int getTermBlockOrd() {
+ return isLeafBlock ? nextEnt : state.termBlockOrd;
+ }
+
+ void loadNextFloorBlock() throws IOException {
+ //if (DEBUG) {
+ //System.out.println(" loadNextFloorBlock fp=" + fp + " fpEnd=" + fpEnd);
+ //}
+ assert arc == null || isFloor: "arc=" + arc + " isFloor=" + isFloor;
+ // NOTE: we don't need to touch termOrd here, because we fully scanned this current frame
+ fp = fpEnd;
+ nextEnt = -1;
+ loadBlock();
+ }
+
+ /* Does initial decode of next block of terms; this
+ doesn't actually decode the docFreq, totalTermFreq,
+ postings details (frq/prx offset, etc.) metadata;
+ it just loads them as byte[] blobs which are then
+ decoded on-demand if the metadata is ever requested
+ for any term in this block. This enables terms-only
+ intensive consumers (eg certain MTQs, respelling) to
+ avoid paying the price of decoding metadata they won't
+ use. */
+ void loadBlock() throws IOException {
+
+ // Clone the IndexInput lazily, so that consumers
+ // that just pull a TermsEnum to
+ // seekExact(TermState) don't pay this cost:
+ ste.initIndexInput();
+
+ if (nextEnt != -1) {
+ // Already loaded
+ return;
+ }
+ // System.out.println("loadBlock ord=" + ord + " termOrdOrig=" + termOrdOrig + " termOrd=" + termOrd + " fp=" + fp);
+
+ ste.in.seek(fp);
+ int code = ste.in.readVInt();
+ entCount = code >>> 1;
+ assert entCount > 0;
+ isLastInFloor = (code & 1) != 0;
+ assert arc == null || (isLastInFloor || isFloor);
+
+ // TODO: if suffixes were stored in random-access
+ // array structure, then we could do binary search
+ // instead of a linear scan to find the target term; eg
+ // we could have a simple array of offsets
+
+ // term suffixes:
+ code = ste.in.readVInt();
+ isLeafBlock = (code & 1) != 0;
+ int numBytes = code >>> 1;
+ if (suffixBytes.length < numBytes) {
+ suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
+ }
+ ste.in.readBytes(suffixBytes, 0, numBytes);
+ suffixesReader.reset(suffixBytes, 0, numBytes);
+
+ /*if (DEBUG) {
+ if (arc == null) {
+ System.out.println(" loadBlock (next) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock);
+ } else {
+ System.out.println(" loadBlock (seek) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " hasTerms?=" + hasTerms + " isFloor?=" + isFloor + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock);
+ }
+ }*/
+
+ // stats
+ numBytes = ste.in.readVInt();
+ if (statBytes.length < numBytes) {
+ statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
+ }
+ // System.out.println("READ stats numBytes=" + numBytes + " fp=" + ste.in.getFilePointer());
+ ste.in.readBytes(statBytes, 0, numBytes);
+ statsReader.reset(statBytes, 0, numBytes);
+ metaDataUpto = 0;
+
+ state.termBlockOrd = 0;
+ nextEnt = 0;
+ lastSubFP = -1;
+
+ // TODO: we could skip this if !hasTerms; but
+ // that's rare so won't help much
+ // metadata
+ numBytes = ste.in.readVInt();
+ if (bytes == null) {
+ bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
+ bytesReader = new ByteArrayDataInput();
+ } else if (bytes.length < numBytes) {
+ bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
+ }
+ ste.in.readBytes(bytes, 0, numBytes);
+ bytesReader.reset(bytes, 0, numBytes);
+
+ // Sub-blocks of a single floor block are always
+ // written one after another -- tail recurse:
+ fpEnd = ste.in.getFilePointer();
+ // if (DEBUG) {
+ // System.out.println(" fpEnd=" + fpEnd);
+ // }
+ }
+
+ void rewind() {
+
+ // Force reload:
+ fp = fpOrig;
+ termOrd = termOrdOrig;
+ nextEnt = -1;
+ hasTerms = hasTermsOrig;
+ if (isFloor) {
+ floorDataReader.rewind();
+ numFollowFloorBlocks = floorDataReader.readVInt();
+ assert numFollowFloorBlocks > 0;
+ nextFloorLabel = floorDataReader.readByte() & 0xff;
+ nextFloorTermOrd = termOrdOrig + floorDataReader.readVLong();
+ //System.out.println(" frame.rewind nextFloorTermOrd=" + nextFloorTermOrd);
+ }
+
+ /*
+ //System.out.println("rewind");
+ // Keeps the block loaded, but rewinds its state:
+ if (nextEnt > 0 || fp != fpOrig) {
+ if (DEBUG) {
+ System.out.println(" rewind frame ord=" + ord + " fpOrig=" + fpOrig + " fp=" + fp + " hasTerms?=" + hasTerms + " isFloor?=" + isFloor + " nextEnt=" + nextEnt + " prefixLen=" + prefix);
+ }
+ if (fp != fpOrig) {
+ fp = fpOrig;
+ nextEnt = -1;
+ } else {
+ nextEnt = 0;
+ }
+ hasTerms = hasTermsOrig;
+ if (isFloor) {
+ floorDataReader.rewind();
+ numFollowFloorBlocks = floorDataReader.readVInt();
+ nextFloorLabel = floorDataReader.readByte() & 0xff;
+ }
+ assert suffixBytes != null;
+ suffixesReader.rewind();
+ assert statBytes != null;
+ statsReader.rewind();
+ metaDataUpto = 0;
+ state.termBlockOrd = 0;
+ // TODO: skip this if !hasTerms? Then postings
+ // impl wouldn't have to write useless 0 byte
+ postingsReader.resetTermsBlock(fieldInfo, state);
+ lastSubFP = -1;
+ } else if (DEBUG) {
+ System.out.println(" skip rewind fp=" + fp + " fpOrig=" + fpOrig + " nextEnt=" + nextEnt + " ord=" + ord);
+ }
+ */
+ }
+
+ public boolean next() {
+ return isLeafBlock ? nextLeaf() : nextNonLeaf();
+ }
+
+ // Decodes next entry; returns true if it's a sub-block
+ public boolean nextLeaf() {
+ //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
+ assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp + " termOrd=" + termOrd;
+ nextEnt++;
+ termOrd++;
+ suffix = suffixesReader.readVInt();
+ startBytePos = suffixesReader.getPosition();
+ ste.term.length = prefix + suffix;
+ if (ste.term.bytes.length < ste.term.length) {
+ ste.term.grow(ste.term.length);
+ }
+ suffixesReader.readBytes(ste.term.bytes, prefix, suffix);
+ // A normal term
+ ste.termExists = true;
+ return false;
+ }
+
+ public boolean nextNonLeaf() {
+ // if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
+ assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
+ nextEnt++;
+ final int code = suffixesReader.readVInt();
+ suffix = code >>> 1;
+ startBytePos = suffixesReader.getPosition();
+ ste.term.length = prefix + suffix;
+ if (ste.term.bytes.length < ste.term.length) {
+ ste.term.grow(ste.term.length);
+ }
+ suffixesReader.readBytes(ste.term.bytes, prefix, suffix);
+ if ((code & 1) == 0) {
+ // A normal term
+ ste.termExists = true;
+ subCode = 0;
+ state.termBlockOrd++;
+ termOrd++;
+ return false;
+ } else {
+ // A sub-block; make sub-FP absolute:
+ ste.termExists = false;
+ subCode = suffixesReader.readVLong();
+ termOrd += suffixesReader.readVLong();
+ lastSubFP = fp - subCode;
+ // if (DEBUG) {
+ // System.out.println(" lastSubFP=" + lastSubFP);
+ // }
+ return true;
+ }
+ }
+
+ // TODO: make this array'd so we can do bin search?
+ // likely not worth it? need to measure how many
+ // floor blocks we "typically" get
+ public void scanToFloorFrame(BytesRef target) {
+
+ if (!isFloor || target.length <= prefix) {
+ // if (DEBUG) {
+ // System.out.println(" scanToFloorFrame skip: isFloor=" + isFloor + " target.length=" + target.length + " vs prefix=" + prefix);
+ // }
+ return;
+ }
+
+ final int targetLabel = target.bytes[target.offset + prefix] & 0xFF;
+
+ // if (DEBUG) {
+ // System.out.println(" scanToFloorFrame fpOrig=" + fpOrig + " targetLabel=" + ((char) targetLabel) + " vs nextFloorLabel=" + ((char) nextFloorLabel) + " numFollowFloorBlocks=" + numFollowFloorBlocks);
+ // }
+
+ if (targetLabel < nextFloorLabel) {
+ // if (DEBUG) {
+ // System.out.println(" already on correct block");
+ // }
+ return;
+ }
+
+ assert numFollowFloorBlocks != 0;
+
+ long newFP = fpOrig;
+ long lastFloorTermOrd = nextFloorTermOrd;
+ while (true) {
+ final long code = floorDataReader.readVLong();
+ newFP = fpOrig + (code >>> 1);
+ hasTerms = (code & 1) != 0;
+ // if (DEBUG) {
+ // System.out.println(" label=" + ((char) nextFloorLabel) + " fp=" + newFP + " hasTerms?=" + hasTerms + " numFollowFloor=" + numFollowFloorBlocks);
+ // }
+
+ isLastInFloor = numFollowFloorBlocks == 1;
+ numFollowFloorBlocks--;
+
+ lastFloorTermOrd = nextFloorTermOrd;
+
+ if (isLastInFloor) {
+ nextFloorLabel = 256;
+ nextFloorTermOrd = Long.MAX_VALUE;
+ // if (DEBUG) {
+ // System.out.println(" stop! last block nextFloorLabel=" + ((char) nextFloorLabel));
+ // }
+ break;
+ } else {
+ nextFloorLabel = floorDataReader.readByte() & 0xff;
+ nextFloorTermOrd += floorDataReader.readVLong();
+ //System.out.println(" scanToFloorFrame: nextFloorTermOrd=" + nextFloorTermOrd);
+ if (targetLabel < nextFloorLabel) {
+ // if (DEBUG) {
+ // System.out.println(" stop! nextFloorLabel=" + ((char) nextFloorLabel));
+ // }
+ break;
+ }
+ }
+ }
+
+ if (newFP != fp) {
+ // Force re-load of the block:
+ // if (DEBUG) {
+ // System.out.println(" force switch to fp=" + newFP + " oldFP=" + fp);
+ // }
+ nextEnt = -1;
+ termOrd = lastFloorTermOrd;
+ fp = newFP;
+ } else {
+ // if (DEBUG) {
+ // System.out.println(" stay on same fp=" + newFP);
+ // }
+ }
+ }
+
+ // TODO: make this array'd so we can do bin search?
+ // likely not worth it? need to measure how many
+ // floor blocks we "typically" get
+ public void scanToFloorFrame(long targetOrd) {
+ // System.out.println(" scanToFloorFrame targetOrd=" + targetOrd + " vs nextFloorTermOrd=" + nextFloorTermOrd + " numFollowFloorBlocks=" + numFollowFloorBlocks + " termOrdOrig=" + termOrdOrig);
+
+ if (!isFloor || targetOrd < nextFloorTermOrd) {
+ return;
+ }
+
+ assert numFollowFloorBlocks != 0;
+ long lastFloorTermOrd = nextFloorTermOrd;
+
+ long newFP = fpOrig;
+ while (true) {
+ final long code = floorDataReader.readVLong();
+ newFP = fpOrig + (code >>> 1);
+ hasTerms = (code & 1) != 0;
+ // if (DEBUG) {
+ // System.out.println(" label=" + ((char) nextFloorLabel) + " fp=" + newFP + " hasTerms?=" + hasTerms + " numFollowFloor=" + numFollowFloorBlocks);
+ // }
+
+ isLastInFloor = numFollowFloorBlocks == 1;
+ numFollowFloorBlocks--;
+
+ lastFloorTermOrd = nextFloorTermOrd;
+
+ if (isLastInFloor) {
+ nextFloorLabel = 256;
+ nextFloorTermOrd = Long.MAX_VALUE;
+ // if (DEBUG) {
+ // System.out.println(" stop! last block nextFloorLabel=" + ((char) nextFloorLabel));
+ // }
+ break;
+ } else {
+ nextFloorLabel = floorDataReader.readByte() & 0xff;
+ nextFloorTermOrd += floorDataReader.readVLong();
+ if (targetOrd < nextFloorTermOrd) {
+ // if (DEBUG) {
+ // System.out.println(" stop! nextFloorLabel=" + ((char) nextFloorLabel));
+ // }
+ break;
+ }
+ }
+ }
+ // System.out.println(" after: lastFloorTermOrd=" + lastFloorTermOrd + " newFP=" + newFP + " vs fp=" + fp + " lastFloorTermOrd=" + lastFloorTermOrd);
+
+ if (newFP != fp) {
+ // Force re-load of the block:
+ // if (DEBUG) {
+ // System.out.println(" force switch to fp=" + newFP + " oldFP=" + fp);
+ // }
+ nextEnt = -1;
+ termOrd = lastFloorTermOrd;
+ fp = newFP;
+ } else {
+ // if (DEBUG) {
+ // System.out.println(" stay on same fp=" + newFP);
+ // }
+ }
+ }
+
+ public void decodeMetaData() throws IOException {
+
+ assert nextEnt >= 0;
+
+ // lazily catch up on metadata decode:
+ final int limit = getTermBlockOrd();
+ boolean absolute = metaDataUpto == 0;
+ assert limit > 0: "limit=" + limit + " isLeafBlock=" + isLeafBlock + " nextEnt=" + nextEnt;
+
+ // if (DEBUG) System.out.println("\nBTTR.decodeMetadata seg=" + ste.fr.parent.segment + " mdUpto=" + metaDataUpto + " vs termBlockOrd=" + state.termBlockOrd + " limit=" + limit);
+
+ // TODO: better API would be "jump straight to term=N"???
+ while (metaDataUpto < limit) {
+
+ // TODO: we could make "tiers" of metadata, ie,
+ // decode docFreq/totalTF but don't decode postings
+ // metadata; this way caller could get
+ // docFreq/totalTF w/o paying decode cost for
+ // postings
+
+ // TODO: if docFreq were bulk decoded we could
+ // just skipN here:
+
+ // stats
+ state.docFreq = statsReader.readVInt();
+ //if (DEBUG) System.out.println(" dF=" + state.docFreq);
+ if (ste.fr.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
+ state.totalTermFreq = state.docFreq + statsReader.readVLong();
+ //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
+ }
+ //if (DEBUG) System.out.println(" longsSize=" + ste.fr.longsSize);
+
+ // metadata
+ for (int i = 0; i < ste.fr.longsSize; i++) {
+ longs[i] = bytesReader.readVLong();
+ }
+ ste.fr.parent.postingsReader.decodeTerm(longs, bytesReader, ste.fr.fieldInfo, state, absolute);
+
+ metaDataUpto++;
+ absolute = false;
+ }
+ state.termBlockOrd = metaDataUpto;
+ }
+
+ // Used only by assert
+ private boolean prefixMatches(BytesRef target) {
+ for(int bytePos=0;bytePos<prefix;bytePos++) {
+ if (target.bytes[target.offset + bytePos] != ste.term.bytes[bytePos]) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ // Scans to sub-block that has this target fp; only
+ // called by next(); NOTE: does not set
+ // startBytePos/suffix as a side effect
+ public void scanToSubBlock(long subFP) {
+ assert !isLeafBlock;
+ //if (DEBUG) System.out.println(" scanToSubBlock fp=" + fp + " subFP=" + subFP + " entCount=" + entCount + " lastSubFP=" + lastSubFP);
+ //assert nextEnt == 0;
+ if (lastSubFP == subFP) {
+ //if (DEBUG) System.out.println(" already positioned");
+ return;
+ }
+ assert subFP < fp : "fp=" + fp + " subFP=" + subFP;
+ final long targetSubCode = fp - subFP;
+ //if (DEBUG) System.out.println(" targetSubCode=" + targetSubCode);
+ while(true) {
+ assert nextEnt < entCount;
+ nextEnt++;
+ final int code = suffixesReader.readVInt();
+ suffixesReader.skipBytes(isLeafBlock ? code : code >>> 1);
+ //if (DEBUG) System.out.println(" " + nextEnt + " (of " + entCount + ") ent isSubBlock=" + ((code&1)==1));
+ if ((code & 1) != 0) {
+ final long subCode = suffixesReader.readVLong();
+ termOrd += suffixesReader.readVLong();
+
+ //if (DEBUG) System.out.println(" subCode=" + subCode);
+ if (targetSubCode == subCode) {
+ //if (DEBUG) System.out.println(" match!");
+ lastSubFP = subFP;
+ return;
+ }
+ } else {
+ state.termBlockOrd++;
+ termOrd++;
+ }
+ }
+ }
+
+ // NOTE: sets startBytePos/suffix as a side effect
+ public SeekStatus scanToTerm(BytesRef target, boolean exactOnly) throws IOException {
+ return isLeafBlock ? scanToTermLeaf(target, exactOnly) : scanToTermNonLeaf(target, exactOnly);
+ }
+
+ private int startBytePos;
+ private int suffix;
+ private long subCode;
+
+ // Target's prefix matches this block's prefix; we
+ // scan the entries, checking if the suffix matches.
+ public SeekStatus scanToTermLeaf(BytesRef target, boolean exactOnly) throws IOException {
+
+ // if (DEBUG) System.out.println(" scanToTermLeaf: block fp=" + fp + " prefix=" + prefix + " nextEnt=" + nextEnt + " (of " + entCount + ") target=" + OrdsSegmentTermsEnum.brToString(target) + " term=" + OrdsSegmentTermsEnum.brToString(ste.term));
+
+ assert nextEnt != -1;
+
+ ste.termExists = true;
+ subCode = 0;
+
+ if (nextEnt == entCount) {
+ if (exactOnly) {
+ fillTerm();
+ }
+ return SeekStatus.END;
+ }
+
+ assert prefixMatches(target);
+
+ // Loop over each entry (term or sub-block) in this block:
+ //nextTerm: while(nextEnt < entCount) {
+ nextTerm: while (true) {
+ nextEnt++;
+ termOrd++;
+
+ suffix = suffixesReader.readVInt();
+
+ // if (DEBUG) {
+ // BytesRef suffixBytesRef = new BytesRef();
+ // suffixBytesRef.bytes = suffixBytes;
+ // suffixBytesRef.offset = suffixesReader.getPosition();
+ // suffixBytesRef.length = suffix;
+ // System.out.println(" cycle: term " + (nextEnt-1) + " (of " + entCount + ") suffix=" + OrdsSegmentTermsEnum.brToString(suffixBytesRef));
+ // }
+
+ final int termLen = prefix + suffix;
+ startBytePos = suffixesReader.getPosition();
+ suffixesReader.skipBytes(suffix);
+
+ final int targetLimit = target.offset + (target.length < termLen ? target.length : termLen);
+ int targetPos = target.offset + prefix;
+
+ // Loop over bytes in the suffix, comparing to
+ // the target
+ int bytePos = startBytePos;
+ while(true) {
+ final int cmp;
+ final boolean stop;
+ if (targetPos < targetLimit) {
+ cmp = (suffixBytes[bytePos++]&0xFF) - (target.bytes[targetPos++]&0xFF);
+ stop = false;
+ } else {
+ assert targetPos == targetLimit;
+ cmp = termLen - target.length;
+ stop = true;
+ }
+
+ if (cmp < 0) {
+ // Current entry is still before the target;
+ // keep scanning
+
+ if (nextEnt == entCount) {
+ if (exactOnly) {
+ fillTerm();
+ }
+ // We are done scanning this block
+ break nextTerm;
+ } else {
+ continue nextTerm;
+ }
+ } else if (cmp > 0) {
+
+ // Done! Current entry is after target --
+ // return NOT_FOUND:
+ fillTerm();
+
+ if (!exactOnly && !ste.termExists) {
+ // We are on a sub-block, and caller wants
+ // us to position to the next term after
+ // the target, so we must recurse into the
+ // sub-frame(s):
+ ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, termLen, ste.currentFrame.termOrd);
+ ste.currentFrame.loadBlock();
+ while (ste.currentFrame.next()) {
+ ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, ste.term.length, ste.currentFrame.termOrd);
+ ste.currentFrame.loadBlock();
+ }
+ }
+
+ //if (DEBUG) System.out.println(" not found");
+ return SeekStatus.NOT_FOUND;
+ } else if (stop) {
+ // Exact match!
+
+ // This cannot be a sub-block because we
+ // would have followed the index to this
+ // sub-block from the start:
+
+ assert ste.termExists;
+ fillTerm();
+ //if (DEBUG) System.out.println(" found!");
+ return SeekStatus.FOUND;
+ }
+ }
+ }
+
+ // It is possible (and OK) that the terms index pointed us
+ // at this block, but we scanned the entire block and
+ // did not find the term to position to. This happens
+ // when the target is after the last term in the block
+ // (but before the next term in the index). EG the
+ // target could be foozzz, and the terms index pointed us
+ // to the foo* block, but the last term in this block
+ // was fooz (and, eg, the first term in the next block will
+ // be fop).
+ //if (DEBUG) System.out.println(" block end");
+ if (exactOnly) {
+ fillTerm();
+ }
+
+ // TODO: not consistent that in the
+ // not-exact case we don't next() into the next
+ // frame here
+ return SeekStatus.END;
+ }
+
+ // Target's prefix matches this block's prefix; we
+ // scan the entries, checking if the suffix matches.
+ public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws IOException {
+
+ // if (DEBUG) System.out.println(" scanToTermNonLeaf: block fp=" + fp + " prefix=" + prefix + " nextEnt=" + nextEnt + " (of " + entCount + ") target=" + OrdsSegmentTermsEnum.brToString(target) + " term=" + OrdsSegmentTermsEnum.brToString(ste.term));
+
+ assert nextEnt != -1;
+
+ if (nextEnt == entCount) {
+ if (exactOnly) {
+ fillTerm();
+ ste.termExists = subCode == 0;
+ }
+ return SeekStatus.END;
+ }
+
+ assert prefixMatches(target);
+
+ // Loop over each entry (term or sub-block) in this block:
+ //nextTerm: while(nextEnt < entCount) {
+ nextTerm: while (true) {
+ nextEnt++;
+
+ final int code = suffixesReader.readVInt();
+ suffix = code >>> 1;
+ // if (DEBUG) {
+ // BytesRef suffixBytesRef = new BytesRef();
+ // suffixBytesRef.bytes = suffixBytes;
+ // suffixBytesRef.offset = suffixesReader.getPosition();
+ // suffixBytesRef.length = suffix;
+ // System.out.println(" cycle: " + ((code&1)==1 ? "sub-block" : "term") + " " + (nextEnt-1) + " (of " + entCount + ") suffix=" + brToString(suffixBytesRef));
+ // }
+
+ ste.termExists = (code & 1) == 0;
+ final int termLen = prefix + suffix;
+ startBytePos = suffixesReader.getPosition();
+ suffixesReader.skipBytes(suffix);
+ if (ste.termExists) {
+ state.termBlockOrd++;
+ termOrd++;
+ subCode = 0;
+ } else {
+ subCode = suffixesReader.readVLong();
+ termOrd += suffixesReader.readVLong();
+ lastSubFP = fp - subCode;
+ }
+
+ final int targetLimit = target.offset + (target.length < termLen ? target.length : termLen);
+ int targetPos = target.offset + prefix;
+
+ // Loop over bytes in the suffix, comparing to
+ // the target
+ int bytePos = startBytePos;
+ while(true) {
+ final int cmp;
+ final boolean stop;
+ if (targetPos < targetLimit) {
+ cmp = (suffixBytes[bytePos++]&0xFF) - (target.bytes[targetPos++]&0xFF);
+ stop = false;
+ } else {
+ assert targetPos == targetLimit;
+ cmp = termLen - target.length;
+ stop = true;
+ }
+
+ if (cmp < 0) {
+ // Current entry is still before the target;
+ // keep scanning
+
+ if (nextEnt == entCount) {
+ if (exactOnly) {
+ fillTerm();
+ //termExists = true;
+ }
+ // We are done scanning this block
+ break nextTerm;
+ } else {
+ continue nextTerm;
+ }
+ } else if (cmp > 0) {
+
+ // Done! Current entry is after target --
+ // return NOT_FOUND:
+ fillTerm();
+
+ if (!exactOnly && !ste.termExists) {
+ // We are on a sub-block, and caller wants
+ // us to position to the next term after
+ // the target, so we must recurse into the
+ // sub-frame(s):
+ ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, termLen, ste.currentFrame.termOrd);
+ ste.currentFrame.loadBlock();
+ while (ste.currentFrame.next()) {
+ ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, ste.term.length, ste.currentFrame.termOrd);
+ ste.currentFrame.loadBlock();
+ }
+ }
+
+ //if (DEBUG) System.out.println(" not found");
+ return SeekStatus.NOT_FOUND;
+ } else if (stop) {
+ // Exact match!
+
+ // This cannot be a sub-block because we
+ // would have followed the index to this
+ // sub-block from the start:
+
+ assert ste.termExists;
+ fillTerm();
+ //if (DEBUG) System.out.println(" found!");
+ return SeekStatus.FOUND;
+ }
+ }
+ }
+
+ // It is possible (and OK) that the terms index pointed us
+ // at this block, but we scanned the entire block and
+ // did not find the term to position to. This happens
+ // when the target is after the last term in the block
+ // (but before the next term in the index). EG the
+ // target could be foozzz, and the terms index pointed us
+ // to the foo* block, but the last term in this block
+ // was fooz (and, eg, the first term in the next block will
+ // be fop).
+ //if (DEBUG) System.out.println(" block end");
+ if (exactOnly) {
+ fillTerm();
+ }
+
+ // TODO: not consistent that in the
+ // not-exact case we don't next() into the next
+ // frame here
+ return SeekStatus.END;
+ }
+
+ private void fillTerm() {
+ final int termLength = prefix + suffix;
+ ste.term.length = termLength;
+ if (ste.term.bytes.length < termLength) {
+ ste.term.grow(termLength);
+ }
+ System.arraycopy(suffixBytes, startBytePos, ste.term.bytes, prefix, suffix);
+ }
+}
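The loadBlock()/decodeMetaData() pair above is the heart of the lazy-metadata design: stats and postings metadata are pulled in as opaque byte[] blobs when a block is loaded, and are only decoded up to the entry a caller actually asks about. A minimal standalone sketch of that pattern (toy code with a plain-int encoding, not the Lucene API):

    import java.io.ByteArrayInputStream;
    import java.io.DataInputStream;
    import java.io.IOException;

    final class LazyStatsBlock {
      private final DataInputStream in; // wraps the still-undecoded stats blob
      private int upto;                 // how many entries are decoded so far
      private int lastDocFreq;

      LazyStatsBlock(byte[] statBytes) {
        this.in = new DataInputStream(new ByteArrayInputStream(statBytes));
      }

      // Decode forward only as far as entry `ord`; a consumer that only scans
      // terms never calls this and so never pays the decode cost:
      int docFreq(int ord) throws IOException {
        while (upto <= ord) {
          lastDocFreq = in.readInt();
          upto++;
        }
        return lastDocFreq;
      }
    }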
Added: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/package.html?rev=1612080&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/package.html (added)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/package.html Sun Jul 20 12:08:32 2014
@@ -0,0 +1,27 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<head>
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+</head>
+<body>
+Same postings format as Lucene41, except the terms dictionary also
+supports ords, i.e. it can report which ord the enum is positioned on,
+and it can seek by ord.
+</body>
+</html>
Modified: lucene/dev/trunk/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat?rev=1612080&r1=1612079&r2=1612080&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat (original)
+++ lucene/dev/trunk/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat Sun Jul 20 12:08:32 2014
@@ -13,12 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-org.apache.lucene.codecs.pulsing.Pulsing41PostingsFormat
-org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat
-org.apache.lucene.codecs.memory.MemoryPostingsFormat
+org.apache.lucene.codecs.blocktreeords.Ords41PostingsFormat
org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat
org.apache.lucene.codecs.memory.DirectPostingsFormat
-org.apache.lucene.codecs.memory.FSTPulsing41PostingsFormat
+org.apache.lucene.codecs.memory.FSTOrdPostingsFormat
org.apache.lucene.codecs.memory.FSTOrdPulsing41PostingsFormat
org.apache.lucene.codecs.memory.FSTPostingsFormat
-org.apache.lucene.codecs.memory.FSTOrdPostingsFormat
+org.apache.lucene.codecs.memory.FSTPulsing41PostingsFormat
+org.apache.lucene.codecs.memory.MemoryPostingsFormat
+org.apache.lucene.codecs.pulsing.Pulsing41PostingsFormat
+org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat
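For context, this services file is how postings formats are discovered at runtime (java.util.ServiceLoader via Lucene's SPI loader). A hedged sketch of resolving the newly registered format; getName() is read off an instance so the SPI name string never has to be guessed:

    import org.apache.lucene.codecs.PostingsFormat;
    import org.apache.lucene.codecs.blocktreeords.Ords41PostingsFormat;

    public class ResolveOrdsFormat {
      public static void main(String[] args) {
        // Direct construction, as the new test below does:
        PostingsFormat direct = new Ords41PostingsFormat();
        // SPI lookup by registered name; the name comes from the instance
        // itself rather than a hard-coded string:
        PostingsFormat byName = PostingsFormat.forName(direct.getName());
        System.out.println(byName.getName());
      }
    }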
Added: lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/blocktreeords/TestOrdsBlockTree.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/blocktreeords/TestOrdsBlockTree.java?rev=1612080&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/blocktreeords/TestOrdsBlockTree.java (added)
+++ lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/blocktreeords/TestOrdsBlockTree.java Sun Jul 20 12:08:32 2014
@@ -0,0 +1,360 @@
+package org.apache.lucene.codecs.blocktreeords;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.BasePostingsFormatTestCase;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.TestUtil;
+
+public class TestOrdsBlockTree extends BasePostingsFormatTestCase {
+ private final Codec codec = TestUtil.alwaysPostingsFormat(new Ords41PostingsFormat());
+
+ @Override
+ protected Codec getCodec() {
+ return codec;
+ }
+
+ public void testBasic() throws Exception {
+ Directory dir = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+ Document doc = new Document();
+ doc.add(newTextField("field", "a b c", Field.Store.NO));
+ w.addDocument(doc);
+ IndexReader r = w.getReader();
+ TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);
+
+ // Test next()
+ assertEquals(new BytesRef("a"), te.next());
+ assertEquals(0L, te.ord());
+ assertEquals(new BytesRef("b"), te.next());
+ assertEquals(1L, te.ord());
+ assertEquals(new BytesRef("c"), te.next());
+ assertEquals(2L, te.ord());
+ assertNull(te.next());
+
+ // Test seekExact by term
+ assertTrue(te.seekExact(new BytesRef("b")));
+ assertEquals(1, te.ord());
+ assertTrue(te.seekExact(new BytesRef("a")));
+ assertEquals(0, te.ord());
+ assertTrue(te.seekExact(new BytesRef("c")));
+ assertEquals(2, te.ord());
+
+ // Test seekExact by ord
+ te.seekExact(1);
+ assertEquals(new BytesRef("b"), te.term());
+ te.seekExact(0);
+ assertEquals(new BytesRef("a"), te.term());
+ te.seekExact(2);
+ assertEquals(new BytesRef("c"), te.term());
+
+ r.close();
+ w.close();
+ dir.close();
+ }
+
+ public void testTwoBlocks() throws Exception {
+ Directory dir = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+ List<String> terms = new ArrayList<>();
+ for(int i=0;i<36;i++) {
+ Document doc = new Document();
+ String term = "" + (char) (97+i);
+ terms.add(term);
+ if (VERBOSE) {
+ System.out.println("i=" + i + " term=" + term);
+ }
+ doc.add(newTextField("field", term, Field.Store.NO));
+ w.addDocument(doc);
+ }
+ for(int i=0;i<36;i++) {
+ Document doc = new Document();
+ String term = "m" + (char) (97+i);
+ terms.add(term);
+ if (VERBOSE) {
+ System.out.println("i=" + i + " term=" + term);
+ }
+ doc.add(newTextField("field", term, Field.Store.NO));
+ w.addDocument(doc);
+ }
+ w.forceMerge(1);
+ IndexReader r = w.getReader();
+ TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);
+
+ assertTrue(te.seekExact(new BytesRef("mo")));
+ assertEquals(27, te.ord());
+
+ te.seekExact(54);
+ assertEquals(new BytesRef("s"), te.term());
+
+ Collections.sort(terms);
+
+ for(int i=terms.size()-1;i>=0;i--) {
+ te.seekExact(i);
+ assertEquals(i, te.ord());
+ assertEquals(terms.get(i), te.term().utf8ToString());
+ }
+
+ int iters = atLeast(1000);
+ for(int iter=0;iter<iters;iter++) {
+ int ord = random().nextInt(terms.size());
+ BytesRef term = new BytesRef(terms.get(ord));
+ if (random().nextBoolean()) {
+ if (VERBOSE) {
+ System.out.println("TEST: iter=" + iter + " seek to ord=" + ord + " of " + terms.size());
+ }
+ te.seekExact(ord);
+ } else {
+ if (VERBOSE) {
+ System.out.println("TEST: iter=" + iter + " seek to term=" + terms.get(ord) + " ord=" + ord + " of " + terms.size());
+ }
+ te.seekExact(term);
+ }
+ assertEquals(ord, te.ord());
+ assertEquals(term, te.term());
+ }
+
+ r.close();
+ w.close();
+ dir.close();
+ }
+
+ public void testThreeBlocks() throws Exception {
+ Directory dir = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+ List<String> terms = new ArrayList<>();
+ for(int i=0;i<36;i++) {
+ Document doc = new Document();
+ String term = "" + (char) (97+i);
+ terms.add(term);
+ if (VERBOSE) {
+ System.out.println("i=" + i + " term=" + term);
+ }
+ doc.add(newTextField("field", term, Field.Store.NO));
+ w.addDocument(doc);
+ }
+ for(int i=0;i<36;i++) {
+ Document doc = new Document();
+ String term = "m" + (char) (97+i);
+ terms.add(term);
+ if (VERBOSE) {
+ System.out.println("i=" + i + " term=" + term);
+ }
+ doc.add(newTextField("field", term, Field.Store.NO));
+ w.addDocument(doc);
+ }
+ for(int i=0;i<36;i++) {
+ Document doc = new Document();
+ String term = "mo" + (char) (97+i);
+ terms.add(term);
+ if (VERBOSE) {
+ System.out.println("i=" + i + " term=" + term);
+ }
+ doc.add(newTextField("field", term, Field.Store.NO));
+ w.addDocument(doc);
+ }
+ w.forceMerge(1);
+ IndexReader r = w.getReader();
+ TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);
+
+ if (VERBOSE) {
+ while (te.next() != null) {
+ System.out.println("TERM: " + te.ord() + " " + te.term().utf8ToString());
+ }
+ }
+
+ assertTrue(te.seekExact(new BytesRef("mo")));
+ assertEquals(27, te.ord());
+
+ te.seekExact(90);
+ assertEquals(new BytesRef("s"), te.term());
+
+ testEnum(te, terms);
+
+ r.close();
+ w.close();
+ dir.close();
+ }
+
+ private void testEnum(TermsEnum te, List<String> terms) throws IOException {
+ Collections.sort(terms);
+ for(int i=terms.size()-1;i>=0;i--) {
+ if (VERBOSE) {
+ System.out.println("TEST: seek to ord=" + i);
+ }
+ te.seekExact(i);
+ assertEquals(i, te.ord());
+ assertEquals(terms.get(i), te.term().utf8ToString());
+ }
+
+ int iters = atLeast(1000);
+ for(int iter=0;iter<iters;iter++) {
+ int ord = random().nextInt(terms.size());
+ if (random().nextBoolean()) {
+ te.seekExact(ord);
+ assertEquals(terms.get(ord), te.term().utf8ToString());
+ } else {
+ te.seekExact(new BytesRef(terms.get(ord)));
+ assertEquals(ord, te.ord());
+ }
+ }
+ }
+
+ public void testFloorBlocks() throws Exception {
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+ IndexWriter w = new IndexWriter(dir, iwc);
+ for(int i=0;i<128;i++) {
+ Document doc = new Document();
+ String term = "" + (char) i;
+ if (VERBOSE) {
+ System.out.println("i=" + i + " term=" + term + " bytes=" + new BytesRef(term));
+ }
+ doc.add(newStringField("field", term, Field.Store.NO));
+ w.addDocument(doc);
+ }
+ w.forceMerge(1);
+ IndexReader r = DirectoryReader.open(w, true);
+ TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);
+
+ if (VERBOSE) {
+ BytesRef term;
+ while ((term = te.next()) != null) {
+ System.out.println(" " + te.ord() + ": " + term.utf8ToString());
+ }
+ }
+
+ assertTrue(te.seekExact(new BytesRef("a")));
+ assertEquals(97, te.ord());
+
+ te.seekExact(98);
+ assertEquals(new BytesRef("b"), te.term());
+
+ assertTrue(te.seekExact(new BytesRef("z")));
+ assertEquals(122, te.ord());
+
+ r.close();
+ w.close();
+ dir.close();
+ }
+
+ public void testNonRootFloorBlocks() throws Exception {
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+ IndexWriter w = new IndexWriter(dir, iwc);
+ List<String> terms = new ArrayList<>();
+ for(int i=0;i<36;i++) {
+ Document doc = new Document();
+ String term = "" + (char) (97+i);
+ terms.add(term);
+ if (VERBOSE) {
+ System.out.println("i=" + i + " term=" + term);
+ }
+ doc.add(newTextField("field", term, Field.Store.NO));
+ w.addDocument(doc);
+ }
+ for(int i=0;i<128;i++) {
+ Document doc = new Document();
+ String term = "m" + (char) i;
+ terms.add(term);
+ if (VERBOSE) {
+ System.out.println("i=" + i + " term=" + term + " bytes=" + new BytesRef(term));
+ }
+ doc.add(newStringField("field", term, Field.Store.NO));
+ w.addDocument(doc);
+ }
+ w.forceMerge(1);
+ IndexReader r = DirectoryReader.open(w, true);
+ TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);
+
+ BytesRef term;
+ int ord = 0;
+ while ((term = te.next()) != null) {
+ if (VERBOSE) {
+ System.out.println("TEST: " + te.ord() + ": " + term.utf8ToString());
+ }
+ assertEquals(ord, te.ord());
+ ord++;
+ }
+
+ testEnum(te, terms);
+
+ r.close();
+ w.close();
+ dir.close();
+ }
+
+ public void testSeveralNonRootBlocks() throws Exception {
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+ IndexWriter w = new IndexWriter(dir, iwc);
+ List<String> terms = new ArrayList<>();
+ for(int i=0;i<30;i++) {
+ for(int j=0;j<30;j++) {
+ Document doc = new Document();
+ String term = "" + (char) (97+i) + (char) (97+j);
+ terms.add(term);
+ if (VERBOSE) {
+ System.out.println("term=" + term);
+ }
+ doc.add(newTextField("body", term, Field.Store.NO));
+ w.addDocument(doc);
+ }
+ }
+ w.forceMerge(1);
+ IndexReader r = DirectoryReader.open(w, true);
+ TermsEnum te = MultiFields.getTerms(r, "body").iterator(null);
+
+ for(int i=0;i<30;i++) {
+ for(int j=0;j<30;j++) {
+ String term = "" + (char) (97+i) + (char) (97+j);
+ if (VERBOSE) {
+ System.out.println("TEST: check term=" + term);
+ }
+ assertEquals(term, te.next().utf8ToString());
+ assertEquals(30*i+j, te.ord());
+ }
+ }
+
+ testEnum(te, terms);
+
+ te.seekExact(0);
+ assertEquals("aa", te.term().utf8ToString());
+
+ r.close();
+ w.close();
+ dir.close();
+ }
+}
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java?rev=1612080&r1=1612079&r2=1612080&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java Sun Jul 20 12:08:32 2014
@@ -222,6 +222,7 @@ final class SegmentTermsEnumFrame {
if (isFloor) {
floorDataReader.rewind();
numFollowFloorBlocks = floorDataReader.readVInt();
+ assert numFollowFloorBlocks > 0;
nextFloorLabel = floorDataReader.readByte() & 0xff;
}
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java?rev=1612080&r1=1612079&r2=1612080&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java Sun Jul 20 12:08:32 2014
@@ -43,6 +43,7 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CommandLineUtil;
import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LongBitSet;
import org.apache.lucene.util.StringHelper;
@@ -329,6 +330,20 @@ public class CheckIndex {
return crossCheckTermVectors;
}
+ private boolean failFast;
+
+ /** If true, just throw the original exception immediately when
+ * corruption is detected, rather than continuing to iterate to other
+ * segments looking for more corruption. */
+ public void setFailFast(boolean v) {
+ failFast = v;
+ }
+
+ /** See {@link #setFailFast}. */
+ public boolean getFailFast() {
+ return failFast;
+ }
+
private boolean verbose;
/** Set infoStream where messages should go. If null, no
@@ -382,6 +397,9 @@ public class CheckIndex {
try {
sis.read(dir);
} catch (Throwable t) {
+ if (failFast) {
+ IOUtils.reThrow(t);
+ }
msg(infoStream, "ERROR: could not read any segments file in directory");
result.missingSegments = true;
if (infoStream != null)
@@ -417,6 +435,9 @@ public class CheckIndex {
try {
input = dir.openInput(segmentsFileName, IOContext.READONCE);
} catch (Throwable t) {
+ if (failFast) {
+ IOUtils.reThrow(t);
+ }
msg(infoStream, "ERROR: could not open segments file in directory");
if (infoStream != null)
t.printStackTrace(infoStream);
@@ -427,6 +448,9 @@ public class CheckIndex {
try {
format = input.readInt();
} catch (Throwable t) {
+ if (failFast) {
+ IOUtils.reThrow(t);
+ }
msg(infoStream, "ERROR: could not read segment file version in directory");
if (infoStream != null)
t.printStackTrace(infoStream);
@@ -607,18 +631,18 @@ public class CheckIndex {
segInfoStat.numFields = fieldInfos.size();
// Test Field Norms
- segInfoStat.fieldNormStatus = testFieldNorms(reader, infoStream);
+ segInfoStat.fieldNormStatus = testFieldNorms(reader, infoStream, failFast);
// Test the Term Index
- segInfoStat.termIndexStatus = testPostings(reader, infoStream, verbose);
+ segInfoStat.termIndexStatus = testPostings(reader, infoStream, verbose, failFast);
// Test Stored Fields
- segInfoStat.storedFieldStatus = testStoredFields(reader, infoStream);
+ segInfoStat.storedFieldStatus = testStoredFields(reader, infoStream, failFast);
// Test Term Vectors
- segInfoStat.termVectorStatus = testTermVectors(reader, infoStream, verbose, crossCheckTermVectors);
+ segInfoStat.termVectorStatus = testTermVectors(reader, infoStream, verbose, crossCheckTermVectors, failFast);
- segInfoStat.docValuesStatus = testDocValues(reader, infoStream);
+ segInfoStat.docValuesStatus = testDocValues(reader, infoStream, failFast);
// Rethrow the first exception we encountered
// This will cause stats for failed segments to be incremented properly
@@ -637,6 +661,9 @@ public class CheckIndex {
msg(infoStream, "");
} catch (Throwable t) {
+ if (failFast) {
+ IOUtils.reThrow(t);
+ }
msg(infoStream, "FAILED");
String comment;
comment = "fixIndex() would remove reference to this segment";
@@ -678,7 +705,7 @@ public class CheckIndex {
* Test field norms.
* @lucene.experimental
*/
- public static Status.FieldNormStatus testFieldNorms(AtomicReader reader, PrintStream infoStream) {
+ public static Status.FieldNormStatus testFieldNorms(AtomicReader reader, PrintStream infoStream, boolean failFast) throws IOException {
final Status.FieldNormStatus status = new Status.FieldNormStatus();
try {
@@ -699,6 +726,9 @@ public class CheckIndex {
msg(infoStream, "OK [" + status.totFields + " fields]");
} catch (Throwable e) {
+ if (failFast) {
+ IOUtils.reThrow(e);
+ }
msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
status.error = e;
if (infoStream != null) {
@@ -825,6 +855,7 @@ public class CheckIndex {
long sumTotalTermFreq = 0;
long sumDocFreq = 0;
+ long upto = 0;
FixedBitSet visitedDocs = new FixedBitSet(maxDoc);
while(true) {
@@ -832,7 +863,7 @@ public class CheckIndex {
if (term == null) {
break;
}
-
+
assert term.isValid();
// make sure terms arrive in order according to
@@ -1267,15 +1298,15 @@ public class CheckIndex {
* Test the term index.
* @lucene.experimental
*/
- public static Status.TermIndexStatus testPostings(AtomicReader reader, PrintStream infoStream) {
- return testPostings(reader, infoStream, false);
+ public static Status.TermIndexStatus testPostings(AtomicReader reader, PrintStream infoStream) throws IOException {
+ return testPostings(reader, infoStream, false, false);
}
/**
* Test the term index.
* @lucene.experimental
*/
- public static Status.TermIndexStatus testPostings(AtomicReader reader, PrintStream infoStream, boolean verbose) {
+ public static Status.TermIndexStatus testPostings(AtomicReader reader, PrintStream infoStream, boolean verbose, boolean failFast) throws IOException {
// TODO: we should go and verify term vectors match, if
// crossCheckTermVectors is on...
@@ -1299,6 +1330,9 @@ public class CheckIndex {
checkFields(fields, null, maxDoc, fieldInfos, true, false, infoStream, verbose);
}
} catch (Throwable e) {
+ if (failFast) {
+ IOUtils.reThrow(e);
+ }
msg(infoStream, "ERROR: " + e);
status = new Status.TermIndexStatus();
status.error = e;
@@ -1314,7 +1348,7 @@ public class CheckIndex {
* Test stored fields.
* @lucene.experimental
*/
- public static Status.StoredFieldStatus testStoredFields(AtomicReader reader, PrintStream infoStream) {
+ public static Status.StoredFieldStatus testStoredFields(AtomicReader reader, PrintStream infoStream, boolean failFast) throws IOException {
final Status.StoredFieldStatus status = new Status.StoredFieldStatus();
try {
@@ -1342,6 +1376,9 @@ public class CheckIndex {
msg(infoStream, "OK [" + status.totFields + " total field count; avg " +
NumberFormat.getInstance(Locale.ROOT).format((((float) status.totFields)/status.docCount)) + " fields per doc]");
} catch (Throwable e) {
+ if (failFast) {
+ IOUtils.reThrow(e);
+ }
msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
status.error = e;
if (infoStream != null) {
@@ -1357,7 +1394,8 @@ public class CheckIndex {
* @lucene.experimental
*/
public static Status.DocValuesStatus testDocValues(AtomicReader reader,
- PrintStream infoStream) {
+ PrintStream infoStream,
+ boolean failFast) throws IOException {
final Status.DocValuesStatus status = new Status.DocValuesStatus();
try {
if (infoStream != null) {
@@ -1385,6 +1423,9 @@ public class CheckIndex {
+ status.totalSortedNumericFields + " SORTED_NUMERIC; "
+ status.totalSortedSetFields + " SORTED_SET]");
} catch (Throwable e) {
+ if (failFast) {
+ IOUtils.reThrow(e);
+ }
msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
status.error = e;
if (infoStream != null) {
@@ -1624,15 +1665,15 @@ public class CheckIndex {
* Test term vectors.
* @lucene.experimental
*/
- public static Status.TermVectorStatus testTermVectors(AtomicReader reader, PrintStream infoStream) {
- return testTermVectors(reader, infoStream, false, false);
+ public static Status.TermVectorStatus testTermVectors(AtomicReader reader, PrintStream infoStream) throws IOException {
+ return testTermVectors(reader, infoStream, false, false, false);
}
/**
* Test term vectors.
* @lucene.experimental
*/
- public static Status.TermVectorStatus testTermVectors(AtomicReader reader, PrintStream infoStream, boolean verbose, boolean crossCheckTermVectors) {
+ public static Status.TermVectorStatus testTermVectors(AtomicReader reader, PrintStream infoStream, boolean verbose, boolean crossCheckTermVectors, boolean failFast) throws IOException {
final Status.TermVectorStatus status = new Status.TermVectorStatus();
final FieldInfos fieldInfos = reader.getFieldInfos();
final Bits onlyDocIsDeleted = new FixedBitSet(1);
@@ -1844,6 +1885,9 @@ public class CheckIndex {
msg(infoStream, "OK [" + status.totVectors + " total vector count; avg " +
NumberFormat.getInstance(Locale.ROOT).format(vectorAvg) + " term/freq vector fields per doc]");
} catch (Throwable e) {
+ if (failFast) {
+ IOUtils.reThrow(e);
+ }
msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
status.error = e;
if (infoStream != null) {
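All of the test* changes above thread a single failFast flag through CheckIndex: on the first detected corruption the original Throwable is rethrown via IOUtils.reThrow instead of being recorded in the status and iterated past. A minimal usage sketch against the 4.x API (the index path is a placeholder):

    import java.io.File;

    import org.apache.lucene.index.CheckIndex;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public class FailFastCheck {
      public static void main(String[] args) throws Exception {
        try (Directory dir = FSDirectory.open(new File("/path/to/index"))) {
          CheckIndex checker = new CheckIndex(dir);
          checker.setFailFast(true); // rethrow on first detected corruption
          CheckIndex.Status status = checker.checkIndex();
          System.out.println("clean=" + status.clean);
        }
      }
    }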
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/ByteSequenceOutputs.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/ByteSequenceOutputs.java?rev=1612080&r1=1612079&r2=1612080&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/ByteSequenceOutputs.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/ByteSequenceOutputs.java Sun Jul 20 12:08:32 2014
@@ -23,6 +23,7 @@ import org.apache.lucene.store.DataInput
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.StringHelper;
/**
* An FST {@link Outputs} implementation where each output
@@ -80,13 +81,16 @@ public final class ByteSequenceOutputs e
if (inc == NO_OUTPUT) {
// no prefix removed
return output;
- } else if (inc.length == output.length) {
- // entire output removed
- return NO_OUTPUT;
} else {
- assert inc.length < output.length: "inc.length=" + inc.length + " vs output.length=" + output.length;
- assert inc.length > 0;
- return new BytesRef(output.bytes, output.offset + inc.length, output.length-inc.length);
+ assert StringHelper.startsWith(output, inc);
+ if (inc.length == output.length) {
+ // entire output removed
+ return NO_OUTPUT;
+ } else {
+ assert inc.length < output.length: "inc.length=" + inc.length + " vs output.length=" + output.length;
+ assert inc.length > 0;
+ return new BytesRef(output.bytes, output.offset + inc.length, output.length-inc.length);
+ }
}
}
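The restructured subtract() above now asserts that inc really is a prefix of output before stripping it, which has always been the method's contract. A tiny worked example of that contract (sketch only):

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.fst.ByteSequenceOutputs;

    public class SubtractDemo {
      public static void main(String[] args) {
        ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
        // "foo" is a prefix of "foobar", so subtracting it leaves "bar":
        BytesRef rest = outputs.subtract(new BytesRef("foobar"), new BytesRef("foo"));
        System.out.println(rest.utf8ToString()); // bar
      }
    }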
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java?rev=1612080&r1=1612079&r2=1612080&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java Sun Jul 20 12:08:32 2014
@@ -85,7 +85,10 @@ public final class FST<T> implements Acc
// TODO: we can free up a bit if we can nuke this:
final static int BIT_STOP_NODE = 1 << 3;
- final static int BIT_ARC_HAS_OUTPUT = 1 << 4;
+
+ /** This flag is set if the arc has an output. */
+ public final static int BIT_ARC_HAS_OUTPUT = 1 << 4;
+
final static int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5;
// Arcs are stored as fixed-size (per entry) array, so
@@ -196,11 +199,22 @@ public final class FST<T> implements Acc
// address (into the byte[]), or ord/address if label == END_LABEL
long nextArc;
- // This is non-zero if current arcs are fixed array:
- long posArcsStart;
- int bytesPerArc;
- int arcIdx;
- int numArcs;
+ /** Where the first arc in the array starts; only valid if
+ * bytesPerArc != 0 */
+ public long posArcsStart;
+
+ /** Non-zero if this arc is part of an array, which means all
+ * arcs for the node are encoded with a fixed number of bytes so
+ * that we can access them randomly by index. We do this when there
+ * are enough arcs leaving one node. It wastes some bytes but gives
+ * faster lookups. */
+ public int bytesPerArc;
+
+ /** Where we are in the array; only valid if bytesPerArc != 0. */
+ public int arcIdx;
+
+ /** How many arcs in the array; only valid if bytesPerArc != 0. */
+ public int numArcs;
/** Returns this */
public Arc<T> copyFrom(Arc<T> other) {
@@ -644,7 +658,8 @@ public final class FST<T> implements Acc
}
}
- int readLabel(DataInput in) throws IOException {
+ /** Reads one BYTE1/2/4 label from the provided {@link DataInput}. */
+ public int readLabel(DataInput in) throws IOException {
final int v;
if (inputType == INPUT_TYPE.BYTE1) {
// Unsigned byte:
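The newly public Arc fields describe the fixed-width arc-array layout: when bytesPerArc != 0, arc k of a node starts at a fixed offset relative to posArcsStart, which is what makes access by index, and binary search by label, possible. An illustrative sketch of that addressing idea over a plain byte[] (not the real FST reader, which walks its block in reverse; the label-as-first-byte layout is an assumption for illustration):

    public class FixedWidthSearch {
      // Binary search over fixed-width records -- the trick a non-zero
      // bytesPerArc enables inside the FST:
      static int findLabel(byte[] block, int start, int numArcs, int bytesPerArc, int targetLabel) {
        int lo = 0, hi = numArcs - 1;
        while (lo <= hi) {
          int mid = (lo + hi) >>> 1;
          // Assumption: each record begins with its one-byte label.
          int label = block[start + mid * bytesPerArc] & 0xFF;
          if (label < targetLabel) lo = mid + 1;
          else if (label > targetLabel) hi = mid - 1;
          else return mid; // arc index found; decode the record at this offset
        }
        return -1; // no arc with this label
      }

      public static void main(String[] args) {
        byte[] block = {'a', 0, 0, 'c', 0, 0, 'f', 0, 0}; // 3 records, 3 bytes each
        System.out.println(findLabel(block, 0, 3, 3, 'c')); // prints 1
      }
    }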
Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java?rev=1612080&r1=1612079&r2=1612080&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java Sun Jul 20 12:08:32 2014
@@ -36,6 +36,8 @@ import org.apache.lucene.codecs.blockter
import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexWriter;
import org.apache.lucene.codecs.blocktree.BlockTreeTermsReader;
import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter;
+import org.apache.lucene.codecs.blocktreeords.OrdsBlockTreeTermsReader;
+import org.apache.lucene.codecs.blocktreeords.OrdsBlockTreeTermsWriter;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
import org.apache.lucene.codecs.memory.FSTOrdTermsReader;
@@ -128,7 +130,7 @@ public final class MockRandomPostingsFor
}
final FieldsConsumer fields;
- final int t1 = random.nextInt(4);
+ final int t1 = random.nextInt(5);
if (t1 == 0) {
boolean success = false;
@@ -171,7 +173,7 @@ public final class MockRandomPostingsFor
postingsWriter.close();
}
}
- } else {
+ } else if (t1 == 3) {
if (LuceneTestCase.VERBOSE) {
System.out.println("MockRandomCodec: writing Block terms dict");
@@ -241,6 +243,30 @@ public final class MockRandomPostingsFor
}
}
}
+ } else if (t1 == 4) {
+ // Use OrdsBlockTree terms dict
+ if (LuceneTestCase.VERBOSE) {
+ System.out.println("MockRandomCodec: writing OrdsBlockTree");
+ }
+
+ // TODO: would be nice to allow minTermsInBlock=1, but that is
+ // very slow to write
+ final int minTermsInBlock = TestUtil.nextInt(random, 2, 100);
+ final int maxTermsInBlock = Math.max(2, (minTermsInBlock-1)*2 + random.nextInt(100));
+
+ boolean success = false;
+ try {
+ fields = new OrdsBlockTreeTermsWriter(state, postingsWriter, minTermsInBlock, maxTermsInBlock);
+ success = true;
+ } finally {
+ if (!success) {
+ postingsWriter.close();
+ }
+ }
+
+ } else {
+ // BUG!
+ throw new AssertionError();
}
return fields;
@@ -275,7 +301,7 @@ public final class MockRandomPostingsFor
}
final FieldsProducer fields;
- final int t1 = random.nextInt(4);
+ final int t1 = random.nextInt(5);
if (t1 == 0) {
boolean success = false;
try {
@@ -316,7 +342,7 @@ public final class MockRandomPostingsFor
postingsReader.close();
}
}
- } else {
+ } else if (t1 == 3) {
if (LuceneTestCase.VERBOSE) {
System.out.println("MockRandomCodec: reading Block terms dict");
@@ -380,6 +406,29 @@ public final class MockRandomPostingsFor
}
}
}
+ } else if (t1 == 4) {
+ // Use OrdsBlockTree terms dict
+ if (LuceneTestCase.VERBOSE) {
+ System.out.println("MockRandomCodec: reading OrdsBlockTree terms dict");
+ }
+
+ boolean success = false;
+ try {
+ fields = new OrdsBlockTreeTermsReader(state.directory,
+ state.fieldInfos,
+ state.segmentInfo,
+ postingsReader,
+ state.context,
+ state.segmentSuffix);
+ success = true;
+ } finally {
+ if (!success) {
+ postingsReader.close();
+ }
+ }
+ } else {
+ // BUG!
+ throw new AssertionError();
}
return fields;
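A note on the block sizes drawn above: the expression
(minTermsInBlock-1)*2 + random.nextInt(100) is not arbitrary; it keeps the
random pair inside the invariants the block tree writers validate, so the
writer constructor never rejects it. A hedged sketch of those checks (see
BlockTreeTermsWriter for the authoritative versions):

    final class BlockSizeSketch {
      static void validate(int minItemsInBlock, int maxItemsInBlock) {
        if (minItemsInBlock < 2) {
          throw new IllegalArgumentException("minItemsInBlock must be >= 2");
        }
        if (minItemsInBlock > maxItemsInBlock) {
          throw new IllegalArgumentException("minItemsInBlock must be <= maxItemsInBlock");
        }
        // Without this, an overfull block could not be split into sub-blocks
        // that each still hold at least minItemsInBlock entries:
        if (2 * (minItemsInBlock - 1) > maxItemsInBlock) {
          throw new IllegalArgumentException("maxItemsInBlock must be at least 2*(minItemsInBlock-1)");
        }
      }
    }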
Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java?rev=1612080&r1=1612079&r2=1612080&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java Sun Jul 20 12:08:32 2014
@@ -59,6 +59,11 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.RamUsageTester;
import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.AutomatonTestUtil.RandomAcceptedStrings;
+import org.apache.lucene.util.automaton.AutomatonTestUtil;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.junit.AfterClass;
import org.junit.BeforeClass;
@@ -293,17 +298,28 @@ public abstract class BasePostingsFormat
}
private static class FieldAndTerm {
- String field;
- BytesRef term;
+ final String field;
+ final BytesRef term;
+ final long ord;
- public FieldAndTerm(String field, BytesRef term) {
+ public FieldAndTerm(String field, BytesRef term, long ord) {
this.field = field;
this.term = BytesRef.deepCopyOf(term);
+ this.ord = ord;
+ }
+ }
+
+ private static class SeedAndOrd {
+ final long seed;
+ long ord;
+
+ public SeedAndOrd(long seed) {
+ this.seed = seed;
}
}
// Holds all postings:
- private static Map<String,SortedMap<BytesRef,Long>> fields;
+ private static Map<String,SortedMap<BytesRef,SeedAndOrd>> fields;
private static FieldInfos fieldInfos;
@@ -359,7 +375,7 @@ public abstract class BasePostingsFormat
null, DocValuesType.NUMERIC, -1, null);
fieldUpto++;
- SortedMap<BytesRef,Long> postings = new TreeMap<>();
+ SortedMap<BytesRef,SeedAndOrd> postings = new TreeMap<>();
fields.put(field, postings);
Set<String> seenTerms = new HashSet<>();
@@ -370,7 +386,9 @@ public abstract class BasePostingsFormat
numTerms = TestUtil.nextInt(random(), 2, 20);
}
- for(int termUpto=0;termUpto<numTerms;termUpto++) {
+ while (postings.size() < numTerms) {
+ int termUpto = postings.size();
+ // Cannot contain surrogates, else default Java string sort order (by UTF-16 code unit) is different from Lucene's (by UTF-8 byte):
String term = TestUtil.randomSimpleString(random());
if (seenTerms.contains(term)) {
continue;
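Why surrogates would break the ordering here: Java's String.compareTo compares
UTF-16 code units, while Lucene returns terms in UTF-8 byte (equivalently, code
point) order, and the two disagree exactly when supplementary characters are
involved. A small self-contained illustration:

    public class SurrogateOrderDemo {
      public static void main(String[] args) {
        String bmp = "\uFF61";  // U+FF61, a BMP character (UTF-8: EF BD A1)
        String supp = new String(Character.toChars(0x10000));  // U+10000, a surrogate pair

        // UTF-16 code unit order: the high surrogate 0xD800 sorts before 0xFF61:
        System.out.println(supp.compareTo(bmp) < 0);  // prints true

        // But in code point / UTF-8 byte order (F0 90 80 80 vs EF BD A1),
        // U+10000 sorts after U+FF61 (the opposite), which is the order
        // Lucene's TermsEnum would return these terms in.
      }
    }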
@@ -392,7 +410,7 @@ public abstract class BasePostingsFormat
}
long termSeed = random().nextLong();
- postings.put(new BytesRef(term), termSeed);
+ postings.put(new BytesRef(term), new SeedAndOrd(termSeed));
// NOTE: sort of silly: we enum all the docs just to
// get the maxDoc
@@ -404,6 +422,12 @@ public abstract class BasePostingsFormat
}
maxDoc = Math.max(lastDoc, maxDoc);
}
+
+ // assign ords
+ long ord = 0;
+ for(SeedAndOrd ent : postings.values()) {
+ ent.ord = ord++;
+ }
}
fieldInfos = new FieldInfos(fieldInfoArray);
@@ -420,10 +444,11 @@ public abstract class BasePostingsFormat
}
allTerms = new ArrayList<>();
- for(Map.Entry<String,SortedMap<BytesRef,Long>> fieldEnt : fields.entrySet()) {
+ for(Map.Entry<String,SortedMap<BytesRef,SeedAndOrd>> fieldEnt : fields.entrySet()) {
String field = fieldEnt.getKey();
- for(Map.Entry<BytesRef,Long> termEnt : fieldEnt.getValue().entrySet()) {
- allTerms.add(new FieldAndTerm(field, termEnt.getKey()));
+ long ord = 0;
+ for(Map.Entry<BytesRef,SeedAndOrd> termEnt : fieldEnt.getValue().entrySet()) {
+ allTerms.add(new FieldAndTerm(field, termEnt.getKey(), ord++));
}
}
@@ -441,12 +466,12 @@ public abstract class BasePostingsFormat
}
private static class SeedFields extends Fields {
- final Map<String,SortedMap<BytesRef,Long>> fields;
+ final Map<String,SortedMap<BytesRef,SeedAndOrd>> fields;
final FieldInfos fieldInfos;
final IndexOptions maxAllowed;
final boolean allowPayloads;
- public SeedFields(Map<String,SortedMap<BytesRef,Long>> fields, FieldInfos fieldInfos, IndexOptions maxAllowed, boolean allowPayloads) {
+ public SeedFields(Map<String,SortedMap<BytesRef,SeedAndOrd>> fields, FieldInfos fieldInfos, IndexOptions maxAllowed, boolean allowPayloads) {
this.fields = fields;
this.fieldInfos = fieldInfos;
this.maxAllowed = maxAllowed;
@@ -460,7 +485,7 @@ public abstract class BasePostingsFormat
@Override
public Terms terms(String field) {
- SortedMap<BytesRef,Long> terms = fields.get(field);
+ SortedMap<BytesRef,SeedAndOrd> terms = fields.get(field);
if (terms == null) {
return null;
} else {
@@ -475,12 +500,12 @@ public abstract class BasePostingsFormat
}
private static class SeedTerms extends Terms {
- final SortedMap<BytesRef,Long> terms;
+ final SortedMap<BytesRef,SeedAndOrd> terms;
final FieldInfo fieldInfo;
final IndexOptions maxAllowed;
final boolean allowPayloads;
- public SeedTerms(SortedMap<BytesRef,Long> terms, FieldInfo fieldInfo, IndexOptions maxAllowed, boolean allowPayloads) {
+ public SeedTerms(SortedMap<BytesRef,SeedAndOrd> terms, FieldInfo fieldInfo, IndexOptions maxAllowed, boolean allowPayloads) {
this.terms = terms;
this.fieldInfo = fieldInfo;
this.maxAllowed = maxAllowed;
@@ -545,15 +570,15 @@ public abstract class BasePostingsFormat
}
private static class SeedTermsEnum extends TermsEnum {
- final SortedMap<BytesRef,Long> terms;
+ final SortedMap<BytesRef,SeedAndOrd> terms;
final IndexOptions maxAllowed;
final boolean allowPayloads;
- private Iterator<Map.Entry<BytesRef,Long>> iterator;
+ private Iterator<Map.Entry<BytesRef,SeedAndOrd>> iterator;
- private Map.Entry<BytesRef,Long> current;
+ private Map.Entry<BytesRef,SeedAndOrd> current;
- public SeedTermsEnum(SortedMap<BytesRef,Long> terms, IndexOptions maxAllowed, boolean allowPayloads) {
+ public SeedTermsEnum(SortedMap<BytesRef,SeedAndOrd> terms, IndexOptions maxAllowed, boolean allowPayloads) {
this.terms = terms;
this.maxAllowed = maxAllowed;
this.allowPayloads = allowPayloads;
@@ -565,7 +590,7 @@ public abstract class BasePostingsFormat
@Override
public SeekStatus seekCeil(BytesRef text) {
- SortedMap<BytesRef,Long> tailMap = terms.tailMap(text);
+ SortedMap<BytesRef,SeedAndOrd> tailMap = terms.tailMap(text);
if (tailMap.isEmpty()) {
return SeekStatus.END;
} else {
@@ -600,7 +625,7 @@ public abstract class BasePostingsFormat
@Override
public long ord() {
- throw new UnsupportedOperationException();
+ return current.getValue().ord;
}
@Override
@@ -621,7 +646,7 @@ public abstract class BasePostingsFormat
if ((flags & DocsEnum.FLAG_FREQS) != 0 && maxAllowed.compareTo(IndexOptions.DOCS_AND_FREQS) < 0) {
return null;
}
- return getSeedPostings(current.getKey().utf8ToString(), current.getValue(), false, maxAllowed, allowPayloads);
+ return getSeedPostings(current.getKey().utf8ToString(), current.getValue().seed, false, maxAllowed, allowPayloads);
}
@Override
@@ -638,7 +663,7 @@ public abstract class BasePostingsFormat
if ((flags & DocsAndPositionsEnum.FLAG_PAYLOADS) != 0 && allowPayloads == false) {
return null;
}
- return getSeedPostings(current.getKey().utf8ToString(), current.getValue(), false, maxAllowed, allowPayloads);
+ return getSeedPostings(current.getKey().utf8ToString(), current.getValue().seed, false, maxAllowed, allowPayloads);
}
}
@@ -766,7 +791,7 @@ public abstract class BasePostingsFormat
// NOTE: can be empty list if we are using liveDocs:
SeedPostings expected = getSeedPostings(term.utf8ToString(),
- fields.get(field).get(term),
+ fields.get(field).get(term).seed,
useLiveDocs,
maxIndexOptions,
true);
@@ -1104,12 +1129,16 @@ public abstract class BasePostingsFormat
// Test random terms/fields:
List<TermState> termStates = new ArrayList<>();
List<FieldAndTerm> termStateTerms = new ArrayList<>();
+
+ boolean supportsOrds = true;
Collections.shuffle(allTerms, random());
int upto = 0;
while (upto < allTerms.size()) {
boolean useTermState = termStates.size() != 0 && random().nextInt(5) == 1;
+ boolean useTermOrd = supportsOrds && useTermState == false && random().nextInt(5) == 1;
+
FieldAndTerm fieldAndTerm;
TermsEnum termsEnum;
@@ -1119,7 +1148,11 @@ public abstract class BasePostingsFormat
// Seek by random field+term:
fieldAndTerm = allTerms.get(upto++);
if (VERBOSE) {
- System.out.println("\nTEST: seek to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString() );
+ if (useTermOrd) {
+ System.out.println("\nTEST: seek to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString() + " using ord=" + fieldAndTerm.ord);
+ } else {
+ System.out.println("\nTEST: seek to term=" + fieldAndTerm.field + ":" + fieldAndTerm.term.utf8ToString() );
+ }
}
} else {
// Seek by previous saved TermState
@@ -1136,11 +1169,38 @@ public abstract class BasePostingsFormat
termsEnum = terms.iterator(null);
if (!useTermState) {
- assertTrue(termsEnum.seekExact(fieldAndTerm.term));
+ if (useTermOrd) {
+ // Try seek by ord sometimes:
+ try {
+ termsEnum.seekExact(fieldAndTerm.ord);
+ } catch (UnsupportedOperationException uoe) {
+ supportsOrds = false;
+ assertTrue(termsEnum.seekExact(fieldAndTerm.term));
+ }
+ } else {
+ assertTrue(termsEnum.seekExact(fieldAndTerm.term));
+ }
} else {
termsEnum.seekExact(fieldAndTerm.term, termState);
}
+ long termOrd;
+ if (supportsOrds) {
+ try {
+ termOrd = termsEnum.ord();
+ } catch (UnsupportedOperationException uoe) {
+ supportsOrds = false;
+ termOrd = -1;
+ }
+ } else {
+ termOrd = -1;
+ }
+
+ if (termOrd != -1) {
+ // PostingsFormat supports ords
+ assertEquals(fieldAndTerm.ord, termsEnum.ord());
+ }
+
boolean savedTermState = false;
if (options.contains(Option.TERM_STATE) && !useTermState && random().nextInt(5) == 1) {
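The try/catch above is the contract for optional ord support: a TermsEnum that
keeps no term ords throws UnsupportedOperationException from ord() and
seekExact(long). The same fallback, packaged as a helper (a hypothetical
utility, not part of the test framework):

    import java.io.IOException;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.util.BytesRef;

    final class SeekHelper {
      /** Seeks by ord when the enum supports it, else falls back to the
       *  term bytes. Returns true if the term was found. */
      static boolean seek(TermsEnum te, long ord, BytesRef term) throws IOException {
        try {
          te.seekExact(ord);  // throws UOE if this format keeps no ords
          return true;
        } catch (UnsupportedOperationException uoe) {
          return te.seekExact(term);
        }
      }
    }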
@@ -1185,6 +1245,71 @@ public abstract class BasePostingsFormat
alwaysTestMax);
}
}
+
+ // Test Terms.intersect:
+ for(String field : fields.keySet()) {
+ while (true) {
+ Automaton a = AutomatonTestUtil.randomAutomaton(random());
+ CompiledAutomaton ca = new CompiledAutomaton(a);
+ if (ca.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
+ // Keep retrying until we get an A that will really "use" the PF's intersect code:
+ continue;
+ }
+ // System.out.println("A:\n" + a.toDot());
+
+ BytesRef startTerm = null;
+ if (random().nextBoolean()) {
+ RandomAcceptedStrings ras = new RandomAcceptedStrings(a);
+ for (int iter=0;iter<100;iter++) {
+ int[] codePoints = ras.getRandomAcceptedString(random());
+ if (codePoints.length == 0) {
+ continue;
+ }
+ startTerm = new BytesRef(UnicodeUtil.newString(codePoints, 0, codePoints.length));
+ break;
+ }
+ // Don't allow empty string startTerm:
+ if (startTerm == null) {
+ continue;
+ }
+ }
+ TermsEnum intersected = fieldsSource.terms(field).intersect(ca, startTerm);
+
+ Set<BytesRef> intersectedTerms = new HashSet<>();
+ BytesRef term;
+ while ((term = intersected.next()) != null) {
+ if (startTerm != null) {
+ // NOTE: not <=
+ assertTrue(startTerm.compareTo(term) < 0);
+ }
+ intersectedTerms.add(BytesRef.deepCopyOf(term));
+ verifyEnum(threadState,
+ field,
+ term,
+ intersected,
+ maxTestOptions,
+ maxIndexOptions,
+ options,
+ alwaysTestMax);
+ }
+
+ if (ca.runAutomaton == null) {
+ assertTrue(intersectedTerms.isEmpty());
+ } else {
+ for(BytesRef term2 : fields.get(field).keySet()) {
+ boolean expected;
+ if (startTerm != null && startTerm.compareTo(term2) >= 0) {
+ expected = false;
+ } else {
+ expected = ca.runAutomaton.run(term2.bytes, term2.offset, term2.length);
+ }
+ assertEquals("term=" + term2, expected, intersectedTerms.contains(term2));
+ }
+ }
+
+ break;
+ }
+ }
}
private void testFields(Fields fields) throws Exception {
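For reference, the intersect contract the new test exercises:
Terms.intersect(CompiledAutomaton, BytesRef startTerm) enumerates only terms
accepted by the automaton, and startTerm is exclusive: the first returned term
is strictly greater, which is what the "NOTE: not <=" assertion checks. A
hedged usage sketch (the Terms instance and the pattern are hypothetical):

    import java.io.IOException;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.automaton.CompiledAutomaton;
    import org.apache.lucene.util.automaton.RegExp;

    final class IntersectSketch {
      /** Prints every term matching fo*bar, in term (UTF-8 byte) order. */
      static void dumpMatches(Terms terms) throws IOException {
        CompiledAutomaton ca = new CompiledAutomaton(new RegExp("fo*bar").toAutomaton());
        // Only AUTOMATON_TYPE.NORMAL reaches the postings format's own
        // intersect code; SINGLE/PREFIX/ALL take cheaper dedicated paths,
        // which is why the test retries until it draws a NORMAL automaton.
        assert ca.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL;
        TermsEnum te = terms.intersect(ca, null);  // null startTerm: from the beginning
        BytesRef term;
        while ((term = te.next()) != null) {
          System.out.println(term.utf8ToString());
        }
      }
    }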
@@ -1284,7 +1409,7 @@ public abstract class BasePostingsFormat
}
}
- public void testEmptyField() throws Exception {
+ public void testJustEmptyField() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, null);
iwc.setCodec(getCodec());
@@ -1449,6 +1574,7 @@ public abstract class BasePostingsFormat
DocsEnum docs = null;
while(termsEnum.next() != null) {
BytesRef term = termsEnum.term();
+
if (random().nextBoolean()) {
docs = termsEnum.docs(null, docs, DocsEnum.FLAG_FREQS);
} else if (docs instanceof DocsAndPositionsEnum) {
@@ -1584,11 +1710,24 @@ public abstract class BasePostingsFormat
TermsEnum termsEnum = terms.iterator(null);
long termCount = 0;
+ boolean supportsOrds = true;
while(termsEnum.next() != null) {
BytesRef term = termsEnum.term();
- termCount++;
assertEquals(termFreqs.get(term.utf8ToString()).docFreq, termsEnum.docFreq());
assertEquals(termFreqs.get(term.utf8ToString()).totalTermFreq, termsEnum.totalTermFreq());
+ if (supportsOrds) {
+ long ord;
+ try {
+ ord = termsEnum.ord();
+ } catch (UnsupportedOperationException uoe) {
+ supportsOrds = false;
+ ord = -1;
+ }
+ if (ord != -1) {
+ assertEquals(termCount, ord);
+ }
+ }
+ termCount++;
}
assertEquals(termFreqs.size(), termCount);
Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java?rev=1612080&r1=1612079&r2=1612080&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java Sun Jul 20 12:08:32 2014
@@ -692,7 +692,7 @@ public class MockDirectoryWrapper extend
if (LuceneTestCase.VERBOSE) {
System.out.println("\nNOTE: MockDirectoryWrapper: now run CheckIndex");
}
- TestUtil.checkIndex(this, getCrossCheckTermVectorsOnClose());
+ TestUtil.checkIndex(this, getCrossCheckTermVectorsOnClose(), true);
// TODO: factor this out / share w/ TestIW.assertNoUnreferencedFiles
if (assertNoUnreferencedFilesOnClose) {
Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java?rev=1612080&r1=1612079&r2=1612080&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java Sun Jul 20 12:08:32 2014
@@ -28,7 +28,6 @@ import java.io.PrintStream;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.CharBuffer;
-import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashMap;
@@ -194,9 +193,16 @@ public final class TestUtil {
}
public static CheckIndex.Status checkIndex(Directory dir, boolean crossCheckTermVectors) throws IOException {
+ return checkIndex(dir, crossCheckTermVectors, false);
+ }
+
+ /** If failFast is true, an exception is thrown as soon as index corruption is detected, instead of moving on to other fields/segments to
+ * look for any other corruption. */
+ public static CheckIndex.Status checkIndex(Directory dir, boolean crossCheckTermVectors, boolean failFast) throws IOException {
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
CheckIndex checker = new CheckIndex(dir);
checker.setCrossCheckTermVectors(crossCheckTermVectors);
+ checker.setFailFast(failFast);
checker.setInfoStream(new PrintStream(bos, false, IOUtils.UTF_8), false);
CheckIndex.Status indexStatus = checker.checkIndex(null);
if (indexStatus == null || indexStatus.clean == false) {
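Usage of the new overload, as wired up above (dir is any open Directory): the
two-argument form now delegates with failFast=false, so existing callers keep
the old check-everything behavior, while MockDirectoryWrapper opts in to
failing fast on close:

    // Old behavior: check all segments/fields, report at the end:
    CheckIndex.Status status = TestUtil.checkIndex(dir, true);

    // New: throw as soon as the first corruption is hit:
    TestUtil.checkIndex(dir, true, true);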
@@ -224,24 +230,14 @@ public final class TestUtil {
PrintStream infoStream = new PrintStream(bos, false, IOUtils.UTF_8);
reader.checkIntegrity();
- FieldNormStatus fieldNormStatus = CheckIndex.testFieldNorms(reader, infoStream);
- TermIndexStatus termIndexStatus = CheckIndex.testPostings(reader, infoStream);
- StoredFieldStatus storedFieldStatus = CheckIndex.testStoredFields(reader, infoStream);
- TermVectorStatus termVectorStatus = CheckIndex.testTermVectors(reader, infoStream, false, crossCheckTermVectors);
- DocValuesStatus docValuesStatus = CheckIndex.testDocValues(reader, infoStream);
+ FieldNormStatus fieldNormStatus = CheckIndex.testFieldNorms(reader, infoStream, true);
+ TermIndexStatus termIndexStatus = CheckIndex.testPostings(reader, infoStream, false, true);
+ StoredFieldStatus storedFieldStatus = CheckIndex.testStoredFields(reader, infoStream, true);
+ TermVectorStatus termVectorStatus = CheckIndex.testTermVectors(reader, infoStream, false, crossCheckTermVectors, true);
+ DocValuesStatus docValuesStatus = CheckIndex.testDocValues(reader, infoStream, true);
- if (fieldNormStatus.error != null ||
- termIndexStatus.error != null ||
- storedFieldStatus.error != null ||
- termVectorStatus.error != null ||
- docValuesStatus.error != null) {
- System.out.println("CheckReader failed");
+ if (LuceneTestCase.INFOSTREAM) {
System.out.println(bos.toString(IOUtils.UTF_8));
- throw new RuntimeException("CheckReader failed");
- } else {
- if (LuceneTestCase.INFOSTREAM) {
- System.out.println(bos.toString(IOUtils.UTF_8));
- }
}
}