You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2014/10/28 23:52:50 UTC
svn commit: r1635002 [2/4] - in /lucene/dev/branches/lucene6005/lucene:
codecs/src/java/org/apache/lucene/codecs/blocktreeords/
codecs/src/java/org/apache/lucene/codecs/memory/
core/src/java/org/apache/lucene/codecs/
core/src/java/org/apache/lucene/cod...
Modified: lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java?rev=1635002&r1=1635001&r2=1635002&view=diff
==============================================================================
--- lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java (original)
+++ lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java Tue Oct 28 22:52:49 2014
@@ -23,6 +23,7 @@ import org.apache.lucene.index.DocsAndPo
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
@@ -30,23 +31,38 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
-import org.apache.lucene.util.automaton.CompiledAutomaton;
+import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RunAutomaton;
+import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
-// NOTE: cannot seek!
+/** This is used to implement efficient {@link Terms#intersect} for
+ * block-tree. Note that it cannot seek, except for the initial term on
+ * init. It just "nexts" through the intersection of the automaton and
+ * the terms. It does not use the terms index at all: on init, it
+ * loads the root block, and scans its way to the initial term.
+ * Likewise, in next it scans until it finds a term that matches the
+ * current automaton transition. If the index has auto-prefix terms
+ * (only for DOCS_ONLY fields currently) it will visit these terms
+ * when possible and then skip the real terms that the auto-prefix
+ * term matched. */
+
final class IntersectTermsEnum extends TermsEnum {
+
+ //static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
+
final IndexInput in;
final static Outputs<BytesRef> fstOutputs = ByteSequenceOutputs.getSingleton();
- private IntersectTermsEnumFrame[] stack;
+ IntersectTermsEnumFrame[] stack;
@SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc<BytesRef>[] arcs = new FST.Arc[5];
final RunAutomaton runAutomaton;
- final CompiledAutomaton compiledAutomaton;
+ final Automaton automaton;
+ final BytesRef commonSuffix;
private IntersectTermsEnumFrame currentFrame;
@@ -54,19 +70,33 @@ final class IntersectTermsEnum extends T
private final FST.BytesReader fstReader;
+ private final boolean allowAutoPrefixTerms;
+
final FieldReader fr;
+ /** Which state in the automaton accepts all possible suffixes; -1 means auto-prefix terms are not allowed. */
+ private final int sinkState;
+
private BytesRef savedStartTerm;
+ /** True if the term we last returned was the current auto-prefix term, so the next call must first skip the real terms it covers */
+ private boolean useAutoPrefixTerm;
+
// TODO: in some cases we can filter by length? eg
// regexp foo*bar must be at least length 6 bytes
- public IntersectTermsEnum(FieldReader fr, CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
- // if (DEBUG) {
- // System.out.println("\nintEnum.init seg=" + segment + " commonSuffix=" + brToString(compiled.commonSuffixRef));
- // }
+ public IntersectTermsEnum(FieldReader fr, Automaton automaton, RunAutomaton runAutomaton, BytesRef commonSuffix, BytesRef startTerm, int sinkState) throws IOException {
+ //if (DEBUG) System.out.println("\nintEnum.init seg=" + fr.parent.segment + " commonSuffix=" + commonSuffix);
this.fr = fr;
- runAutomaton = compiled.runAutomaton;
- compiledAutomaton = compiled;
+ this.sinkState = sinkState;
+
+ assert automaton != null;
+ assert runAutomaton != null;
+
+ //if (DEBUG) System.out.println("sinkState=" + sinkState + " AUTOMATON:\n" + automaton.toDot());
+ this.runAutomaton = runAutomaton;
+ this.allowAutoPrefixTerms = sinkState != -1;
+ this.automaton = automaton;
+ this.commonSuffix = commonSuffix;
in = fr.parent.termsIn.clone();
stack = new IntersectTermsEnumFrame[5];
for(int idx=0;idx<stack.length;idx++) {
@@ -154,7 +184,7 @@ final class IntersectTermsEnum extends T
f.fp = f.fpOrig = currentFrame.lastSubFP;
f.prefix = currentFrame.prefix + currentFrame.suffix;
- // if (DEBUG) System.out.println(" pushFrame state=" + state + " prefix=" + f.prefix);
+ //if (DEBUG) System.out.println(" pushFrame state=" + state + " prefix=" + f.prefix);
f.setState(state);
// Walk the arc through the index -- we only
@@ -233,7 +263,7 @@ final class IntersectTermsEnum extends T
// arbitrary seekExact/Ceil. Note that this is a
// seekFloor!
private void seekToStartTerm(BytesRef target) throws IOException {
- //if (DEBUG) System.out.println("seek to startTerm=" + target.utf8ToString());
+ //if (DEBUG) System.out.println("seek to startTerm=" + target.utf8ToString() + " length=" + target.length);
assert currentFrame.ord == 0;
if (term.length < target.length) {
term.bytes = ArrayUtil.grow(term.bytes, target.length);
@@ -242,23 +272,29 @@ final class IntersectTermsEnum extends T
assert arc == currentFrame.arc;
for(int idx=0;idx<=target.length;idx++) {
+ //if (DEBUG) System.out.println("cycle idx=" + idx);
while (true) {
+ final int savNextEnt = currentFrame.nextEnt;
final int savePos = currentFrame.suffixesReader.getPosition();
final int saveStartBytePos = currentFrame.startBytePos;
final int saveSuffix = currentFrame.suffix;
final long saveLastSubFP = currentFrame.lastSubFP;
final int saveTermBlockOrd = currentFrame.termState.termBlockOrd;
+ final boolean saveIsAutoPrefixTerm = currentFrame.isAutoPrefixTerm;
+
+ //if (DEBUG) System.out.println(" cycle isAutoPrefix=" + saveIsAutoPrefixTerm + " ent=" + currentFrame.nextEnt + " (of " + currentFrame.entCount + ") prefix=" + currentFrame.prefix + " suffix=" + currentFrame.suffix + " firstLabel=" + (currentFrame.suffix == 0 ? "" : (currentFrame.suffixBytes[currentFrame.startBytePos])&0xff));
final boolean isSubBlock = currentFrame.next();
- //if (DEBUG) System.out.println(" cycle ent=" + currentFrame.nextEnt + " (of " + currentFrame.entCount + ") prefix=" + currentFrame.prefix + " suffix=" + currentFrame.suffix + " isBlock=" + isSubBlock + " firstLabel=" + (currentFrame.suffix == 0 ? "" : (currentFrame.suffixBytes[currentFrame.startBytePos])&0xff));
term.length = currentFrame.prefix + currentFrame.suffix;
if (term.bytes.length < term.length) {
term.bytes = ArrayUtil.grow(term.bytes, term.length);
}
System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);
+ //if (DEBUG) System.out.println(" isSubBlock=" + isSubBlock + " term/prefix=" + brToString(term) + " saveIsAutoPrefixTerm=" + saveIsAutoPrefixTerm + " allowAutoPrefixTerms=" + allowAutoPrefixTerms);
+
if (isSubBlock && StringHelper.startsWith(target, term)) {
// Recurse
//if (DEBUG) System.out.println(" recurse!");
@@ -266,9 +302,11 @@ final class IntersectTermsEnum extends T
break;
} else {
final int cmp = term.compareTo(target);
+ //if (DEBUG) System.out.println(" cmp=" + cmp);
if (cmp < 0) {
if (currentFrame.nextEnt == currentFrame.entCount) {
if (!currentFrame.isLastInFloor) {
+ // Advance to next floor block
//if (DEBUG) System.out.println(" load floorBlock");
currentFrame.loadNextFloorBlock();
continue;
@@ -279,19 +317,24 @@ final class IntersectTermsEnum extends T
}
continue;
} else if (cmp == 0) {
+ if (allowAutoPrefixTerms == false && currentFrame.isAutoPrefixTerm) {
+ continue;
+ }
//if (DEBUG) System.out.println(" return term=" + brToString(term));
return;
- } else {
+ } else if (allowAutoPrefixTerms || currentFrame.isAutoPrefixTerm == false) {
// Fallback to prior entry: the semantics of
// this method is that the first call to
// next() will return the term after the
// requested term
- currentFrame.nextEnt--;
+ //if (DEBUG) System.out.println(" fallback prior entry");
+ currentFrame.nextEnt = savNextEnt;
currentFrame.lastSubFP = saveLastSubFP;
currentFrame.startBytePos = saveStartBytePos;
currentFrame.suffix = saveSuffix;
currentFrame.suffixesReader.setPosition(savePos);
currentFrame.termState.termBlockOrd = saveTermBlockOrd;
+ currentFrame.isAutoPrefixTerm = saveIsAutoPrefixTerm;
System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);
term.length = currentFrame.prefix + currentFrame.suffix;
// If the last entry was a block we don't
@@ -310,77 +353,245 @@ final class IntersectTermsEnum extends T
@Override
public BytesRef next() throws IOException {
- // if (DEBUG) {
- // System.out.println("\nintEnum.next seg=" + segment);
- // System.out.println(" frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix);
- // }
+ //if (DEBUG) {
+ // System.out.println("\nintEnum.next seg=" + fr.parent.segment);
+ // System.out.println(" frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " outputPrefix=" + currentFrame.outputPrefix + " trans: " + currentFrame.transition + " useAutoPrefix=" + useAutoPrefixTerm);
+ //}
nextTerm:
- while(true) {
- // Pop finished frames
- while (currentFrame.nextEnt == currentFrame.entCount) {
- if (!currentFrame.isLastInFloor) {
- //if (DEBUG) System.out.println(" next-floor-block");
- currentFrame.loadNextFloorBlock();
- //if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix);
+ while (true) {
+
+ boolean isSubBlock;
+
+ if (useAutoPrefixTerm) {
+
+ assert currentFrame.isAutoPrefixTerm;
+ useAutoPrefixTerm = false;
+ currentFrame.termState.isRealTerm = true;
+
+ //if (DEBUG) System.out.println(" now scan beyond auto-prefix term=" + brToString(term) + " floorSuffixLeadEnd=" + Integer.toHexString(currentFrame.floorSuffixLeadEnd));
+ // If we last returned an auto-prefix term, we must now skip all
+ // actual terms sharing that prefix. At most, that skipping
+ // requires popping one frame, but it can also require simply
+ // scanning ahead within the current frame. This scanning will
+ // skip sub-blocks that contain many terms, which is why the
+ // optimization "works":
+ int floorSuffixLeadEnd = currentFrame.floorSuffixLeadEnd;
+ if (floorSuffixLeadEnd == -1) {
+ // An ordinary prefix, e.g. foo*
+ int prefix = currentFrame.prefix;
+ int suffix = currentFrame.suffix;
+ //if (DEBUG) System.out.println(" prefix=" + prefix + " suffix=" + suffix);
+ if (suffix == 0) {
+ //if (DEBUG) System.out.println(" pop frame & nextTerm");
+
+ // Easy case: the prefix term's suffix is the empty string,
+ // meaning the prefix corresponds to all terms in the
+ // current block, so we just pop this entire block:
+ if (currentFrame.ord == 0) {
+ //if (DEBUG) System.out.println(" return null");
+ return null;
+ }
+ currentFrame = stack[currentFrame.ord-1];
+ continue nextTerm;
+ } else {
+
+ // Just next() until we hit an entry that doesn't share this
+ // prefix. The first next should be a sub-block sharing the
+ // same prefix, because if there are enough terms matching a
+ // given prefix to warrant an auto-prefix term, then there
+ // must also be enough to make a sub-block (assuming
+ // minItemsInPrefix > minItemsInBlock):
+ scanPrefix:
+ while (true) {
+ //if (DEBUG) System.out.println(" scan next");
+ if (currentFrame.nextEnt == currentFrame.entCount) {
+ if (currentFrame.isLastInFloor == false) {
+ currentFrame.loadNextFloorBlock();
+ } else if (currentFrame.ord == 0) {
+ //if (DEBUG) System.out.println(" return null0");
+ return null;
+ } else {
+ // Pop frame, which also means we've moved beyond this
+ // auto-prefix term:
+ //if (DEBUG) System.out.println(" pop; nextTerm");
+ currentFrame = stack[currentFrame.ord-1];
+ continue nextTerm;
+ }
+ }
+ isSubBlock = currentFrame.next();
+ //if (DEBUG) {
+ // BytesRef suffixBytes = new BytesRef(currentFrame.suffix);
+ // System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, suffixBytes.bytes, 0, currentFrame.suffix);
+ // suffixBytes.length = currentFrame.suffix;
+ // System.out.println(" currentFrame.suffix=" + brToString(suffixBytes));
+ //}
+ for(int i=0;i<suffix;i++) {
+ if (term.bytes[prefix+i] != currentFrame.suffixBytes[currentFrame.startBytePos+i]) {
+ //if (DEBUG) System.out.println(" done; now stop scan");
+ break scanPrefix;
+ }
+ }
+ }
+ }
} else {
- //if (DEBUG) System.out.println(" pop frame");
- if (currentFrame.ord == 0) {
- return null;
- }
- final long lastFP = currentFrame.fpOrig;
- currentFrame = stack[currentFrame.ord-1];
- assert currentFrame.lastSubFP == lastFP;
- //if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix);
+ // Floor'd auto-prefix term; in this case we must skip all
+ // terms e.g. matching foo[a-m]*. We are currently "on" fooa,
+ // which the automaton accepted (fooa* through foom*), and
+ // floorSuffixLeadEnd is m, so we must now scan to foon:
+ int prefix = currentFrame.prefix;
+ int suffix = currentFrame.suffix;
+
+ if (currentFrame.floorSuffixLeadStart == -1) {
+ suffix++;
+ }
+
+ //if (DEBUG) System.out.println(" prefix=" + prefix + " suffix=" + suffix);
+
+ if (suffix == 0) {
+
+ //if (DEBUG) System.out.println(" pop frame");
+
+ // This means current frame is fooa*, so we have to first
+ // pop the current frame, then scan in parent frame:
+ if (currentFrame.ord == 0) {
+ //if (DEBUG) System.out.println(" return null");
+ return null;
+ }
+ currentFrame = stack[currentFrame.ord-1];
+
+ // Current (parent) frame is now foo*, so now we just scan
+ // until the lead suffix byte is > floorSuffixLeadEnd
+ //assert currentFrame.prefix == prefix-1;
+ //prefix = currentFrame.prefix;
+
+ // After the pop, the parent block's prefix is not necessarily prefix-1: e.g. when we are in block 417*,
+ // on its first term (the floor'd prefix term 41[7-9]), we may pop to block 4*. So re-read it from the frame:
+ prefix = currentFrame.prefix;
+
+ suffix = term.length - currentFrame.prefix;
+ } else {
+ // No need to pop; just scan in currentFrame:
+ }
+
+ //if (DEBUG) System.out.println(" start scan: prefix=" + prefix + " suffix=" + suffix);
+
+ // Now we scan until the lead suffix byte is > floorSuffixLeadEnd
+ scanFloor:
+ while (true) {
+ //if (DEBUG) System.out.println(" scan next");
+ if (currentFrame.nextEnt == currentFrame.entCount) {
+ if (currentFrame.isLastInFloor == false) {
+ //if (DEBUG) System.out.println(" next floor block");
+ currentFrame.loadNextFloorBlock();
+ } else if (currentFrame.ord == 0) {
+ //if (DEBUG) System.out.println(" return null");
+ return null;
+ } else {
+ // Pop frame, which also means we've moved beyond this
+ // auto-prefix term:
+ currentFrame = stack[currentFrame.ord-1];
+ //if (DEBUG) System.out.println(" pop, now curFrame.prefix=" + currentFrame.prefix);
+ continue nextTerm;
+ }
+ }
+ isSubBlock = currentFrame.next();
+ //if (DEBUG) {
+ // BytesRef suffixBytes = new BytesRef(currentFrame.suffix);
+ // System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, suffixBytes.bytes, 0, currentFrame.suffix);
+ // suffixBytes.length = currentFrame.suffix;
+ // System.out.println(" currentFrame.suffix=" + brToString(suffixBytes));
+ //}
+ for(int i=0;i<suffix-1;i++) {
+ if (term.bytes[prefix+i] != currentFrame.suffixBytes[currentFrame.startBytePos+i]) {
+ //if (DEBUG) System.out.println(" done; now stop scan");
+ break scanFloor;
+ }
+ }
+ //if (DEBUG) {
+ // if (currentFrame.suffix >= suffix) {
+ // System.out.println(" cmp label=" + Integer.toHexString(currentFrame.suffixBytes[currentFrame.startBytePos+suffix-1]) + " vs " + floorSuffixLeadEnd);
+ // }
+ //}
+ if (currentFrame.suffix >= suffix && (currentFrame.suffixBytes[currentFrame.startBytePos+suffix-1]&0xff) > floorSuffixLeadEnd) {
+ // Done scanning: we are now on the first term after all
+ // terms matched by this auto-prefix term
+ //if (DEBUG) System.out.println(" done; now stop scan");
+ break;
+ }
+ }
+ }
+ } else {
+ // Pop finished frames
+ while (currentFrame.nextEnt == currentFrame.entCount) {
+ if (!currentFrame.isLastInFloor) {
+ //if (DEBUG) System.out.println(" next-floor-block: trans: " + currentFrame.transition);
+ // Advance to next floor block
+ currentFrame.loadNextFloorBlock();
+ //if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " outputPrefix=" + currentFrame.outputPrefix);
+ break;
+ } else {
+ //if (DEBUG) System.out.println(" pop frame");
+ if (currentFrame.ord == 0) {
+ //if (DEBUG) System.out.println(" return null");
+ return null;
+ }
+ final long lastFP = currentFrame.fpOrig;
+ currentFrame = stack[currentFrame.ord-1];
+ assert currentFrame.lastSubFP == lastFP;
+ //if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " outputPrefix=" + currentFrame.outputPrefix);
+ }
}
+
+ isSubBlock = currentFrame.next();
}
- final boolean isSubBlock = currentFrame.next();
- // if (DEBUG) {
- // final BytesRef suffixRef = new BytesRef();
- // suffixRef.bytes = currentFrame.suffixBytes;
- // suffixRef.offset = currentFrame.startBytePos;
- // suffixRef.length = currentFrame.suffix;
- // System.out.println(" " + (isSubBlock ? "sub-block" : "term") + " " + currentFrame.nextEnt + " (of " + currentFrame.entCount + ") suffix=" + brToString(suffixRef));
- // }
+ //if (DEBUG) {
+ // final BytesRef suffixRef = new BytesRef();
+ // suffixRef.bytes = currentFrame.suffixBytes;
+ // suffixRef.offset = currentFrame.startBytePos;
+ // suffixRef.length = currentFrame.suffix;
+ // System.out.println(" " + (isSubBlock ? "sub-block" : "term") + " " + currentFrame.nextEnt + " (of " + currentFrame.entCount + ") suffix=" + brToString(suffixRef));
+ //}
if (currentFrame.suffix != 0) {
+ // Advance where we are in the automaton to match what terms
+ // dict next'd to:
final int label = currentFrame.suffixBytes[currentFrame.startBytePos] & 0xff;
+ //if (DEBUG) System.out.println(" move automaton to label=" + label + " vs curMax=" + currentFrame.curTransitionMax);
while (label > currentFrame.curTransitionMax) {
if (currentFrame.transitionIndex >= currentFrame.transitionCount-1) {
- // Stop processing this frame -- no further
- // matches are possible because we've moved
- // beyond what the max transition will allow
- //if (DEBUG) System.out.println(" break: trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]));
-
- // sneaky! forces a pop above
- currentFrame.isLastInFloor = true;
- currentFrame.nextEnt = currentFrame.entCount;
+ // Pop this frame: no further matches are possible because
+ // we've moved beyond what the max transition will allow
+ //if (DEBUG) System.out.println(" break: trans");
+ if (currentFrame.ord == 0) {
+ //if (DEBUG) System.out.println(" return null");
+ return null;
+ }
+ currentFrame = stack[currentFrame.ord-1];
continue nextTerm;
}
currentFrame.transitionIndex++;
- compiledAutomaton.automaton.getNextTransition(currentFrame.transition);
+ automaton.getNextTransition(currentFrame.transition);
currentFrame.curTransitionMax = currentFrame.transition.max;
- //if (DEBUG) System.out.println(" next trans=" + currentFrame.transitions[currentFrame.transitionIndex]);
+ //if (DEBUG) System.out.println(" next trans");
}
}
// First test the common suffix, if set:
- if (compiledAutomaton.commonSuffixRef != null && !isSubBlock) {
+ if (commonSuffix != null && !isSubBlock) {
final int termLen = currentFrame.prefix + currentFrame.suffix;
- if (termLen < compiledAutomaton.commonSuffixRef.length) {
+ if (termLen < commonSuffix.length) {
// No match
- // if (DEBUG) {
- // System.out.println(" skip: common suffix length");
- // }
+ //if (DEBUG) System.out.println(" skip: common suffix length");
continue nextTerm;
}
final byte[] suffixBytes = currentFrame.suffixBytes;
- final byte[] commonSuffixBytes = compiledAutomaton.commonSuffixRef.bytes;
+ final byte[] commonSuffixBytes = commonSuffix.bytes;
- final int lenInPrefix = compiledAutomaton.commonSuffixRef.length - currentFrame.suffix;
- assert compiledAutomaton.commonSuffixRef.offset == 0;
+ final int lenInPrefix = commonSuffix.length - currentFrame.suffix;
+ assert commonSuffix.offset == 0;
int suffixBytesPos;
int commonSuffixBytesPos = 0;
@@ -394,24 +605,20 @@ final class IntersectTermsEnum extends T
final int termBytesPosEnd = currentFrame.prefix;
while (termBytesPos < termBytesPosEnd) {
if (termBytes[termBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
- // if (DEBUG) {
- // System.out.println(" skip: common suffix mismatch (in prefix)");
- // }
+ //if (DEBUG) System.out.println(" skip: common suffix mismatch (in prefix)");
continue nextTerm;
}
}
suffixBytesPos = currentFrame.startBytePos;
} else {
- suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - compiledAutomaton.commonSuffixRef.length;
+ suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - commonSuffix.length;
}
// Test overlapping suffix part:
- final int commonSuffixBytesPosEnd = compiledAutomaton.commonSuffixRef.length;
+ final int commonSuffixBytesPosEnd = commonSuffix.length;
while (commonSuffixBytesPos < commonSuffixBytesPosEnd) {
if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
- // if (DEBUG) {
- // System.out.println(" skip: common suffix mismatch");
- // }
+ //if (DEBUG) System.out.println(" skip: common suffix mismatch");
continue nextTerm;
}
}
@@ -423,10 +630,17 @@ final class IntersectTermsEnum extends T
// "temporarily" accepted, we just blindly .next()
// until the limit
- // See if the term prefix matches the automaton:
+ // TODO: for first iter of this loop can't we just use the current trans? we already advanced it and confirmed it matches lead
+ // byte of the suffix
+
+ // See if the term suffix matches the automaton:
int state = currentFrame.state;
+ int lastState = currentFrame.lastState;
+ //if (DEBUG) System.out.println(" a state=" + state + " curFrame.suffix.len=" + currentFrame.suffix + " curFrame.prefix=" + currentFrame.prefix);
for (int idx=0;idx<currentFrame.suffix;idx++) {
- state = runAutomaton.step(state, currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff);
+ lastState = state;
+ //if (DEBUG) System.out.println(" step label=" + (char) (currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff));
+ state = runAutomaton.step(state, currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff);
if (state == -1) {
// No match
//System.out.println(" no s=" + state);
@@ -436,16 +650,59 @@ final class IntersectTermsEnum extends T
}
}
+ //if (DEBUG) System.out.println(" after suffix: state=" + state + " lastState=" + lastState);
+
if (isSubBlock) {
// Match! Recurse:
//if (DEBUG) System.out.println(" sub-block match to state=" + state + "; recurse fp=" + currentFrame.lastSubFP);
copyTerm();
currentFrame = pushFrame(state);
- //if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix);
+ currentFrame.lastState = lastState;
+ //xif (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix);
+ } else if (currentFrame.isAutoPrefixTerm) {
+ // We are on an auto-prefix term, meaning this term was compiled
+ // at indexing time, matching all terms sharing this prefix (or,
+ // a floor'd subset of them if that count was too high). A
+ // prefix term represents a range of terms, so we now need to
+ // test whether, from the current state in the automaton, it
+ // accepts all terms in that range. As long as it does, we can
+ // use this term and then later skip ahead past all terms in
+ // this range:
+ if (allowAutoPrefixTerms) {
+
+ if (currentFrame.floorSuffixLeadEnd == -1) {
+ // Simple prefix case
+ useAutoPrefixTerm = state == sinkState;
+ } else {
+ if (currentFrame.floorSuffixLeadStart == -1) {
+ // Must also accept the empty string in this case
+ if (automaton.isAccept(state)) {
+ //if (DEBUG) System.out.println(" state is accept");
+ useAutoPrefixTerm = acceptsSuffixRange(state, 0, currentFrame.floorSuffixLeadEnd);
+ }
+ } else {
+ useAutoPrefixTerm = acceptsSuffixRange(lastState, currentFrame.floorSuffixLeadStart, currentFrame.floorSuffixLeadEnd);
+ }
+ }
+
+ //if (DEBUG) System.out.println(" useAutoPrefixTerm=" + useAutoPrefixTerm);
+
+ if (useAutoPrefixTerm) {
+ copyTerm();
+ currentFrame.termState.isRealTerm = false;
+ //if (DEBUG) System.out.println(" return auto prefix term: " + brToString(term));
+ return term;
+ } else {
+ // We move onto the next term
+ }
+ } else {
+ // We are not allowed to use auto-prefix terms, so we just skip it
+ }
} else if (runAutomaton.isAccept(state)) {
copyTerm();
- //if (DEBUG) System.out.println(" term match to state=" + state + "; return term=" + brToString(term));
+ //if (DEBUG) System.out.println(" term match to state=" + state);
assert savedStartTerm == null || term.compareTo(savedStartTerm) > 0: "saveStartTerm=" + savedStartTerm.utf8ToString() + " term=" + term.utf8ToString();
+ //if (DEBUG) System.out.println(" return term=" + brToString(term));
return term;
} else {
//System.out.println(" no s=" + state);
@@ -453,6 +710,41 @@ final class IntersectTermsEnum extends T
}
}
+ private final Transition transition = new Transition();
+
+ /** Returns true if, from this state, the automaton accepts any suffix
+ * starting with a label between start and end, inclusive. We just look
+ * for a single transition that covers the whole range and leads to the sink state. */
+ private boolean acceptsSuffixRange(int state, int start, int end) {
+
+ //xif (DEBUG) System.out.println(" acceptsSuffixRange state=" + state + " start=" + start + " end=" + end);
+
+ int count = automaton.initTransition(state, transition);
+ //xif (DEBUG) System.out.println(" transCount=" + count);
+ //xif (DEBUG) System.out.println(" trans=" + transition);
+ for(int i=0;i<count;i++) {
+ automaton.getNextTransition(transition);
+ if (start >= transition.min && end <= transition.max && transition.dest == sinkState) {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ // for debugging
+ @SuppressWarnings("unused")
+ static String brToString(BytesRef b) {
+ try {
+ return b.utf8ToString() + " " + b;
+ } catch (Throwable t) {
+ // If BytesRef isn't actually UTF8, or it's eg a
+ // prefix of UTF8 that ends mid-unicode-char, we
+ // fallback to hex:
+ return b.toString();
+ }
+ }
+
private void copyTerm() {
//System.out.println(" copyTerm cur.prefix=" + currentFrame.prefix + " cur.suffix=" + currentFrame.suffix + " first=" + (char) currentFrame.suffixBytes[currentFrame.startBytePos]);
final int len = currentFrame.prefix + currentFrame.suffix;
Modified: lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java?rev=1635002&r1=1635001&r2=1635002&view=diff
==============================================================================
--- lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java (original)
+++ lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java Tue Oct 28 22:52:49 2014
@@ -35,9 +35,14 @@ final class IntersectTermsEnumFrame {
long fpEnd;
long lastSubFP;
+ // private static boolean DEBUG = IntersectTermsEnum.DEBUG;
+
// State in automaton
int state;
+ // State just before the last label
+ int lastState;
+
int metaDataUpto;
byte[] suffixBytes = new byte[128];
@@ -73,6 +78,8 @@ final class IntersectTermsEnumFrame {
int transitionIndex;
int transitionCount;
+ final boolean versionAutoPrefix;
+
FST.Arc<BytesRef> arc;
final BlockTermState termState;
@@ -89,6 +96,17 @@ final class IntersectTermsEnumFrame {
int startBytePos;
int suffix;
+ // When we are on an auto-prefix term this is the starting lead byte
+ // of the suffix (e.g. 'a' for the foo[a-m]* case):
+ int floorSuffixLeadStart;
+
+ // When we are on an auto-prefix term this is the ending lead byte
+ // of the suffix (e.g. 'm' for the foo[a-m]* case):
+ int floorSuffixLeadEnd;
+
+ // True if the term we are currently on is an auto-prefix term:
+ boolean isAutoPrefixTerm;
+
private final IntersectTermsEnum ite;
public IntersectTermsEnumFrame(IntersectTermsEnum ite, int ord) throws IOException {
@@ -97,35 +115,39 @@ final class IntersectTermsEnumFrame {
this.termState = ite.fr.parent.postingsReader.newTermState();
this.termState.totalTermFreq = -1;
this.longs = new long[ite.fr.longsSize];
+ this.versionAutoPrefix = ite.fr.parent.version >= BlockTreeTermsReader.VERSION_AUTO_PREFIX_TERMS;
}
void loadNextFloorBlock() throws IOException {
assert numFollowFloorBlocks > 0;
- //if (DEBUG) System.out.println(" loadNextFoorBlock trans=" + transitions[transitionIndex]);
+ //if (DEBUG) System.out.println(" loadNextFloorBlock transition.min=" + transition.min);
do {
fp = fpOrig + (floorDataReader.readVLong() >>> 1);
numFollowFloorBlocks--;
- // if (DEBUG) System.out.println(" skip floor block2! nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[transitionIndex].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks);
+ //if (DEBUG) System.out.println(" skip floor block2! nextFloorLabel=" + (char) nextFloorLabel + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks);
if (numFollowFloorBlocks != 0) {
nextFloorLabel = floorDataReader.readByte() & 0xff;
} else {
nextFloorLabel = 256;
}
- // if (DEBUG) System.out.println(" nextFloorLabel=" + (char) nextFloorLabel);
+ //if (DEBUG) System.out.println(" nextFloorLabel=" + (char) nextFloorLabel);
} while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min);
+ //if (DEBUG) System.out.println(" done loadNextFloorBlock");
+
load(null);
}
public void setState(int state) {
this.state = state;
transitionIndex = 0;
- transitionCount = ite.compiledAutomaton.automaton.getNumTransitions(state);
+ transitionCount = ite.automaton.getNumTransitions(state);
if (transitionCount != 0) {
- ite.compiledAutomaton.automaton.initTransition(state, transition);
- ite.compiledAutomaton.automaton.getNextTransition(transition);
+ ite.automaton.initTransition(state, transition);
+ ite.automaton.getNextTransition(transition);
curTransitionMax = transition.max;
+ //if (DEBUG) System.out.println(" after setState state=" + state + " trans: " + transition + " transCount=" + transitionCount);
} else {
curTransitionMax = -1;
}
@@ -133,7 +155,7 @@ final class IntersectTermsEnumFrame {
void load(BytesRef frameIndexData) throws IOException {
- // if (DEBUG) System.out.println(" load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? transitions[0] : "n/a" + " state=" + state));
+ //xif (DEBUG) System.out.println(" load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? transitions[0] : "n/a" + " state=" + state));
if (frameIndexData != null && transitionCount != 0) {
// Floor frame
@@ -148,7 +170,7 @@ final class IntersectTermsEnumFrame {
if ((code & BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
numFollowFloorBlocks = floorDataReader.readVInt();
nextFloorLabel = floorDataReader.readByte() & 0xff;
- // if (DEBUG) System.out.println(" numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + nextFloorLabel);
+ //if (DEBUG) System.out.println(" numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + nextFloorLabel);
// If current state is accept, we must process
// first block in case it has empty suffix:
@@ -158,7 +180,7 @@ final class IntersectTermsEnumFrame {
while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min) {
fp = fpOrig + (floorDataReader.readVLong() >>> 1);
numFollowFloorBlocks--;
- // if (DEBUG) System.out.println(" skip floor block! nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[0].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks);
+ //xif (DEBUG) System.out.println(" skip floor block! nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[0].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks);
if (numFollowFloorBlocks != 0) {
nextFloorLabel = floorDataReader.readByte() & 0xff;
} else {
@@ -179,7 +201,7 @@ final class IntersectTermsEnumFrame {
code = ite.in.readVInt();
isLeafBlock = (code & 1) != 0;
int numBytes = code >>> 1;
- // if (DEBUG) System.out.println(" entCount=" + entCount + " lastInFloor?=" + isLastInFloor + " leafBlock?=" + isLeafBlock + " numSuffixBytes=" + numBytes);
+ //if (DEBUG) System.out.println(" entCount=" + entCount + " lastInFloor?=" + isLastInFloor + " leafBlock?=" + isLeafBlock + " numSuffixBytes=" + numBytes);
if (suffixBytes.length < numBytes) {
suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
@@ -214,41 +236,102 @@ final class IntersectTermsEnumFrame {
// written one after another -- tail recurse:
fpEnd = ite.in.getFilePointer();
}
+
+ // Necessary in case this ord previously was an auto-prefix
+ // term but now we recurse to a new leaf block
+ isAutoPrefixTerm = false;
}
// TODO: maybe add scanToLabel; should give perf boost
+ // Decodes next entry; returns true if it's a sub-block
public boolean next() {
- return isLeafBlock ? nextLeaf() : nextNonLeaf();
+ if (isLeafBlock) {
+ nextLeaf();
+ return false;
+ } else {
+ return nextNonLeaf();
+ }
}
- // Decodes next entry; returns true if it's a sub-block
- public boolean nextLeaf() {
- //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
+ public void nextLeaf() {
+ //if (DEBUG) System.out.println(" frame.nextLeaf ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
suffix = suffixesReader.readVInt();
startBytePos = suffixesReader.getPosition();
suffixesReader.skipBytes(suffix);
- return false;
}
public boolean nextNonLeaf() {
- //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
+ //if (DEBUG) System.out.println(" frame.nextNonLeaf ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount + " versionAutoPrefix=" + versionAutoPrefix + " fp=" + suffixesReader.getPosition());
assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
final int code = suffixesReader.readVInt();
- suffix = code >>> 1;
- startBytePos = suffixesReader.getPosition();
- suffixesReader.skipBytes(suffix);
- if ((code & 1) == 0) {
- // A normal term
- termState.termBlockOrd++;
- return false;
+ if (versionAutoPrefix == false) {
+ suffix = code >>> 1;
+ startBytePos = suffixesReader.getPosition();
+ suffixesReader.skipBytes(suffix);
+ if ((code & 1) == 0) {
+ // A normal term
+ termState.termBlockOrd++;
+ return false;
+ } else {
+ // A sub-block; make sub-FP absolute:
+ lastSubFP = fp - suffixesReader.readVLong();
+ return true;
+ }
} else {
- // A sub-block; make sub-FP absolute:
- lastSubFP = fp - suffixesReader.readVLong();
- return true;
+ suffix = code >>> 2;
+ startBytePos = suffixesReader.getPosition();
+ suffixesReader.skipBytes(suffix);
+ switch (code & 3) {
+ case 0:
+ // A normal term
+ //if (DEBUG) System.out.println(" ret: term");
+ isAutoPrefixTerm = false;
+ termState.termBlockOrd++;
+ return false;
+ case 1:
+ // A sub-block; make sub-FP absolute:
+ isAutoPrefixTerm = false;
+ lastSubFP = fp - suffixesReader.readVLong();
+ //if (DEBUG) System.out.println(" ret: sub-block");
+ return true;
+ case 2:
+ // A normal prefix term, suffix leads with empty string
+ floorSuffixLeadStart = -1;
+ termState.termBlockOrd++;
+ floorSuffixLeadEnd = suffixesReader.readByte() & 0xff;
+ if (floorSuffixLeadEnd == 0xff) {
+ floorSuffixLeadEnd = -1;
+ //System.out.println(" fill in -1");
+ }
+ //if (DEBUG) System.out.println(" ret: floor prefix term: start=-1 end=" + floorSuffixLeadEnd);
+ isAutoPrefixTerm = true;
+ return false;
+ case 3:
+ // A floor'd prefix term, suffix leads with real byte
+ if (suffix == 0) {
+ // TODO: this is messy, but necessary because we are an auto-prefix term, but our suffix is the empty string here, so we have to
+ // look at the parent block to get the lead suffix byte:
+ assert ord > 0;
+ IntersectTermsEnumFrame parent = ite.stack[ord-1];
+ floorSuffixLeadStart = parent.suffixBytes[parent.startBytePos+parent.suffix-1] & 0xff;
+ //if (DEBUG) System.out.println(" peek-parent: suffix=" + floorSuffixLeadStart);
+ } else {
+ floorSuffixLeadStart = suffixBytes[startBytePos+suffix-1] & 0xff;
+ }
+ termState.termBlockOrd++;
+ isAutoPrefixTerm = true;
+ floorSuffixLeadEnd = suffixesReader.readByte() & 0xff;
+ //if (DEBUG) System.out.println(" ret: floor prefix term start=" + floorSuffixLeadStart + " end=" + floorSuffixLeadEnd);
+ return false;
+ default:
+ // Silly javac:
+ assert false;
+ return false;
+ }
}
}
@@ -277,10 +360,10 @@ final class IntersectTermsEnumFrame {
// stats
termState.docFreq = statsReader.readVInt();
- //if (DEBUG) System.out.println(" dF=" + state.docFreq);
+ //xif (DEBUG) System.out.println(" dF=" + state.docFreq);
if (ite.fr.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
termState.totalTermFreq = termState.docFreq + statsReader.readVLong();
- //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
+ //xif (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
}
// metadata
for (int i = 0; i < ite.fr.longsSize; i++) {
Modified: lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnum.java?rev=1635002&r1=1635001&r2=1635002&view=diff
==============================================================================
--- lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnum.java (original)
+++ lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnum.java Tue Oct 28 22:52:49 2014
@@ -36,7 +36,9 @@ import org.apache.lucene.util.RamUsageEs
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util;
-/** Iterates through terms in this field */
+/** Iterates through terms in this field. This implementation skips
+ * any auto-prefix terms it encounters. */
+
final class SegmentTermsEnum extends TermsEnum {
// Lazy init:
@@ -50,7 +52,7 @@ final class SegmentTermsEnum extends Ter
private int targetBeforeCurrentLength;
- // static boolean DEBUG = false;
+ //static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
private final ByteArrayDataInput scratchReader = new ByteArrayDataInput();
@@ -121,6 +123,8 @@ final class SegmentTermsEnum extends Ter
* computing aggregate statistics. */
public Stats computeBlockStats() throws IOException {
+ // TODO: add total auto-prefix term count
+
Stats stats = new Stats(fr.parent.segment, fr.fieldInfo.name);
if (fr.index != null) {
stats.indexNodeCount = fr.index.getNodeCount();
@@ -154,8 +158,10 @@ final class SegmentTermsEnum extends Ter
while (currentFrame.nextEnt == currentFrame.entCount) {
stats.endBlock(currentFrame);
if (!currentFrame.isLastInFloor) {
+ // Advance to next floor block
currentFrame.loadNextFloorBlock();
stats.startBlock(currentFrame, true);
+ break;
} else {
if (currentFrame.ord == 0) {
break allTerms;
@@ -177,8 +183,6 @@ final class SegmentTermsEnum extends Ter
// This is a "next" frame -- even if it's
// floor'd we must pretend it isn't so we don't
// try to scan to the right floor frame:
- currentFrame.isFloor = false;
- //currentFrame.hasTerms = true;
currentFrame.loadBlock();
stats.startBlock(currentFrame, !currentFrame.isLastInFloor);
} else {
@@ -296,6 +300,7 @@ final class SegmentTermsEnum extends Ter
return true;
}
+ /*
// for debugging
@SuppressWarnings("unused")
static String brToString(BytesRef b) {
@@ -309,8 +314,15 @@ final class SegmentTermsEnum extends Ter
}
}
+ // for debugging
+ @SuppressWarnings("unused")
+ static String brToString(BytesRefBuilder b) {
+ return brToString(b.get());
+ }
+ */
+
@Override
- public boolean seekExact(final BytesRef target) throws IOException {
+ public boolean seekExact(BytesRef target) throws IOException {
if (fr.index == null) {
throw new IllegalStateException("terms index was not loaded");
@@ -567,7 +579,8 @@ final class SegmentTermsEnum extends Ter
}
@Override
- public SeekStatus seekCeil(final BytesRef target) throws IOException {
+ public SeekStatus seekCeil(BytesRef target) throws IOException {
+
if (fr.index == null) {
throw new IllegalStateException("terms index was not loaded");
}
@@ -577,7 +590,7 @@ final class SegmentTermsEnum extends Ter
assert clearEOF();
// if (DEBUG) {
- // System.out.println("\nBTTR.seekCeil seg=" + fr.parent.segment + " target=" + fr.fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix= " + validIndexPrefix);
+ // System.out.println("\nBTTR.seekCeil seg=" + fr.parent.segment + " target=" + fr.fieldInfo.name + ":" + brToString(target) + " " + target + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix= " + validIndexPrefix);
// printSeekState(System.out);
// }
@@ -619,7 +632,7 @@ final class SegmentTermsEnum extends Ter
while (targetUpto < targetLimit) {
cmp = (term.byteAt(targetUpto)&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
//if (DEBUG) {
- //System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output);
+ //System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.byteAt(targetUpto)) + ")" + " arc.output=" + arc.output + " output=" + output);
//}
if (cmp != 0) {
break;
@@ -649,7 +662,7 @@ final class SegmentTermsEnum extends Ter
while (targetUpto < targetLimit2) {
cmp = (term.byteAt(targetUpto)&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
//if (DEBUG) {
- //System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")");
+ //System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.byteAt(targetUpto)) + ")");
//}
if (cmp != 0) {
break;
@@ -735,7 +748,7 @@ final class SegmentTermsEnum extends Ter
// Index is exhausted
// if (DEBUG) {
- // System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel));
+ // System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + targetLabel);
// }
validIndexPrefix = currentFrame.prefix;
@@ -745,6 +758,7 @@ final class SegmentTermsEnum extends Ter
currentFrame.loadBlock();
+ //if (DEBUG) System.out.println(" now scanToTerm");
final SeekStatus result = currentFrame.scanToTerm(target, false);
if (result == SeekStatus.END) {
term.copyBytes(target);
@@ -752,7 +766,7 @@ final class SegmentTermsEnum extends Ter
if (next() != null) {
//if (DEBUG) {
- //System.out.println(" return NOT_FOUND term=" + brToString(term) + " " + term);
+ //System.out.println(" return NOT_FOUND term=" + brToString(term));
//}
return SeekStatus.NOT_FOUND;
} else {
@@ -763,7 +777,7 @@ final class SegmentTermsEnum extends Ter
}
} else {
//if (DEBUG) {
- //System.out.println(" return " + result + " term=" + brToString(term) + " " + term);
+ //System.out.println(" return " + result + " term=" + brToString(term));
//}
return result;
}
@@ -778,7 +792,7 @@ final class SegmentTermsEnum extends Ter
}
//if (DEBUG) {
- //System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
+ //System.out.println(" index: follow label=" + (target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
//}
targetUpto++;
@@ -804,7 +818,7 @@ final class SegmentTermsEnum extends Ter
termExists = false;
if (next() != null) {
//if (DEBUG) {
- //System.out.println(" return NOT_FOUND term=" + term.utf8ToString() + " " + term);
+ //System.out.println(" return NOT_FOUND term=" + term.get().utf8ToString() + " " + term);
//}
return SeekStatus.NOT_FOUND;
} else {
@@ -908,7 +922,9 @@ final class SegmentTermsEnum extends Ter
// Pop finished blocks
while (currentFrame.nextEnt == currentFrame.entCount) {
if (!currentFrame.isLastInFloor) {
+ // Advance to next floor block
currentFrame.loadNextFloorBlock();
+ break;
} else {
//if (DEBUG) System.out.println(" pop frame");
if (currentFrame.ord == 0) {
@@ -948,11 +964,9 @@ final class SegmentTermsEnum extends Ter
// This is a "next" frame -- even if it's
// floor'd we must pretend it isn't so we don't
// try to scan to the right floor frame:
- currentFrame.isFloor = false;
- //currentFrame.hasTerms = true;
currentFrame.loadBlock();
} else {
- //if (DEBUG) System.out.println(" return term=" + term.utf8ToString() + " " + term + " currentFrame.ord=" + currentFrame.ord);
+ //if (DEBUG) System.out.println(" return term=" + brToString(term) + " currentFrame.ord=" + currentFrame.ord);
return term.get();
}
}
Modified: lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java?rev=1635002&r1=1635001&r2=1635002&view=diff
==============================================================================
--- lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java (original)
+++ lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java Tue Oct 28 22:52:49 2014
@@ -37,6 +37,10 @@ final class SegmentTermsEnumFrame {
FST.Arc<BytesRef> arc;
+ final boolean versionAutoPrefix;
+
+ //static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
+
// File pointer where this block was loaded from
long fp;
long fpOrig;
@@ -96,6 +100,7 @@ final class SegmentTermsEnumFrame {
this.state = ste.fr.parent.postingsReader.newTermState();
this.state.totalTermFreq = -1;
this.longs = new long[ste.fr.longsSize];
+ this.versionAutoPrefix = ste.fr.parent.version >= BlockTreeTermsReader.VERSION_AUTO_PREFIX_TERMS;
}
public void setFloorData(ByteArrayDataInput in, BytesRef source) {
@@ -262,12 +267,17 @@ final class SegmentTermsEnumFrame {
*/
}
- public boolean next() {
- return isLeafBlock ? nextLeaf() : nextNonLeaf();
+ // Decodes next entry; returns true if it's a sub-block
+ public boolean next() throws IOException {
+ if (isLeafBlock) {
+ nextLeaf();
+ return false;
+ } else {
+ return nextNonLeaf();
+ }
}
- // Decodes next entry; returns true if it's a sub-block
- public boolean nextLeaf() {
+ public void nextLeaf() {
//if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
@@ -276,36 +286,73 @@ final class SegmentTermsEnumFrame {
ste.term.setLength(prefix + suffix);
ste.term.grow(ste.term.length());
suffixesReader.readBytes(ste.term.bytes(), prefix, suffix);
- // A normal term
ste.termExists = true;
- return false;
}
- public boolean nextNonLeaf() {
- //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
- assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
- nextEnt++;
- final int code = suffixesReader.readVInt();
- suffix = code >>> 1;
- startBytePos = suffixesReader.getPosition();
- ste.term.setLength(prefix + suffix);
- ste.term.grow(ste.term.length());
- suffixesReader.readBytes(ste.term.bytes(), prefix, suffix);
- if ((code & 1) == 0) {
- // A normal term
- ste.termExists = true;
- subCode = 0;
- state.termBlockOrd++;
- return false;
- } else {
- // A sub-block; make sub-FP absolute:
- ste.termExists = false;
- subCode = suffixesReader.readVLong();
- lastSubFP = fp - subCode;
- //if (DEBUG) {
- //System.out.println(" lastSubFP=" + lastSubFP);
- //}
- return true;
+ public boolean nextNonLeaf() throws IOException {
+ //if (DEBUG) System.out.println(" stef.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + suffixesReader.getPosition());
+ while (true) {
+ if (nextEnt == entCount) {
+ assert arc == null || (isFloor && isLastInFloor == false): "isFloor=" + isFloor + " isLastInFloor=" + isLastInFloor;
+ loadNextFloorBlock();
+ continue;
+ }
+
+ assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
+ nextEnt++;
+ final int code = suffixesReader.readVInt();
+ if (versionAutoPrefix == false) {
+ suffix = code >>> 1;
+ } else {
+ suffix = code >>> 2;
+ }
+ startBytePos = suffixesReader.getPosition();
+ ste.term.setLength(prefix + suffix);
+ ste.term.grow(ste.term.length());
+ suffixesReader.readBytes(ste.term.bytes(), prefix, suffix);
+ if (versionAutoPrefix == false) {
+ if ((code & 1) == 0) {
+ // A normal term
+ ste.termExists = true;
+ subCode = 0;
+ state.termBlockOrd++;
+ return false;
+ } else {
+ // A sub-block; make sub-FP absolute:
+ ste.termExists = false;
+ subCode = suffixesReader.readVLong();
+ lastSubFP = fp - subCode;
+ //if (DEBUG) {
+ //System.out.println(" lastSubFP=" + lastSubFP);
+ //}
+ return true;
+ }
+ } else {
+
+ switch(code & 3) {
+ case 0:
+ // A normal term
+ ste.termExists = true;
+ subCode = 0;
+ state.termBlockOrd++;
+ return false;
+ case 1:
+ // A sub-block; make sub-FP absolute:
+ ste.termExists = false;
+ subCode = suffixesReader.readVLong();
+ lastSubFP = fp - subCode;
+ //if (DEBUG) {
+ //System.out.println(" lastSubFP=" + lastSubFP);
+ //}
+ return true;
+ case 2:
+ case 3:
+ // A prefix term: skip it
+ state.termBlockOrd++;
+ suffixesReader.readByte();
+ continue;
+ }
+ }
}
}
@@ -448,18 +495,38 @@ final class SegmentTermsEnumFrame {
assert nextEnt < entCount;
nextEnt++;
final int code = suffixesReader.readVInt();
- suffixesReader.skipBytes(isLeafBlock ? code : code >>> 1);
- //if (DEBUG) System.out.println(" " + nextEnt + " (of " + entCount + ") ent isSubBlock=" + ((code&1)==1));
- if ((code & 1) != 0) {
- final long subCode = suffixesReader.readVLong();
- //if (DEBUG) System.out.println(" subCode=" + subCode);
- if (targetSubCode == subCode) {
- //if (DEBUG) System.out.println(" match!");
- lastSubFP = subFP;
- return;
+ if (versionAutoPrefix == false) {
+ suffixesReader.skipBytes(code >>> 1);
+ if ((code & 1) != 0) {
+ final long subCode = suffixesReader.readVLong();
+ if (targetSubCode == subCode) {
+ //if (DEBUG) System.out.println(" match!");
+ lastSubFP = subFP;
+ return;
+ }
+ } else {
+ state.termBlockOrd++;
}
} else {
- state.termBlockOrd++;
+ int flag = code & 3;
+ suffixesReader.skipBytes(code >>> 2);
+ //if (DEBUG) System.out.println(" " + nextEnt + " (of " + entCount + ") ent isSubBlock=" + (flag == 1));
+ if (flag == 1) {
+ // Sub-block
+ final long subCode = suffixesReader.readVLong();
+ //if (DEBUG) System.out.println(" subCode=" + subCode);
+ if (targetSubCode == subCode) {
+ //if (DEBUG) System.out.println(" match!");
+ lastSubFP = subFP;
+ return;
+ }
+ } else {
+ state.termBlockOrd++;
+ if (flag == 2 || flag == 3) {
+ // Auto-prefix term (normal or floor'd): skip its floor lead-end byte
+ suffixesReader.readByte();
+ }
+ }
}
}
}
@@ -473,6 +540,21 @@ final class SegmentTermsEnumFrame {
private int suffix;
private long subCode;
+ // for debugging
+ /*
+ @SuppressWarnings("unused")
+ static String brToString(BytesRef b) {
+ try {
+ return b.utf8ToString() + " " + b;
+ } catch (Throwable t) {
+ // If BytesRef isn't actually UTF8, or it's eg a
+ // prefix of UTF8 that ends mid-unicode-char, we
+ // fallback to hex:
+ return b.toString();
+ }
+ }
+ */
+
// Target's prefix matches this block's prefix; we
// scan the entries check if the suffix matches.
public SeekStatus scanToTermLeaf(BytesRef target, boolean exactOnly) throws IOException {
@@ -535,9 +617,6 @@ final class SegmentTermsEnumFrame {
// keep scanning
if (nextEnt == entCount) {
- if (exactOnly) {
- fillTerm();
- }
// We are done scanning this block
break nextTerm;
} else {
@@ -590,7 +669,7 @@ final class SegmentTermsEnumFrame {
// scan the entries check if the suffix matches.
public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws IOException {
- //if (DEBUG) System.out.println(" scanToTermNonLeaf: block fp=" + fp + " prefix=" + prefix + " nextEnt=" + nextEnt + " (of " + entCount + ") target=" + brToString(target) + " term=" + brToString(term));
+ //if (DEBUG) System.out.println(" scanToTermNonLeaf: block fp=" + fp + " prefix=" + prefix + " nextEnt=" + nextEnt + " (of " + entCount + ") target=" + brToString(target) + " term=" + brToString(ste.term.get()));
assert nextEnt != -1;
@@ -605,30 +684,60 @@ final class SegmentTermsEnumFrame {
assert prefixMatches(target);
// Loop over each entry (term or sub-block) in this block:
- //nextTerm: while(nextEnt < entCount) {
- nextTerm: while (true) {
+ nextTerm: while(nextEnt < entCount) {
+
nextEnt++;
final int code = suffixesReader.readVInt();
- suffix = code >>> 1;
- // if (DEBUG) {
- // BytesRef suffixBytesRef = new BytesRef();
- // suffixBytesRef.bytes = suffixBytes;
- // suffixBytesRef.offset = suffixesReader.getPosition();
- // suffixBytesRef.length = suffix;
- // System.out.println(" cycle: " + ((code&1)==1 ? "sub-block" : "term") + " " + (nextEnt-1) + " (of " + entCount + ") suffix=" + brToString(suffixBytesRef));
- // }
+ if (versionAutoPrefix == false) {
+ suffix = code >>> 1;
+ } else {
+ suffix = code >>> 2;
+ }
+
+ //if (DEBUG) {
+ // BytesRef suffixBytesRef = new BytesRef();
+ // suffixBytesRef.bytes = suffixBytes;
+ // suffixBytesRef.offset = suffixesReader.getPosition();
+ // suffixBytesRef.length = suffix;
+ // System.out.println(" cycle: " + ((code&1)==1 ? "sub-block" : "term") + " " + (nextEnt-1) + " (of " + entCount + ") suffix=" + brToString(suffixBytesRef));
+ //}
- ste.termExists = (code & 1) == 0;
final int termLen = prefix + suffix;
startBytePos = suffixesReader.getPosition();
suffixesReader.skipBytes(suffix);
- if (ste.termExists) {
- state.termBlockOrd++;
- subCode = 0;
+ if (versionAutoPrefix == false) {
+ ste.termExists = (code & 1) == 0;
+ if (ste.termExists) {
+ state.termBlockOrd++;
+ subCode = 0;
+ } else {
+ subCode = suffixesReader.readVLong();
+ lastSubFP = fp - subCode;
+ }
} else {
- subCode = suffixesReader.readVLong();
- lastSubFP = fp - subCode;
+ switch (code & 3) {
+ case 0:
+ // Normal term
+ ste.termExists = true;
+ state.termBlockOrd++;
+ subCode = 0;
+ break;
+ case 1:
+ // Sub-block
+ ste.termExists = false;
+ subCode = suffixesReader.readVLong();
+ lastSubFP = fp - subCode;
+ break;
+ case 2:
+ case 3:
+ // Floor prefix term: skip it
+ //if (DEBUG) System.out.println(" skip floor prefix term");
+ suffixesReader.readByte();
+ ste.termExists = false;
+ state.termBlockOrd++;
+ continue;
+ }
}
final int targetLimit = target.offset + (target.length < termLen ? target.length : termLen);
@@ -637,7 +746,7 @@ final class SegmentTermsEnumFrame {
// Loop over bytes in the suffix, comparing to
// the target
int bytePos = startBytePos;
- while(true) {
+ while (true) {
final int cmp;
final boolean stop;
if (targetPos < targetLimit) {
@@ -652,24 +761,18 @@ final class SegmentTermsEnumFrame {
if (cmp < 0) {
// Current entry is still before the target;
// keep scanning
-
- if (nextEnt == entCount) {
- if (exactOnly) {
- fillTerm();
- //termExists = true;
- }
- // We are done scanning this block
- break nextTerm;
- } else {
- continue nextTerm;
- }
+ continue nextTerm;
} else if (cmp > 0) {
// Done! Current entry is after target --
// return NOT_FOUND:
fillTerm();
+ //if (DEBUG) System.out.println(" maybe done exactOnly=" + exactOnly + " ste.termExists=" + ste.termExists);
+
if (!exactOnly && !ste.termExists) {
+ //System.out.println(" now pushFrame");
+ // TODO this
// We are on a sub-block, and caller wants
// us to position to the next term after
// the target, so we must recurse into the
Modified: lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/blocktree/Stats.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/blocktree/Stats.java?rev=1635002&r1=1635001&r2=1635002&view=diff
==============================================================================
--- lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/blocktree/Stats.java (original)
+++ lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/blocktree/Stats.java Tue Oct 28 22:52:49 2014
@@ -48,6 +48,8 @@ public class Stats {
/** Total number of bytes (sum of term lengths) across all terms in the field. */
public long totalTermBytes;
+ // TODO: add total auto-prefix term count
+
/** The number of normal (non-floor) blocks in the terms file. */
public int nonFloorBlockCount;
Modified: lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50PostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50PostingsFormat.java?rev=1635002&r1=1635001&r2=1635002&view=diff
==============================================================================
--- lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50PostingsFormat.java (original)
+++ lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50PostingsFormat.java Tue Oct 28 22:52:49 2014
@@ -391,8 +391,10 @@ public final class Lucene50PostingsForma
final static int VERSION_START = 0;
final static int VERSION_CURRENT = VERSION_START;
- private final int minTermBlockSize;
- private final int maxTermBlockSize;
+ private final int minTemsInBlock;
+ private final int maxItemsInBlock;
+ private final int minItemsInAutoPrefix;
+ private final int maxItemsInAutoPrefix;
/**
* Fixed packed block size, number of integers encoded in
@@ -404,19 +406,33 @@ public final class Lucene50PostingsForma
/** Creates {@code Lucene50PostingsFormat} with default
* settings. */
public Lucene50PostingsFormat() {
- this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
+ this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE, 0, 0);
}
/** Creates {@code Lucene50PostingsFormat} with custom
* values for {@code minBlockSize} and {@code
- * maxBlockSize} passed to block terms dictionary.
+ * maxBlockSize} and default values for {@code minItemsInAutoPrefix} and
+ * {@code maxItemsInAutoPrefix}, passed to block tree terms dictionary.
* @see BlockTreeTermsWriter#BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int) */
- public Lucene50PostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
+ public Lucene50PostingsFormat(int minTemsInBlock, int maxItemsInBlock) {
+ this(minTemsInBlock, maxItemsInBlock, 0, 0);
+ }
+
+ /** Creates {@code Lucene50PostingsFormat} with custom
+ * values for {@code minBlockSize}, {@code
+ * maxBlockSize}, {@code minItemsInAutoPrefix} and {@code maxItemsInAutoPrefix}, passed
+ * to block tree terms dictionary.
+ * @see BlockTreeTermsWriter#BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int,int,int) */
+ public Lucene50PostingsFormat(int minTemsInBlock, int maxItemsInBlock, int minItemsInAutoPrefix, int maxItemsInAutoPrefix) {
super("Lucene50");
- this.minTermBlockSize = minTermBlockSize;
- assert minTermBlockSize > 1;
- this.maxTermBlockSize = maxTermBlockSize;
- assert minTermBlockSize <= maxTermBlockSize;
+ BlockTreeTermsWriter.validateSettings(minTemsInBlock,
+ maxItemsInBlock);
+ BlockTreeTermsWriter.validateAutoPrefixSettings(minItemsInAutoPrefix,
+ maxItemsInAutoPrefix);
+ this.minTemsInBlock = minTemsInBlock;
+ this.maxItemsInBlock = maxItemsInBlock;
+ this.minItemsInAutoPrefix = minItemsInAutoPrefix;
+ this.maxItemsInAutoPrefix = maxItemsInAutoPrefix;
}
@Override
@@ -432,8 +448,10 @@ public final class Lucene50PostingsForma
try {
FieldsConsumer ret = new BlockTreeTermsWriter(state,
postingsWriter,
- minTermBlockSize,
- maxTermBlockSize);
+ minTemsInBlock,
+ maxItemsInBlock,
+ minItemsInAutoPrefix,
+ maxItemsInAutoPrefix);
success = true;
return ret;
} finally {
Modified: lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/document/Document2.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/document/Document2.java?rev=1635002&r1=1635001&r2=1635002&view=diff
==============================================================================
--- lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/document/Document2.java (original)
+++ lucene/dev/branches/lucene6005/lucene/core/src/java/org/apache/lucene/document/Document2.java Tue Oct 28 22:52:49 2014
@@ -24,7 +24,6 @@ import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.NumericTokenStream;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.FieldTypes.FieldType;
import org.apache.lucene.index.FieldInfo.DocValuesType;
@@ -81,6 +80,20 @@ public class Document2 implements IndexD
return boost;
}
+  /** Points {@code reuse}, or a new {@link BinaryTokenStream} when {@code reuse} is null, at {@code value}. */
+  private TokenStream getReusedBinaryTokenStream(BytesRef value, TokenStream reuse) {
+    final BinaryTokenStream stream;
+    if (reuse == null) {
+      stream = new BinaryTokenStream();
+    } else {
+      // A non-null stream of any other class is reported through FieldTypes.illegalState before the cast.
+      if (!(reuse instanceof BinaryTokenStream)) {
+        FieldTypes.illegalState(fieldName, "should have had BinaryTokenStream for reuse, but got " + reuse);
+      }
+      stream = (BinaryTokenStream) reuse;
+    }
+    stream.setValue(value);
+    return stream;
+  }
+
@Override
public TokenStream tokenStream(Analyzer analyzerIn, TokenStream reuse) throws IOException {
Analyzer analyzer = fieldTypes.getIndexAnalyzer();
@@ -92,41 +105,13 @@ public class Document2 implements IndexD
FieldTypes.FieldType fieldType = fieldTypes.getFieldType(fieldName);
switch (fieldType.valueType) {
case INT:
+ return getReusedBinaryTokenStream(intToBytes(((Number) value).intValue()), reuse);
case FLOAT:
+ return getReusedBinaryTokenStream(intToBytes(Float.floatToIntBits(((Number) value).floatValue())), reuse);
case LONG:
+ return getReusedBinaryTokenStream(longToBytes(((Number) value).longValue()), reuse);
case DOUBLE:
- NumericTokenStream nts;
- if (reuse != null) {
- if (reuse instanceof NumericTokenStream == false) {
- FieldTypes.illegalState(fieldName, "should have had NumericTokenStream for reuse, but got " + reuse);
- }
- nts = (NumericTokenStream) reuse;
- if (fieldType.numericPrecisionStep == null || nts.getPrecisionStep() != fieldType.numericPrecisionStep.intValue()) {
- FieldTypes.illegalState(fieldName, "reused NumericTokenStream has precisionStep " + nts.getPrecisionStep() + ", which is different from FieldType's " + fieldType.numericPrecisionStep);
- }
- } else {
- nts = new NumericTokenStream(fieldType.numericPrecisionStep);
- }
- // initialize value in TokenStream
- final Number number = (Number) value;
- switch (fieldType.valueType) {
- case INT:
- nts.setIntValue(number.intValue());
- break;
- case LONG:
- nts.setLongValue(number.longValue());
- break;
- case FLOAT:
- nts.setFloatValue(number.floatValue());
- break;
- case DOUBLE:
- nts.setDoubleValue(number.doubleValue());
- break;
- default:
- throw new AssertionError("Should never get here");
- }
- return nts;
-
+ return getReusedBinaryTokenStream(longToBytes(Double.doubleToLongBits(((Number) value).doubleValue())), reuse);
case ATOM:
if (value instanceof String) {
StringTokenStream sts;
@@ -142,17 +127,7 @@ public class Document2 implements IndexD
return sts;
} else {
assert value instanceof BytesRef;
- BinaryTokenStream bts;
- if (reuse != null) {
- if (reuse instanceof BinaryTokenStream == false) {
- FieldTypes.illegalState(fieldName, "should have had BinaryTokenStream for reuse, but got " + reuse);
- }
- bts = (BinaryTokenStream) reuse;
- } else {
- bts = new BinaryTokenStream();
- }
- bts.setValue((BytesRef) value);
- return bts;
+ return getReusedBinaryTokenStream((BytesRef) value, reuse);
}
case BINARY:
@@ -453,4 +428,30 @@ public class Document2 implements IndexD
}
}
}
+
+  /** Encodes {@code v} as a 4-byte {@link BytesRef}: sign bit flipped, most significant byte first. */
+  static BytesRef intToBytes(int v) {
+    // Presumably the sign flip makes unsigned bytewise comparison follow signed int order -- confirm with callers.
+    final int flipped = v ^ 0x80000000;
+    final BytesRef out = new BytesRef(4);
+    out.length = 4;
+    for (int i = 0; i < 4; i++) {
+      // Byte 0 receives bits 31..24, byte 3 receives bits 7..0.
+      out.bytes[i] = (byte) (flipped >>> (8 * (3 - i)));
+    }
+    return out;
+  }
+
+  /** Encodes {@code v} as an 8-byte {@link BytesRef}: sign bit flipped, most significant byte first. */
+  static BytesRef longToBytes(long v) {
+    // Presumably the sign flip makes unsigned bytewise comparison follow signed long order -- confirm with callers.
+    final long flipped = v ^ 0x8000000000000000L;
+    final BytesRef out = new BytesRef(8);
+    out.length = 8;
+    for (int i = 0; i < 8; i++) {
+      // Byte 0 receives bits 63..56, byte 7 receives bits 7..0.
+      out.bytes[i] = (byte) (flipped >>> (8 * (7 - i)));
+    }
+    return out;
+  }
}