You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2011/08/04 13:03:35 UTC
svn commit: r1153845 - in /lucene/dev/branches/blocktree_3030: ./
lucene/src/java/org/apache/lucene/index/codecs/
lucene/src/java/org/apache/lucene/index/codecs/pulsing/
lucene/src/java/org/apache/lucene/index/codecs/standard/
lucene/src/test/org/apach...
Author: mikemccand
Date: Thu Aug 4 11:03:34 2011
New Revision: 1153845
URL: http://svn.apache.org/viewvc?rev=1153845&view=rev
Log:
LUCENE-3030: fix more false exc in test sops; add missing & 0xff (caused Test2BTerms to fail); fix bug in IntersectEnum seekToStartTerm causing fail in TestFuzzyQuery2
Modified:
lucene/dev/branches/blocktree_3030/TODO
lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java
lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java
lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java
lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java
lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/Test2BTerms.java
lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java
Modified: lucene/dev/branches/blocktree_3030/TODO
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/TODO?rev=1153845&r1=1153844&r2=1153845&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/TODO (original)
+++ lucene/dev/branches/blocktree_3030/TODO Thu Aug 4 11:03:34 2011
@@ -1,4 +1,3 @@
-
perf tests:
- GRRR -- indexing MUCH slower now?
trunk:
@@ -34,9 +33,8 @@ automaton q should apply maxlength test
intersect should use suffix ref
-maybe blocks should NOT store sub-block pointers? it's reudundant w/ the index...
-
LATER:
+ - maybe blocks should NOT store sub-block pointers? it's redundant w/ the index...
- hmm: maybe switch PKLookupTask to intersect!? do we have fast string builder?
- hmm -- fix DOT when there are multiple outputs!? oh, maybe not -- it just works?
- maybe we should provide a "terms dict rewriter" tool? ie can rewrite terms dict w/ new settings after segment was already created
Modified: lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java?rev=1153845&r1=1153844&r2=1153845&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java Thu Aug 4 11:03:34 2011
@@ -624,7 +624,7 @@ public class BlockTreeTermsReader extend
void load(BytesRef frameIndexData) throws IOException {
- if (DEBUG) System.out.println(" load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? transitions[0] : "n/a"));
+ if (DEBUG) System.out.println(" load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? transitions[0] : "n/a" + " state=" + state));
if (frameIndexData != null && transitions.length != 0) {
// Floor frame
@@ -772,6 +772,9 @@ public class BlockTreeTermsReader extend
private final BytesRef savedStartTerm;
public IntersectEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
+ if (DEBUG) {
+ System.out.println("\nintEnum.init seg=" + segment);
+ }
// nocommit can we use suffixRef?
// nocommit in some cases we can do hard filter by
// length!! eg regexp ????????
@@ -805,9 +808,6 @@ public class BlockTreeTermsReader extend
if (startTerm != null) {
seekToStartTerm(startTerm);
}
- if (DEBUG) {
- System.out.println("\nintEnum.init seg=" + segment);
- }
}
@Override
@@ -846,6 +846,7 @@ public class BlockTreeTermsReader extend
f.fp = f.fpOrig = currentFrame.lastSubFP;
f.prefix = currentFrame.prefix + currentFrame.suffix;
+ if (DEBUG) System.out.println(" pushFrame state=" + state + " prefix=" + f.prefix);
f.setState(state);
// Walk the arc through the index -- we only
@@ -909,6 +910,15 @@ public class BlockTreeTermsReader extend
}
}
+ private int getState() {
+ int state = currentFrame.state;
+ for(int idx=0;idx<currentFrame.suffix;idx++) {
+ state = runAutomaton.step(state, currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff);
+ assert state != -1;
+ }
+ return state;
+ }
+
// NOTE: specialized to only doing the first-time
// seek, but we could generalize it to allow
// arbitrary seekExact/Ceil. Note that this is a
@@ -923,15 +933,6 @@ public class BlockTreeTermsReader extend
assert arc == currentFrame.arc;
for(int idx=0;idx<=target.length;idx++) {
- final int targetLabel = idx == target.length ? -1 : target.bytes[target.offset+idx] & 0xff;
- final int nextState;
- if (idx < target.length) {
- nextState = runAutomaton.step(currentFrame.state, targetLabel);
- assert nextState != -1;
- } else {
- nextState = -1;
- }
- if (DEBUG) System.out.println(" idx=" + idx + " label=" + (char) targetLabel + " f.ord=" + currentFrame.ord);
boolean lastIsSubBlock = false;
@@ -953,8 +954,7 @@ public class BlockTreeTermsReader extend
if (isSubBlock && target.startsWith(term)) {
// Recurse
- assert nextState != -1;
- currentFrame = pushFrame(nextState);
+ currentFrame = pushFrame(getState());
break;
} else {
final int cmp = term.compareTo(target);
@@ -989,10 +989,10 @@ public class BlockTreeTermsReader extend
term.length = currentFrame.prefix + currentFrame.suffix;
if (lastIsSubBlock) {
// Recurse
- currentFrame = pushFrame(nextState);
+ currentFrame = pushFrame(getState());
break;
} else {
- if (DEBUG) System.out.println(" return term=" + brToString(term));
+ if (DEBUG) System.out.println(" fallback return term=" + brToString(term) + " curFrame.nextEnt=" + currentFrame.nextEnt);
return;
}
}
@@ -1053,9 +1053,6 @@ public class BlockTreeTermsReader extend
// sneaky! forces a pop above
currentFrame.isLastInFloor = true;
- //while (!currentFrame.isLastInFloor) {
- //currentFrame.loadNextFloorBlock();
- //}
currentFrame.nextEnt = currentFrame.entCount;
continue nextTerm;
}
@@ -1072,7 +1069,10 @@ public class BlockTreeTermsReader extend
state = runAutomaton.step(state, currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff);
if (state == -1) {
// No match
+ //System.out.println(" no s=" + state);
continue nextTerm;
+ } else {
+ //System.out.println(" c s=" + state);
}
}
@@ -1088,7 +1088,7 @@ public class BlockTreeTermsReader extend
assert savedStartTerm == null || term.compareTo(savedStartTerm) > 0: "saveStartTerm=" + savedStartTerm.utf8ToString() + " term=" + term.utf8ToString();
return term;
} else {
- //System.out.println(" no match");
+ //System.out.println(" no s=" + state);
}
}
}
Modified: lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java?rev=1153845&r1=1153844&r2=1153845&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java Thu Aug 4 11:03:34 2011
@@ -461,12 +461,12 @@ public class BlockTreeTermsWriter extend
assert numSubs == 0;
label = -1;
} else {
- label = term.term.bytes[term.term.offset + prefixLength];
+ label = term.term.bytes[term.term.offset + prefixLength] & 0xff;
}
} else {
PendingBlock block = (PendingBlock) ent;
assert block.prefix.length > prefixLength;
- label = block.prefix.bytes[block.prefix.offset + prefixLength];
+ label = block.prefix.bytes[block.prefix.offset + prefixLength] & 0xff;
}
if (label != lastLabel && (termCount + subCount) != 0) {
@@ -579,6 +579,8 @@ public class BlockTreeTermsWriter extend
}
if (curStart <= maxItemsInBlock) {
+ // nocommit -- should we do a better job
+ // segmenting here...?
// remainder is small enough to fit into a
// block. NOTE that this may be too small (<
// minItemsInBlock); need a true segmenter
@@ -605,16 +607,14 @@ public class BlockTreeTermsWriter extend
return 1;
}
- String brPrefixToString(BytesRef b) {
- // nocommit
- return b.toString();
- //return b.utf8ToString() + " " + b;
- }
-
- String brToString(BytesRef b) {
- // nocommit
- // return b.toString();
- return b.utf8ToString() + " " + b;
+ // for debugging
+ private String toString(BytesRef b) {
+ final String s;
+ try {
+ return b.utf8ToString() + " " + b;
+ } catch (Throwable t) {
+ return b.toString();
+ }
}
// TODO: we could block-write the term suffix pointers;
@@ -624,27 +624,21 @@ public class BlockTreeTermsWriter extend
assert length > 0;
- final BytesRef prefix = new BytesRef(indexPrefixLength);
- for(int m=0;m<indexPrefixLength;m++) {
- prefix.bytes[m] = (byte) prevTerm.ints[m];
- }
- prefix.length = indexPrefixLength;
-
- /*if (isFloor) {
- System.out.println(" wb seg=" + segment + " prefix=" + prefix.utf8ToString() + " " + prefix + " field=" + fieldInfo.name + " prefix=" + prefixLength + " pending=" + pending.size() + " start=" + start + " length=" + length);
- } else {
- System.out.println("\nWB seg=" + segment + " prefix=" + prefix.utf8ToString() + " " + prefix + " field=" + fieldInfo.name + " prefix=" + prefixLength + " pending=" + pending.size() + " start=" + start + " length=" + length);
- }*/
assert pending.size() >= start: "pending.size()=" + pending.size() + " start=" + start + " length=" + length;
final List<Object> slice = pending.subList(pending.size()-start, pending.size()-start + length);
final long startFP = out.getFilePointer();
+ final BytesRef prefix = new BytesRef(indexPrefixLength);
+ for(int m=0;m<indexPrefixLength;m++) {
+ prefix.bytes[m] = (byte) prevTerm.ints[m];
+ }
+ prefix.length = indexPrefixLength;
out.writeVInt((length<<1)|(isLastInFloor ? 1:0));
if (DEBUG2 || DEBUG) {
- System.out.println(" writeBlock " + (isFloor ? "(floor) " : "") + "seg=" + segment + " pending.size()=" + pending.size() + " prefixLength=" + prefixLength + " indexPrefix=" + prefix + " entCount=" + length + " startFP=" + startFP + " futureTermCount=" + futureTermCount + (isFloor ? (" floorLeadByte=" + Integer.toHexString(floorLeadByte&0xff)) : "") + " isLastInFloor=" + isLastInFloor);
+ System.out.println(" writeBlock " + (isFloor ? "(floor) " : "") + "seg=" + segment + " pending.size()=" + pending.size() + " prefixLength=" + prefixLength + " indexPrefix=" + toString(prefix) + " entCount=" + length + " startFP=" + startFP + " futureTermCount=" + futureTermCount + (isFloor ? (" floorLeadByte=" + Integer.toHexString(floorLeadByte&0xff)) : "") + " isLastInFloor=" + isLastInFloor);
}
// 1st pass: pack term suffix bytes into byte[] blob
@@ -699,7 +693,7 @@ public class BlockTreeTermsWriter extend
BytesRef suffixBytes = new BytesRef(suffix);
System.arraycopy(block.prefix.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
suffixBytes.length = suffix;
- System.out.println(" write sub-block suffix=" + brPrefixToString(suffixBytes) + " subFP=" + block.fp + " subCode=" + (startFP-block.fp) + " floor=" + block.isFloor);
+ System.out.println(" write sub-block suffix=" + toString(suffixBytes) + " subFP=" + block.fp + " subCode=" + (startFP-block.fp) + " floor=" + block.isFloor);
}
bytesWriter.writeVLong(startFP - block.fp);
@@ -771,7 +765,7 @@ public class BlockTreeTermsWriter extend
@Override
public PostingsConsumer startTerm(BytesRef text) throws IOException {
- if (DEBUG) System.out.println("\nBTTW.startTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment);
+ if (DEBUG) System.out.println("\nBTTW.startTerm term=" + fieldInfo.name + ":" + toString(text) + " seg=" + segment);
postingsWriter.startTerm();
/*
if (fieldInfo.name.equals("id")) {
@@ -787,7 +781,7 @@ public class BlockTreeTermsWriter extend
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
assert stats.docFreq > 0;
- if (DEBUG) System.out.println("BTTW.finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment + " df=" + stats.docFreq);
+ if (DEBUG) System.out.println("BTTW.finishTerm term=" + fieldInfo.name + ":" + toString(text) + " seg=" + segment + " df=" + stats.docFreq);
blockBuilder.add(text, noOutputs.getNoOutput());
pending.add(new PendingTerm(new BytesRef(text), stats));
Modified: lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java?rev=1153845&r1=1153844&r2=1153845&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java Thu Aug 4 11:03:34 2011
@@ -66,7 +66,7 @@ public class PulsingCodec extends Codec
@Override
public String toString() {
- return name + "(freqCutoff=" + freqCutoff + ")";
+ return name + "(freqCutoff=" + freqCutoff + " minBlockSize=" + minBlockSize + " maxBlockSize=" + maxBlockSize + ")";
}
@Override
Modified: lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java?rev=1153845&r1=1153844&r2=1153845&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java Thu Aug 4 11:03:34 2011
@@ -124,6 +124,11 @@ public class StandardCodec extends Codec
}
@Override
+ public String toString() {
+ return name + "(minBlockSize=" + minBlockSize + " maxBlockSize=" + maxBlockSize + ")";
+ }
+
+ @Override
public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS());
}
Modified: lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/Test2BTerms.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/Test2BTerms.java?rev=1153845&r1=1153844&r2=1153845&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/Test2BTerms.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/Test2BTerms.java Thu Aug 4 11:03:34 2011
@@ -169,6 +169,7 @@ public class Test2BTerms extends LuceneT
.setMergePolicy(newLogMergePolicy(false, 10))
.setOpenMode(IndexWriterConfig.OpenMode.CREATE));
+ w.setInfoStream(VERBOSE ? System.out : null);
MergePolicy mp = w.getConfig().getMergePolicy();
if (mp instanceof LogByteSizeMergePolicy) {
// 1 petabyte:
Modified: lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java?rev=1153845&r1=1153844&r2=1153845&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java Thu Aug 4 11:03:34 2011
@@ -175,12 +175,21 @@ public class TestTermsEnum extends Lucen
terms.clear();
}
+ private boolean accepts(CompiledAutomaton c, BytesRef b) {
+ int state = c.runAutomaton.getInitialState();
+ for(int idx=0;idx<b.length;idx++) {
+ assertTrue(state != -1);
+ state = c.runAutomaton.step(state, b.bytes[b.offset+idx] & 0xff);
+ }
+ return c.runAutomaton.isAccept(state);
+ }
+
// Tests Terms.intersect
public void testIntersectRandom() throws IOException {
final Directory dir = newDirectory();
final RandomIndexWriter w = new RandomIndexWriter(random, dir);
-
+
final int numTerms = atLeast(1000);
final Set<String> terms = new HashSet<String>();
@@ -267,6 +276,7 @@ public class TestTermsEnum extends Lucen
final BytesRef b = new BytesRef(s);
acceptTermsArray[upto++] = b;
acceptTermsSet.add(b);
+ assertTrue(accepts(c, b));
}
Arrays.sort(acceptTermsArray);