You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2011/08/06 00:25:04 UTC
svn commit: r1154396 - in /lucene/dev/branches/blocktree_3030/lucene/src:
java/org/apache/lucene/index/codecs/
java/org/apache/lucene/index/codecs/pulsing/
java/org/apache/lucene/index/codecs/sep/
java/org/apache/lucene/index/codecs/standard/ java/org/...
Author: mikemccand
Date: Fri Aug 5 22:25:03 2011
New Revision: 1154396
URL: http://svn.apache.org/viewvc?rev=1154396&view=rev
Log:
LUCENE-3030: fix more nocommits
Removed:
lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/codecs/TestBlockTree.java
Modified:
lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTermState.java
lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java
lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java
lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java
lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java
lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java
lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java
lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java
lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/store/ByteArrayDataInput.java
lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
lucene/dev/branches/blocktree_3030/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java
lucene/dev/branches/blocktree_3030/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java
lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java
lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/search/TestWildcard.java
lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java
Modified: lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTermState.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTermState.java?rev=1154396&r1=1154395&r2=1154396&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTermState.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTermState.java Fri Aug 5 22:25:03 2011
@@ -32,10 +32,6 @@ public class BlockTermState extends OrdT
public int termBlockOrd; // the term's ord in the current block
public long blockFilePointer; // fp into the terms dict primary file (_X.tim) that holds this term
- // nocommit -- should not be here? BT dict doesn't need
- // it but B dict does?
- public int blockTermCount; // how many terms in current block
-
@Override
public void copyFrom(TermState _other) {
assert _other instanceof BlockTermState : "can not copy from " + _other.getClass().getName();
Modified: lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java?rev=1154396&r1=1154395&r2=1154396&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java Fri Aug 5 22:25:03 2011
@@ -45,10 +45,6 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.DoubleBarrelLRUCache;
-// nocommit -- cut the postings interface over to the same
-// as blocktree? this way we don't need 2x of each codec's
-// postings impls?
-
/** Handles a terms dict, but decouples all details of
* doc/freqs/positions reading to an instance of {@link
* PostingsReaderBase}. This class is reusable for
@@ -110,8 +106,7 @@ public class BlockTermsReader extends Fi
}
}
- // nocommit
- private String segment;
+ // private String segment;
public BlockTermsReader(TermsIndexReaderBase indexReader, Directory dir, FieldInfos fieldInfos, String segment, PostingsReaderBase postingsReader, IOContext context,
int termsCacheSize, int codecId)
@@ -120,7 +115,7 @@ public class BlockTermsReader extends Fi
this.postingsReader = postingsReader;
termsCache = new DoubleBarrelLRUCache<FieldAndTerm,BlockTermState>(termsCacheSize);
- this.segment = segment;
+ // this.segment = segment;
in = dir.openInput(IndexFileNames.segmentFileName(segment, codecId, BlockTermsWriter.TERMS_EXTENSION),
context);
@@ -326,6 +321,9 @@ public class BlockTermsReader extends Fi
/* Common prefix used for all terms in this block. */
private int termBlockPrefix;
+ /* How many terms in current block */
+ private int blockTermCount;
+
private byte[] docFreqBytes;
private final ByteArrayDataInput freqReader = new ByteArrayDataInput();
private int metaDataUpto;
@@ -447,7 +445,7 @@ public class BlockTermsReader extends Fi
//System.out.println(" seek: term=" + term.utf8ToString());
} else {
//System.out.println(" skip seek");
- if (state.termBlockOrd == state.blockTermCount && !nextBlock()) {
+ if (state.termBlockOrd == blockTermCount && !nextBlock()) {
indexIsCurrent = false;
return SeekStatus.END;
}
@@ -483,8 +481,8 @@ public class BlockTermsReader extends Fi
// but it could be in next block. We
// must scan to end-of-block to set common
// prefix for next block:
- if (state.termBlockOrd < state.blockTermCount) {
- while(state.termBlockOrd < state.blockTermCount-1) {
+ if (state.termBlockOrd < blockTermCount) {
+ while(state.termBlockOrd < blockTermCount-1) {
state.termBlockOrd++;
state.ord++;
termSuffixesReader.skipBytes(termSuffixesReader.readVInt());
@@ -584,7 +582,7 @@ public class BlockTermsReader extends Fi
}
}
- if (state.termBlockOrd == state.blockTermCount) {
+ if (state.termBlockOrd == blockTermCount) {
// Must pre-fill term for next block's common prefix
term.length = termBlockPrefix + suffix;
if (term.bytes.length < term.length) {
@@ -650,8 +648,8 @@ public class BlockTermsReader extends Fi
metadata, ie docFreq, totalTermFreq or pulls a D/&PEnum, we then (lazily)
decode all metadata up to the current term. */
private BytesRef _next() throws IOException {
- //System.out.println("BTR._next seg=" + segment + " this=" + this + " termCount=" + state.termBlockOrd + " (vs " + state.blockTermCount + ")");
- if (state.termBlockOrd == state.blockTermCount && !nextBlock()) {
+ //System.out.println("BTR._next seg=" + segment + " this=" + this + " termCount=" + state.termBlockOrd + " (vs " + blockTermCount + ")");
+ if (state.termBlockOrd == blockTermCount && !nextBlock()) {
//System.out.println(" eof");
indexIsCurrent = false;
return null;
@@ -804,9 +802,9 @@ public class BlockTermsReader extends Fi
//System.out.println("BTR.nextBlock() fp=" + in.getFilePointer() + " this=" + this);
state.blockFilePointer = in.getFilePointer();
- state.blockTermCount = in.readVInt();
- //System.out.println(" blockTermCount=" + state.blockTermCount);
- if (state.blockTermCount == 0) {
+ blockTermCount = in.readVInt();
+ //System.out.println(" blockTermCount=" + blockTermCount);
+ if (blockTermCount == 0) {
return false;
}
termBlockPrefix = in.readVInt();
Modified: lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java?rev=1154396&r1=1154395&r2=1154396&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java Fri Aug 5 22:25:03 2011
@@ -64,8 +64,6 @@ import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;
-// nocommit finish jdocs
-
/** A block-based terms index and dictionary that assigns
* terms to variable length blocks according to how they
* share prefixes. The terms index is a prefix trie
@@ -79,7 +77,21 @@ import org.apache.lucene.util.fst.Util;
*
* <p><b>NOTE</b>: this terms dictionary does not support
* index divisor when opening an IndexReader. Instead, you
- * can change the min/maxItemsPerBlock during indexing.
+ * can change the min/maxItemsPerBlock during indexing.</p>
+ *
+ * <p>The data structure used by this implementation is very
+ * similar to a burst trie
+ * (http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.3499),
+ * but with added logic to break up too-large blocks of all
+ * terms sharing a given prefix into smaller ones.</p>
+ *
+ * <p>Use {@link CheckIndex} with the <code>-verbose</code>
+ * option to see summary statistics on the blocks in the
+ * dictionary.
+ *
+ * See {@link BlockTreeTermsWriter}.
+ *
+ * @lucene.experimental
*/
public class BlockTreeTermsReader extends FieldsProducer {
Modified: lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java?rev=1154396&r1=1154395&r2=1154396&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java Fri Aug 5 22:25:03 2011
@@ -83,6 +83,11 @@ import org.apache.lucene.util.fst.Util;
* @lucene.experimental
*/
+/** See {@link BlockTreeTermsReader}.
+ *
+ * @lucene.experimental
+*/
+
public class BlockTreeTermsWriter extends FieldsConsumer {
public static boolean DEBUG = false;
@@ -116,6 +121,10 @@ public class BlockTreeTermsWriter extend
private final List<TermsWriter> fields = new ArrayList<TermsWriter>();
private final String segment;
+ /** Create a new writer. The number of items (terms or
+ * sub-blocks) per block will aim to be between
+ * minItemsPerBlock and maxItemsPerBlock, though in some
+ * cases the blocks may be smaller than the min. */
public BlockTreeTermsWriter(
SegmentWriteState state,
PostingsWriterBase postingsWriter,
@@ -123,8 +132,8 @@ public class BlockTreeTermsWriter extend
int maxItemsInBlock)
throws IOException
{
- if (minItemsInBlock <= 0) {
- throw new IllegalArgumentException("minItemsInBlock must be >= 1; got " + minItemsInBlock);
+ if (minItemsInBlock <= 1) {
+ throw new IllegalArgumentException("minItemsInBlock must be >= 2; got " + minItemsInBlock);
}
if (maxItemsInBlock <= 0) {
throw new IllegalArgumentException("maxItemsInBlock must be >= 1; got " + maxItemsInBlock);
Modified: lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java?rev=1154396&r1=1154395&r2=1154396&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java Fri Aug 5 22:25:03 2011
@@ -61,6 +61,7 @@ public class PulsingCodec extends Codec
super("Pulsing");
this.freqCutoff = freqCutoff;
this.minBlockSize = minBlockSize;
+ assert minBlockSize > 1;
this.maxBlockSize = maxBlockSize;
}
Modified: lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java?rev=1154396&r1=1154395&r2=1154396&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java Fri Aug 5 22:25:03 2011
@@ -335,7 +335,8 @@ public final class PulsingPostingsWriter
buffer.writeTo(termsOut);
buffer.reset();
- // nocommit: O(N^2) though with small N...
+ // TODO: this could be somewhat costly since
+ // pendingTerms.size() could be biggish?
int futureWrappedCount = 0;
final int limit2 = pendingTerms.size();
for(int idx=limit;idx<limit2;idx++) {
Modified: lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java?rev=1154396&r1=1154395&r2=1154396&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java Fri Aug 5 22:25:03 2011
@@ -334,8 +334,6 @@ public final class SepPostingsWriterImpl
long lastSkipFP = 0;
if (count == 0) {
- // nocommit: silly? can we avoid this if we know
- // block has no terms?
termsOut.writeByte((byte) 0);
return;
}
Modified: lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java?rev=1154396&r1=1154395&r2=1154396&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java Fri Aug 5 22:25:03 2011
@@ -49,6 +49,7 @@ public class StandardCodec extends Codec
public StandardCodec(int minBlockSize, int maxBlockSize) {
super("Standard");
this.minBlockSize = minBlockSize;
+ assert minBlockSize > 1;
this.maxBlockSize = maxBlockSize;
}
Modified: lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/store/ByteArrayDataInput.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/store/ByteArrayDataInput.java?rev=1154396&r1=1154395&r2=1154396&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/store/ByteArrayDataInput.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/store/ByteArrayDataInput.java Fri Aug 5 22:25:03 2011
@@ -43,9 +43,9 @@ public final class ByteArrayDataInput ex
reset(bytes, 0, bytes.length);
}
+ // NOTE: sets pos to 0, which is not right if you had
+ // called reset w/ non-zero offset!!
public void rewind() {
- // nocommit -- not right if .reset was called w/
- // non-zero offset...
pos = 0;
}
Modified: lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java?rev=1154396&r1=1154395&r2=1154396&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java Fri Aug 5 22:25:03 2011
@@ -38,19 +38,12 @@ public class CompiledAutomaton {
public final BytesRef commonSuffixRef;
public final boolean finite;
- public final Automaton a;
- public final Automaton utf8;
-
// nocommit -- move pulling of a TermsEnum into here, so
// that we can optimize for cases where a simpler enum
// (prefix enum, all terms, no terms, etc.) can be used
public CompiledAutomaton(Automaton automaton, boolean finite) {
- // nocommit
- this.a = automaton;
Automaton utf8 = new UTF32ToUTF8().convert(automaton);
- // nocommit
- this.utf8 = utf8;
runAutomaton = new ByteRunAutomaton(utf8, true);
sortedTransitions = utf8.getSortedTransitions();
this.finite = finite;
@@ -63,9 +56,6 @@ public class CompiledAutomaton {
private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;
- // nocommit -- needs tests; make sure we test infinite
- // case (should just work?)
-
private BytesRef addTail(int state, BytesRef term, int idx, int leadLabel) {
// Find biggest transition that's < label
@@ -128,8 +118,6 @@ public class CompiledAutomaton {
* accepted by this Automaton). */
public BytesRef floor(BytesRef input, BytesRef output) {
- // nocommit make sure we test empty string
-
output.offset = 0;
if (DEBUG) System.out.println("CA.floor input=" + input.utf8ToString());
Modified: lucene/dev/branches/blocktree_3030/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java?rev=1154396&r1=1154395&r2=1154396&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java Fri Aug 5 22:25:03 2011
@@ -181,8 +181,10 @@ public class MockRandomCodec extends Cod
System.out.println("MockRandomCodec: writing BlockTree terms dict");
}
- final int minTermsInBlock = _TestUtil.nextInt(random, 1, 100);
- final int maxTermsInBlock = Math.max(1, (minTermsInBlock-1)*2 + random.nextInt(100));
+ // TODO: would be nice to allow 1 but this is very
+ // slow to write
+ final int minTermsInBlock = _TestUtil.nextInt(random, 2, 100);
+ final int maxTermsInBlock = Math.max(2, (minTermsInBlock-1)*2 + random.nextInt(100));
boolean success = false;
try {
Modified: lucene/dev/branches/blocktree_3030/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java?rev=1154396&r1=1154395&r2=1154396&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java Fri Aug 5 22:25:03 2011
@@ -282,8 +282,8 @@ public abstract class LuceneTestCase ext
swapCodec(new MockSepCodec(), cp);
// TODO: make it possible to specify min/max items per
// block via CL:
- int minItemsPerBlock = _TestUtil.nextInt(random, 1, 100);
- int maxItemsPerBlock = 2*(Math.max(1, minItemsPerBlock-1)) + random.nextInt(100);
+ int minItemsPerBlock = _TestUtil.nextInt(random, 2, 100);
+ int maxItemsPerBlock = 2*(Math.max(2, minItemsPerBlock-1)) + random.nextInt(100);
swapCodec(new PulsingCodec(codecHasParam && "Pulsing".equals(codec) ? codecParam : 1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock), cp);
swapCodec(new MockFixedIntBlockCodec(codecHasParam && "MockFixedIntBlock".equals(codec) ? codecParam : _TestUtil.nextInt(random, 1, 2000)), cp);
// baseBlockSize cannot be over 127:
@@ -1564,13 +1564,13 @@ public abstract class LuceneTestCase ext
this.perFieldSeed = random.nextInt();
// TODO: make it possible to specify min/max items per
// block via CL:
- int minItemsPerBlock = _TestUtil.nextInt(random, 1, 100);
- int maxItemsPerBlock = 2*(Math.max(1, minItemsPerBlock-1)) + random.nextInt(100);
+ int minItemsPerBlock = _TestUtil.nextInt(random, 2, 100);
+ int maxItemsPerBlock = 2*(Math.max(2, minItemsPerBlock-1)) + random.nextInt(100);
register(randomizCodec(random, new StandardCodec(minItemsPerBlock, maxItemsPerBlock)));
register(randomizCodec(random, new PreFlexCodec()));
// TODO: make it possible to specify min/max items per
// block via CL:
- minItemsPerBlock = _TestUtil.nextInt(random, 1, 100);
+ minItemsPerBlock = _TestUtil.nextInt(random, 2, 100);
maxItemsPerBlock = 2*(Math.max(1, minItemsPerBlock-1)) + random.nextInt(100);
register(randomizCodec(random, new PulsingCodec( 1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock)));
register(randomizCodec(random, new SimpleTextCodec()));
Modified: lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java?rev=1154396&r1=1154395&r2=1154396&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java Fri Aug 5 22:25:03 2011
@@ -29,9 +29,12 @@ import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
+import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
+import org.apache.lucene.index.codecs.CoreCodecProvider;
+import org.apache.lucene.index.codecs.standard.StandardCodec;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
@@ -338,4 +341,390 @@ public class TestTermsEnum extends Lucen
r.close();
dir.close();
}
+
+ private Directory d;
+ private IndexReader r;
+
+ private final String FIELD = "field";
+
+ private IndexReader makeIndex(int minTermsInBlock, int maxTermsInBlock, String... terms) throws Exception {
+ d = newDirectory();
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random));
+
+ /*
+ CoreCodecProvider cp = new CoreCodecProvider();
+ cp.unregister(cp.lookup("Standard"));
+ cp.register(new StandardCodec(minTermsInBlock, maxTermsInBlock));
+ cp.setDefaultFieldCodec("Standard");
+ iwc.setCodecProvider(cp);
+ */
+
+ final RandomIndexWriter w = new RandomIndexWriter(random, d, iwc);
+ w.w.setInfoStream(VERBOSE ? System.out : null);
+ for(String term : terms) {
+ Document doc = new Document();
+ Field f = newField(FIELD, term, Field.Index.NOT_ANALYZED_NO_NORMS);
+ doc.add(f);
+ w.addDocument(doc);
+ }
+ if (r != null) {
+ close();
+ }
+ r = w.getReader();
+ w.close();
+ return r;
+ }
+
+ private void close() throws Exception {
+ final Directory d = ((SegmentReader) r.getSequentialSubReaders()[0]).directory();
+ r.close();
+ d.close();
+ }
+
+ private int docFreq(IndexReader r, String term) throws Exception {
+ return r.docFreq(new Term(FIELD, term));
+ }
+
+ public void testEasy() throws Exception {
+ // No floor arcs:
+ r = makeIndex(3, 6, "aa0", "aa1", "aa2", "aa3", "bb0", "bb1", "bb2", "bb3", "aa");
+
+ // First term in block:
+ assertEquals(1, docFreq(r, "aa0"));
+
+ // Scan forward to another term in same block
+ assertEquals(1, docFreq(r, "aa2"));
+
+ assertEquals(1, docFreq(r, "aa"));
+
+ // Reset same block then scan forwards
+ assertEquals(1, docFreq(r, "aa1"));
+
+ // Not found, in same block
+ assertEquals(0, docFreq(r, "aa5"));
+
+ // Found, in same block
+ assertEquals(1, docFreq(r, "aa2"));
+
+ // Not found in index:
+ assertEquals(0, docFreq(r, "b0"));
+
+ // Found:
+ assertEquals(1, docFreq(r, "aa2"));
+
+ // Found, rewind:
+ assertEquals(1, docFreq(r, "aa0"));
+
+
+ // First term in block:
+ assertEquals(1, docFreq(r, "bb0"));
+
+ // Scan forward to another term in same block
+ assertEquals(1, docFreq(r, "bb2"));
+
+ // Reset same block then scan forwards
+ assertEquals(1, docFreq(r, "bb1"));
+
+ // Not found, in same block
+ assertEquals(0, docFreq(r, "bb5"));
+
+ // Found, in same block
+ assertEquals(1, docFreq(r, "bb2"));
+
+ // Not found in index:
+ assertEquals(0, docFreq(r, "b0"));
+
+ // Found:
+ assertEquals(1, docFreq(r, "bb2"));
+
+ // Found, rewind:
+ assertEquals(1, docFreq(r, "bb0"));
+
+ close();
+ }
+
+ // tests:
+ // - test same prefix has non-floor block and floor block (ie, has 2 long outputs on same term prefix)
+ // - term that's entirely in the index
+
+ public void testFloorBlocks() throws Exception {
+ final String[] terms = new String[] {"aa0", "aa1", "aa2", "aa3", "aa4", "aa5", "aa6", "aa7", "aa8", "aa9", "aa", "xx"};
+ r = makeIndex(3, 6, terms);
+ //r = makeIndex(3, 6, "aa0", "aa1", "aa2", "aa3", "aa4", "aa5", "aa6", "aa7", "aa8", "aa9");
+
+ // First term in first block:
+ assertEquals(1, docFreq(r, "aa0"));
+ assertEquals(1, docFreq(r, "aa4"));
+
+ // No block
+ assertEquals(0, docFreq(r, "bb0"));
+
+ // Second block
+ assertEquals(1, docFreq(r, "aa4"));
+
+ // Backwards to prior floor block:
+ assertEquals(1, docFreq(r, "aa0"));
+
+ // Forwards to last floor block:
+ assertEquals(1, docFreq(r, "aa9"));
+
+ assertEquals(0, docFreq(r, "a"));
+ assertEquals(1, docFreq(r, "aa"));
+ assertEquals(0, docFreq(r, "a"));
+ assertEquals(1, docFreq(r, "aa"));
+
+ // Forwards to last floor block:
+ assertEquals(1, docFreq(r, "xx"));
+ assertEquals(1, docFreq(r, "aa1"));
+ assertEquals(0, docFreq(r, "yy"));
+
+ assertEquals(1, docFreq(r, "xx"));
+ assertEquals(1, docFreq(r, "aa9"));
+
+ assertEquals(1, docFreq(r, "xx"));
+ assertEquals(1, docFreq(r, "aa4"));
+
+ final TermsEnum te = r.getSequentialSubReaders()[0].fields().terms(FIELD).iterator();
+ while(te.next() != null) {
+ //System.out.println("TEST: next term=" + te.term().utf8ToString());
+ }
+
+ assertTrue(seekExact(te, "aa1"));
+ assertEquals("aa2", next(te));
+ assertTrue(seekExact(te, "aa8"));
+ assertEquals("aa9", next(te));
+ assertEquals("xx", next(te));
+
+ testRandomSeeks(r, terms);
+ close();
+ }
+
+ public void testZeroTerms() throws Exception {
+ d = newDirectory();
+ final RandomIndexWriter w = new RandomIndexWriter(random, d);
+ w.w.setInfoStream(VERBOSE ? System.out : null);
+ Document doc = new Document();
+ doc.add(newField("field", "one two three", Field.Index.ANALYZED));
+ doc = new Document();
+ doc.add(newField("field2", "one two three", Field.Index.ANALYZED));
+ w.addDocument(doc);
+ w.commit();
+ w.deleteDocuments(new Term("field", "one"));
+ w.optimize();
+ IndexReader r = w.getReader();
+ w.close();
+ assertEquals(1, r.numDocs());
+ assertEquals(1, r.maxDoc());
+ Terms terms = MultiFields.getTerms(r, "field");
+ if (terms != null) {
+ assertNull(terms.iterator().next());
+ }
+ r.close();
+ d.close();
+ }
+
+ private String getRandomString() {
+ //return _TestUtil.randomSimpleString(random);
+ return _TestUtil.randomRealisticUnicodeString(random);
+ }
+
+ public void testRandomTerms() throws Exception {
+ final String[] terms = new String[_TestUtil.nextInt(random, 1, atLeast(1000))];
+ final Set<String> seen = new HashSet<String>();
+
+ final boolean allowEmptyString = random.nextBoolean();
+
+ if (random.nextInt(10) == 7 && terms.length > 2) {
+ // Sometimes add a bunch of terms sharing a longish common prefix:
+ final int numTermsSamePrefix = random.nextInt(terms.length/2);
+ if (numTermsSamePrefix > 0) {
+ String prefix;
+ while(true) {
+ prefix = getRandomString();
+ if (prefix.length() < 5) {
+ continue;
+ } else {
+ break;
+ }
+ }
+ while(seen.size() < numTermsSamePrefix) {
+ final String t = prefix + getRandomString();
+ if (!seen.contains(t)) {
+ terms[seen.size()] = t;
+ seen.add(t);
+ }
+ }
+ }
+ }
+
+ while(seen.size() < terms.length) {
+ final String t = getRandomString();
+ if (!seen.contains(t) && (allowEmptyString || t.length() != 0)) {
+ terms[seen.size()] = t;
+ seen.add(t);
+ }
+ }
+ final int minBlockSize = _TestUtil.nextInt(random, 1, 10);
+ final int maxBlockSize = Math.max(2*(minBlockSize-1) + random.nextInt(60), 1);
+ if (VERBOSE) {
+ System.out.println("TEST: minBlockSize=" + minBlockSize + " maxBlockSize=" + maxBlockSize);
+ }
+ r = makeIndex(minBlockSize, maxBlockSize, terms);
+ testRandomSeeks(r, terms);
+ close();
+ }
+
+ // sugar
+ private boolean seekExact(TermsEnum te, String term) throws IOException {
+ return te.seekExact(new BytesRef(term), random.nextBoolean());
+ }
+
+ // sugar
+ private String next(TermsEnum te) throws IOException {
+ final BytesRef br = te.next();
+ if (br == null) {
+ return null;
+ } else {
+ return br.utf8ToString();
+ }
+ }
+
+ private BytesRef getNonExistTerm(BytesRef[] terms) {
+ BytesRef t = null;
+ while(true) {
+ final String ts = getRandomString();
+ t = new BytesRef(ts);
+ if (Arrays.binarySearch(terms, t) < 0) {
+ return t;
+ }
+ }
+ }
+
+ private static class TermAndState {
+ public final BytesRef term;
+ public final TermState state;
+
+ public TermAndState(BytesRef term, TermState state) {
+ this.term = term;
+ this.state = state;
+ }
+ }
+
+ private void testRandomSeeks(IndexReader r, String... validTermStrings) throws IOException {
+ final BytesRef[] validTerms = new BytesRef[validTermStrings.length];
+ for(int termIDX=0;termIDX<validTermStrings.length;termIDX++) {
+ validTerms[termIDX] = new BytesRef(validTermStrings[termIDX]);
+ }
+ Arrays.sort(validTerms);
+ if (VERBOSE) {
+ System.out.println("TEST: " + validTerms.length + " terms:");
+ for(BytesRef t : validTerms) {
+ System.out.println(" " + t.utf8ToString() + " " + t);
+ }
+ }
+ final TermsEnum te = MultiFields.getTerms(r, FIELD).iterator();
+
+ final int END_LOC = -validTerms.length-1;
+
+ final List<TermAndState> termStates = new ArrayList<TermAndState>();
+
+ for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
+
+ final BytesRef t;
+ int loc;
+ final TermState termState;
+ if (random.nextInt(6) == 4) {
+ // pick term that doesn't exist:
+ t = getNonExistTerm(validTerms);
+ termState = null;
+ if (VERBOSE) {
+ System.out.println("\nTEST: invalid term=" + t.utf8ToString());
+ }
+ loc = Arrays.binarySearch(validTerms, t);
+ } else if (termStates.size() != 0 && random.nextInt(4) == 1) {
+ final TermAndState ts = termStates.get(random.nextInt(termStates.size()));
+ t = ts.term;
+ loc = Arrays.binarySearch(validTerms, t);
+ assertTrue(loc >= 0);
+ termState = ts.state;
+ if (VERBOSE) {
+ System.out.println("\nTEST: valid termState term=" + t.utf8ToString());
+ }
+ } else {
+ // pick valid term
+ loc = random.nextInt(validTerms.length);
+ t = new BytesRef(validTerms[loc]);
+ termState = null;
+ if (VERBOSE) {
+ System.out.println("\nTEST: valid term=" + t.utf8ToString());
+ }
+ }
+
+ // nocommit -- add some .termState() / seekExact(termState)
+
+ // seekCeil or seekExact:
+ final boolean doSeekExact = random.nextBoolean();
+ if (termState != null) {
+ if (VERBOSE) {
+ System.out.println(" seekExact termState");
+ }
+ te.seekExact(t, termState);
+ } else if (doSeekExact) {
+ if (VERBOSE) {
+ System.out.println(" seekExact");
+ }
+ assertEquals(loc >= 0, te.seekExact(t, random.nextBoolean()));
+ } else {
+ if (VERBOSE) {
+ System.out.println(" seekCeil");
+ }
+
+ final TermsEnum.SeekStatus result = te.seekCeil(t, random.nextBoolean());
+ if (VERBOSE) {
+ System.out.println(" got " + result);
+ }
+
+ if (loc >= 0) {
+ assertEquals(TermsEnum.SeekStatus.FOUND, result);
+ } else if (loc == END_LOC) {
+ assertEquals(TermsEnum.SeekStatus.END, result);
+ } else {
+ assert loc >= -validTerms.length;
+ assertEquals(TermsEnum.SeekStatus.NOT_FOUND, result);
+ }
+ }
+
+ if (loc >= 0) {
+ assertEquals(t, te.term());
+ } else if (doSeekExact) {
+ // TermsEnum is unpositioned if seekExact returns false
+ continue;
+ } else if (loc == END_LOC) {
+ continue;
+ } else {
+ loc = -loc-1;
+ assertEquals(validTerms[loc], te.term());
+ }
+
+ // Do a bunch of next's after the seek
+ final int numNext = random.nextInt(validTerms.length);
+
+ for(int nextCount=0;nextCount<numNext;nextCount++) {
+ if (VERBOSE) {
+ System.out.println("\nTEST: next loc=" + loc + " of " + validTerms.length);
+ }
+ final BytesRef t2 = te.next();
+ loc++;
+ if (loc == validTerms.length) {
+ assertNull(t2);
+ break;
+ } else {
+ assertEquals(validTerms[loc], t2);
+ if (random.nextInt(40) == 17 && termStates.size() < 100) {
+ termStates.add(new TermAndState(validTerms[loc], te.termState()));
+ }
+ }
+ }
+ }
+ }
}
Modified: lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/search/TestWildcard.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/search/TestWildcard.java?rev=1154396&r1=1154395&r2=1154396&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/search/TestWildcard.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/search/TestWildcard.java Fri Aug 5 22:25:03 2011
@@ -135,8 +135,6 @@ public class TestWildcard
wq = new WildcardQuery(new Term("field", "*"));
assertMatches(searcher, wq, 2);
assertFalse(wq.getTermsEnum(terms) instanceof PrefixTermsEnum);
- // nocommit: what to do?
- //assertFalse(wq.getTermsEnum(terms) instanceof AutomatonTermsEnum);
searcher.close();
indexStore.close();
}
Modified: lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java?rev=1154396&r1=1154395&r2=1154396&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java Fri Aug 5 22:25:03 2011
@@ -64,7 +64,7 @@ public class TestCompiledAutomaton exten
for(BytesRef t : termBytes) {
System.out.println(" " + t.utf8ToString());
}
- System.out.println(c.utf8.toDot());
+ //System.out.println(c.utf8.toDot());
}
for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
@@ -114,6 +114,7 @@ public class TestCompiledAutomaton exten
testFloor(c, "foc", "fob");
testFloor(c, "foz", "foo");
testFloor(c, "f", null);
+ testFloor(c, "", null);
testFloor(c, "aa", null);
testFloor(c, "zzz", "goo");
}