You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2011/01/05 11:52:06 UTC
svn commit: r1055405 [1/3] - in /lucene/dev/trunk/lucene: ./
src/java/org/apache/lucene/index/ src/java/org/apache/lucene/index/codecs/
src/java/org/apache/lucene/index/codecs/sep/
src/java/org/apache/lucene/index/codecs/simpletext/ src/java/org/apache...
Author: mikemccand
Date: Wed Jan 5 10:52:04 2011
New Revision: 1055405
URL: http://svn.apache.org/viewvc?rev=1055405&view=rev
Log:
LUCENE-2843: add variable gap terms index writer/reader, using FST to hold the index
Added:
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java (with props)
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java (with props)
lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/FSTEnum.java (with props)
lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java (with props)
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/mockrandom/
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java (with props)
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentReadState.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesReader.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesWriter.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexReaderBase.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/BytesRefFSTEnum.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/IntsRefFSTEnum.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/NodeHash.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/RandomIndexWriter.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestCodecs.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestFlex.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestPerFieldCodecSupport.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/mocksep/MockSingleIntIndexOutput.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1055405&r1=1055404&r2=1055405&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Wed Jan 5 10:52:04 2011
@@ -315,6 +315,12 @@ New features
extensions within the same segment. Codecs now use their per-segment codec
ID in the file names. (Simon Willnauer)
+* LUCENE-2843: Added a new terms index impl,
+ VariableGapTermsIndexWriter/Reader, that accepts a pluggable
+ IndexTermSelector for picking which terms should be indexed in the
+ terms dict. This impl stores the indexed terms in an FST, which is
+ much more RAM efficient than FixedGapTermsIndex. (Mike McCandless)
+
Optimizations
* LUCENE-2410: ~20% speedup on exact (slop=0) PhraseQuery matching.
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=1055405&r1=1055404&r2=1055405&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java Wed Jan 5 10:52:04 2011
@@ -906,7 +906,8 @@ final class DocumentsWriter {
final static int BYTE_BLOCK_NOT_MASK = ~BYTE_BLOCK_MASK;
/* if you increase this, you must fix field cache impl for
- * getTerms/getTermsIndex requires <= 32768 */
+ * getTerms/getTermsIndex requires <= 32768. Also fix
+ * DeltaBytesWriter's TERM_EOF if necessary. */
final static int MAX_TERM_LENGTH_UTF8 = BYTE_BLOCK_SIZE-2;
/* Initial chunks size of the shared int[] blocks used to
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java?rev=1055405&r1=1055404&r2=1055405&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java Wed Jan 5 10:52:04 2011
@@ -55,7 +55,7 @@ public final class IndexWriterConfig imp
public static enum OpenMode { CREATE, APPEND, CREATE_OR_APPEND }
/** Default value is 32. Change using {@link #setTermIndexInterval(int)}. */
- public static final int DEFAULT_TERM_INDEX_INTERVAL = 32;
+ public static final int DEFAULT_TERM_INDEX_INTERVAL = 32; // TODO: this should be private to the codec, not settable here
/** Denotes a flush trigger is disabled. */
public final static int DISABLE_AUTO_FLUSH = -1;
@@ -115,7 +115,7 @@ public final class IndexWriterConfig imp
private OpenMode openMode;
private int maxFieldLength;
private Similarity similarity;
- private int termIndexInterval;
+ private int termIndexInterval; // TODO: this should be private to the codec, not settable here
private MergeScheduler mergeScheduler;
private long writeLockTimeout;
private int maxBufferedDeleteTerms;
@@ -147,7 +147,7 @@ public final class IndexWriterConfig imp
openMode = OpenMode.CREATE_OR_APPEND;
maxFieldLength = UNLIMITED_FIELD_LENGTH;
similarity = Similarity.getDefault();
- termIndexInterval = DEFAULT_TERM_INDEX_INTERVAL;
+ termIndexInterval = DEFAULT_TERM_INDEX_INTERVAL; // TODO: this should be private to the codec, not settable here
mergeScheduler = new ConcurrentMergeScheduler();
writeLockTimeout = WRITE_LOCK_TIMEOUT;
maxBufferedDeleteTerms = DEFAULT_MAX_BUFFERED_DELETE_TERMS;
@@ -312,7 +312,7 @@ public final class IndexWriterConfig imp
*
* @see #DEFAULT_TERM_INDEX_INTERVAL
*/
- public IndexWriterConfig setTermIndexInterval(int interval) {
+ public IndexWriterConfig setTermIndexInterval(int interval) { // TODO: this should be private to the codec, not settable here
this.termIndexInterval = interval;
return this;
}
@@ -322,7 +322,7 @@ public final class IndexWriterConfig imp
*
* @see #setTermIndexInterval(int)
*/
- public int getTermIndexInterval() {
+ public int getTermIndexInterval() { // TODO: this should be private to the codec, not settable here
return termIndexInterval;
}
@@ -613,7 +613,7 @@ public final class IndexWriterConfig imp
sb.append("openMode=").append(openMode).append("\n");
sb.append("maxFieldLength=").append(maxFieldLength).append("\n");
sb.append("similarity=").append(similarity.getClass().getName()).append("\n");
- sb.append("termIndexInterval=").append(termIndexInterval).append("\n");
+ sb.append("termIndexInterval=").append(termIndexInterval).append("\n"); // TODO: this should be private to the codec, not settable here
sb.append("mergeScheduler=").append(mergeScheduler.getClass().getName()).append("\n");
sb.append("default WRITE_LOCK_TIMEOUT=").append(WRITE_LOCK_TIMEOUT).append("\n");
sb.append("writeLockTimeout=").append(writeLockTimeout).append("\n");
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentReadState.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentReadState.java?rev=1055405&r1=1055404&r2=1055405&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentReadState.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentReadState.java Wed Jan 5 10:52:04 2011
@@ -33,7 +33,7 @@ public class SegmentReadState {
// terms index on init (preflex is the only once currently
// that must do so), then it should negate this value to
// get the app's terms divisor:
- public final int termsIndexDivisor;
+ public int termsIndexDivisor;
public final String codecId;
public SegmentReadState(Directory dir, SegmentInfo info,
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java?rev=1055405&r1=1055404&r2=1055405&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java Wed Jan 5 10:52:04 2011
@@ -43,7 +43,7 @@ public class SegmentWriteState {
* faster, while larger values use less memory and make searching slightly
* slower. Searching is typically not dominated by dictionary lookup, so
* tweaking this is rarely useful.*/
- public final int termIndexInterval;
+ public int termIndexInterval; // TODO: this should be private to the codec, not settable here or in IWC
/** Expert: The fraction of TermDocs entries stored in skip tables,
* used to accelerate {@link DocsEnum#advance(int)}. Larger values result in
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesReader.java?rev=1055405&r1=1055404&r2=1055405&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesReader.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesReader.java Wed Jan 5 10:52:04 2011
@@ -36,13 +36,17 @@ final class DeltaBytesReader {
term.copy(text);
}
- void read() throws IOException {
+ boolean read() throws IOException {
final int start = in.readVInt();
+ if (start == DeltaBytesWriter.TERM_EOF) {
+ return false;
+ }
final int suffix = in.readVInt();
assert start <= term.length: "start=" + start + " length=" + term.length;
final int newLength = start+suffix;
term.grow(newLength);
in.readBytes(term.bytes, start, suffix);
term.length = newLength;
+ return true;
}
}
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesWriter.java?rev=1055405&r1=1055404&r2=1055405&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesWriter.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesWriter.java Wed Jan 5 10:52:04 2011
@@ -20,11 +20,18 @@ package org.apache.lucene.index.codecs;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
+import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
import java.io.IOException;
final class DeltaBytesWriter {
+ // Must be bigger than
+ // DocumentsWriter.MAX_TERM_LENGTH_UTF8. If you change
+ // this it's an index format change, so that change must be
+ // versioned:
+ final static int TERM_EOF = BYTE_BLOCK_SIZE;
+
private byte[] lastBytes = new byte[10];
private int lastLength;
final IndexOutput out;
@@ -45,8 +52,9 @@ final class DeltaBytesWriter {
final int limit = length < lastLength ? length : lastLength;
while(start < limit) {
- if (bytes[upto] != lastBytes[start])
+ if (bytes[upto] != lastBytes[start]) {
break;
+ }
start++;
upto++;
}
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java?rev=1055405&r1=1055404&r2=1055405&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java Wed Jan 5 10:52:04 2011
@@ -33,29 +33,6 @@ import java.util.Collection;
import java.util.Comparator;
import java.io.IOException;
-/**
- * Uses a simplistic format to record terms dict index
- * information. Limititations:
- *
- * - Index for all fields is loaded entirely into RAM up
- * front
- * - Index is stored in RAM using shared byte[] that
- * wastefully expand every term. Using FST to share
- * common prefix & suffix would save RAM.
- * - Index is taken at regular numTerms (every 128 by
- * default); might be better to do it by "net docFreqs"
- * encountered, so that for spans of low-freq terms we
- * take index less often.
- *
- * A better approach might be something similar to how
- * postings are encoded, w/ multi-level skips. Ie, load all
- * terms index data into memory, as a single large compactly
- * encoded stream (eg delta bytes + delta offset). Index
- * that w/ multi-level skipper. Then to look up a term is
- * the equivalent binary search, using the skipper instead,
- * while data remains compressed in memory.
- */
-
import org.apache.lucene.index.IndexFileNames;
/** @lucene.experimental */
@@ -74,7 +51,7 @@ public class FixedGapTermsIndexReader ex
final private int indexInterval;
// Closed if indexLoaded is true:
- final private IndexInput in;
+ private IndexInput in;
private volatile boolean indexLoaded;
private final Comparator<BytesRef> termComp;
@@ -85,7 +62,7 @@ public class FixedGapTermsIndexReader ex
private final PagedBytes termBytes = new PagedBytes(PAGED_BYTES_BITS);
private PagedBytes.Reader termBytesReader;
- final HashMap<FieldInfo,FieldIndexReader> fields = new HashMap<FieldInfo,FieldIndexReader>();
+ final HashMap<FieldInfo,FieldIndexData> fields = new HashMap<FieldInfo,FieldIndexData>();
// start of the field info data
protected long dirOffset;
@@ -95,7 +72,7 @@ public class FixedGapTermsIndexReader ex
this.termComp = termComp;
- IndexInput in = dir.openInput(IndexFileNames.segmentFileName(segment, codecId, FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION));
+ in = dir.openInput(IndexFileNames.segmentFileName(segment, codecId, FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION));
boolean success = false;
@@ -116,49 +93,137 @@ public class FixedGapTermsIndexReader ex
seekDir(in, dirOffset);
// Read directory
- final int numFields = in.readInt();
-
+ final int numFields = in.readVInt();
for(int i=0;i<numFields;i++) {
- final int field = in.readInt();
- final int numIndexTerms = in.readInt();
- final long termsStart = in.readLong();
- final long indexStart = in.readLong();
- final long packedIndexStart = in.readLong();
- final long packedOffsetsStart = in.readLong();
+ final int field = in.readVInt();
+ final int numIndexTerms = in.readVInt();
+ final long termsStart = in.readVLong();
+ final long indexStart = in.readVLong();
+ final long packedIndexStart = in.readVLong();
+ final long packedOffsetsStart = in.readVLong();
assert packedIndexStart >= indexStart: "packedStart=" + packedIndexStart + " indexStart=" + indexStart + " numIndexTerms=" + numIndexTerms + " seg=" + segment;
- if (numIndexTerms > 0) {
- final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
- fields.put(fieldInfo, new FieldIndexReader(in, fieldInfo, numIndexTerms, indexStart, termsStart, packedIndexStart, packedOffsetsStart));
- }
+ final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+ fields.put(fieldInfo, new FieldIndexData(fieldInfo, numIndexTerms, indexStart, termsStart, packedIndexStart, packedOffsetsStart));
}
success = true;
} finally {
if (indexDivisor > 0) {
in.close();
- this.in = null;
+ in = null;
if (success) {
indexLoaded = true;
}
termBytesReader = termBytes.freeze(true);
- } else {
- this.in = in;
}
}
}
+ @Override
+ public int getDivisor() {
+ return indexDivisor;
+ }
+
protected void readHeader(IndexInput input) throws IOException {
CodecUtil.checkHeader(input, FixedGapTermsIndexWriter.CODEC_NAME,
FixedGapTermsIndexWriter.VERSION_START, FixedGapTermsIndexWriter.VERSION_START);
dirOffset = input.readLong();
}
- private final class FieldIndexReader extends FieldReader {
+ private class IndexEnum extends FieldIndexEnum {
+ private final FieldIndexData.CoreFieldIndex fieldIndex;
+ private final BytesRef term = new BytesRef();
+ private final BytesRef nextTerm = new BytesRef();
+ private long ord;
- final private FieldInfo fieldInfo;
+ public IndexEnum(FieldIndexData.CoreFieldIndex fieldIndex) {
+ this.fieldIndex = fieldIndex;
+ }
+
+ @Override
+ public BytesRef term() {
+ return term;
+ }
+
+ @Override
+ public long seek(BytesRef target) {
+ int lo = 0; // binary search
+ int hi = fieldIndex.numIndexTerms - 1;
+ assert totalIndexInterval > 0 : "totalIndexInterval=" + totalIndexInterval;
+
+ while (hi >= lo) {
+ int mid = (lo + hi) >>> 1;
+
+ final long offset = fieldIndex.termOffsets.get(mid);
+ final int length = (int) (fieldIndex.termOffsets.get(1+mid) - offset);
+ termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
+
+ int delta = termComp.compare(target, term);
+ if (delta < 0) {
+ hi = mid - 1;
+ } else if (delta > 0) {
+ lo = mid + 1;
+ } else {
+ assert mid >= 0;
+ ord = mid*totalIndexInterval;
+ return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(mid);
+ }
+ }
- private volatile CoreFieldIndex coreIndex;
+ if (hi < 0) {
+ assert hi == -1;
+ hi = 0;
+ }
- private final IndexInput in;
+ final long offset = fieldIndex.termOffsets.get(hi);
+ final int length = (int) (fieldIndex.termOffsets.get(1+hi) - offset);
+ termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
+
+ ord = hi*totalIndexInterval;
+ return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(hi);
+ }
+
+ @Override
+ public long next() {
+ final int idx = 1 + (int) (ord / totalIndexInterval);
+ if (idx >= fieldIndex.numIndexTerms) {
+ return -1;
+ }
+ ord += totalIndexInterval;
+
+ final long offset = fieldIndex.termOffsets.get(idx);
+ final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
+ termBytesReader.fillSlice(nextTerm, fieldIndex.termBytesStart + offset, length);
+ return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
+ }
+
+ @Override
+ public long ord() {
+ return ord;
+ }
+
+ @Override
+ public long seek(long ord) {
+ int idx = (int) (ord / totalIndexInterval);
+ // caller must ensure ord is in bounds
+ assert idx < fieldIndex.numIndexTerms;
+ final long offset = fieldIndex.termOffsets.get(idx);
+ final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
+ termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
+ this.ord = idx * totalIndexInterval;
+ return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
+ }
+ }
+
+ @Override
+ public boolean supportsOrd() {
+ return true;
+ }
+
+ private final class FieldIndexData {
+
+ final private FieldInfo fieldInfo;
+
+ volatile CoreFieldIndex coreIndex;
private final long indexStart;
private final long termsStart;
@@ -167,11 +232,10 @@ public class FixedGapTermsIndexReader ex
private final int numIndexTerms;
- public FieldIndexReader(IndexInput in, FieldInfo fieldInfo, int numIndexTerms, long indexStart, long termsStart, long packedIndexStart,
- long packedOffsetsStart) throws IOException {
+ public FieldIndexData(FieldInfo fieldInfo, int numIndexTerms, long indexStart, long termsStart, long packedIndexStart,
+ long packedOffsetsStart) throws IOException {
this.fieldInfo = fieldInfo;
- this.in = in;
this.termsStart = termsStart;
this.indexStart = indexStart;
this.packedIndexStart = packedIndexStart;
@@ -182,12 +246,7 @@ public class FixedGapTermsIndexReader ex
// is -1, so that PrefixCodedTermsReader can call
// isIndexTerm for each field:
if (indexDivisor > 0) {
- coreIndex = new CoreFieldIndex(indexStart,
- termsStart,
- packedIndexStart,
- packedOffsetsStart,
- numIndexTerms);
-
+ loadTermsIndex();
}
}
@@ -197,46 +256,11 @@ public class FixedGapTermsIndexReader ex
}
}
- @Override
- public boolean isIndexTerm(long ord, int docFreq, boolean onlyLoaded) {
- if (onlyLoaded) {
- return ord % totalIndexInterval == 0;
- } else {
- return ord % indexInterval == 0;
- }
- }
-
- @Override
- public boolean nextIndexTerm(long ord, TermsIndexResult result) throws IOException {
- if (coreIndex == null) {
- throw new IllegalStateException("terms index was not loaded");
- } else {
- return coreIndex.nextIndexTerm(ord, result);
- }
- }
-
- @Override
- public void getIndexOffset(BytesRef term, TermsIndexResult result) throws IOException {
- // You must call loadTermsIndex if you had specified -1 for indexDivisor
- if (coreIndex == null) {
- throw new IllegalStateException("terms index was not loaded");
- }
- coreIndex.getIndexOffset(term, result);
- }
-
- @Override
- public void getIndexOffset(long ord, TermsIndexResult result) throws IOException {
- // You must call loadTermsIndex if you had specified
- // indexDivisor < 0 to ctor
- if (coreIndex == null) {
- throw new IllegalStateException("terms index was not loaded");
- }
- coreIndex.getIndexOffset(ord, result);
- }
-
private final class CoreFieldIndex {
- final private long termBytesStart;
+ // where this field's terms begin in the packed byte[]
+ // data
+ final long termBytesStart;
// offset into index termBytes
final PackedInts.Reader termOffsets;
@@ -245,7 +269,6 @@ public class FixedGapTermsIndexReader ex
final PackedInts.Reader termsDictOffsets;
final int numIndexTerms;
-
final long termsStart;
public CoreFieldIndex(long indexStart, long termsStart, long packedIndexStart, long packedOffsetsStart, int numIndexTerms) throws IOException {
@@ -315,7 +338,6 @@ public class FixedGapTermsIndexReader ex
termsDictOffsetsM.set(upto, termsDictOffsetsIter.next());
termOffsetsM.set(upto, termOffsetUpto);
- upto++;
long termOffset = termOffsetsIter.next();
long nextTermOffset = termOffsetsIter.next();
@@ -328,6 +350,11 @@ public class FixedGapTermsIndexReader ex
termBytes.copy(clone, numTermBytes);
termOffsetUpto += numTermBytes;
+ upto++;
+ if (upto == this.numIndexTerms) {
+ break;
+ }
+
// skip terms:
termsDictOffsetsIter.next();
for(int i=0;i<indexDivisor-2;i++) {
@@ -344,71 +371,10 @@ public class FixedGapTermsIndexReader ex
}
}
}
-
- public boolean nextIndexTerm(long ord, TermsIndexResult result) throws IOException {
- int idx = 1 + (int) (ord / totalIndexInterval);
- if (idx < numIndexTerms) {
- fillResult(idx, result);
- return true;
- } else {
- return false;
- }
- }
-
- private void fillResult(int idx, TermsIndexResult result) {
- final long offset = termOffsets.get(idx);
- final int length = (int) (termOffsets.get(1+idx) - offset);
- termBytesReader.fillSlice(result.term, termBytesStart + offset, length);
- result.position = idx * totalIndexInterval;
- result.offset = termsStart + termsDictOffsets.get(idx);
- }
-
- public void getIndexOffset(BytesRef term, TermsIndexResult result) throws IOException {
- int lo = 0; // binary search
- int hi = numIndexTerms - 1;
- assert totalIndexInterval > 0 : "totalIndexInterval=" + totalIndexInterval;
-
- while (hi >= lo) {
- int mid = (lo + hi) >>> 1;
-
- final long offset = termOffsets.get(mid);
- final int length = (int) (termOffsets.get(1+mid) - offset);
- termBytesReader.fillSlice(result.term, termBytesStart + offset, length);
-
- int delta = termComp.compare(term, result.term);
- if (delta < 0) {
- hi = mid - 1;
- } else if (delta > 0) {
- lo = mid + 1;
- } else {
- assert mid >= 0;
- result.position = mid*totalIndexInterval;
- result.offset = termsStart + termsDictOffsets.get(mid);
- return;
- }
- }
- if (hi < 0) {
- assert hi == -1;
- hi = 0;
- }
-
- final long offset = termOffsets.get(hi);
- final int length = (int) (termOffsets.get(1+hi) - offset);
- termBytesReader.fillSlice(result.term, termBytesStart + offset, length);
-
- result.position = hi*totalIndexInterval;
- result.offset = termsStart + termsDictOffsets.get(hi);
- }
-
- public void getIndexOffset(long ord, TermsIndexResult result) throws IOException {
- int idx = (int) (ord / totalIndexInterval);
- // caller must ensure ord is in bounds
- assert idx < numIndexTerms;
- fillResult(idx, result);
- }
}
}
+ // Externally synced in IndexWriter
@Override
public void loadTermsIndex(int indexDivisor) throws IOException {
if (!indexLoaded) {
@@ -420,7 +386,7 @@ public class FixedGapTermsIndexReader ex
}
this.totalIndexInterval = indexInterval * this.indexDivisor;
- Iterator<FieldIndexReader> it = fields.values().iterator();
+ Iterator<FieldIndexData> it = fields.values().iterator();
while(it.hasNext()) {
it.next().loadTermsIndex();
}
@@ -432,8 +398,13 @@ public class FixedGapTermsIndexReader ex
}
@Override
- public FieldReader getField(FieldInfo fieldInfo) {
- return fields.get(fieldInfo);
+ public FieldIndexEnum getFieldEnum(FieldInfo fieldInfo) {
+ final FieldIndexData fieldData = fields.get(fieldInfo);
+ if (fieldData.coreIndex == null) {
+ return null;
+ } else {
+ return new IndexEnum(fieldData.coreIndex);
+ }
}
public static void files(Directory dir, SegmentInfo info, String id, Collection<String> files) {
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java?rev=1055405&r1=1055404&r2=1055405&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java Wed Jan 5 10:52:04 2011
@@ -31,7 +31,14 @@ import java.util.List;
import java.util.ArrayList;
import java.io.IOException;
-/** @lucene.experimental */
+/**
+ * Selects every Nth term as an index term, and holds term
+ * bytes fully expanded in memory. This terms index
+ * supports seeking by ord. See {@link
+ * VariableGapTermsIndexWriter} for a more memory efficient
+ * terms index that does not support seeking by ord.
+ *
+ * @lucene.experimental */
public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
protected final IndexOutput out;
@@ -203,15 +210,25 @@ public class FixedGapTermsIndexWriter ex
final long dirStart = out.getFilePointer();
final int fieldCount = fields.size();
- out.writeInt(fieldCount);
+ int nonNullFieldCount = 0;
for(int i=0;i<fieldCount;i++) {
SimpleFieldWriter field = fields.get(i);
- out.writeInt(field.fieldInfo.number);
- out.writeInt(field.numIndexTerms);
- out.writeLong(field.termsStart);
- out.writeLong(field.indexStart);
- out.writeLong(field.packedIndexStart);
- out.writeLong(field.packedOffsetsStart);
+ if (field.numIndexTerms > 0) {
+ nonNullFieldCount++;
+ }
+ }
+
+ out.writeVInt(nonNullFieldCount);
+ for(int i=0;i<fieldCount;i++) {
+ SimpleFieldWriter field = fields.get(i);
+ if (field.numIndexTerms > 0) {
+ out.writeVInt(field.fieldInfo.number);
+ out.writeVInt(field.numIndexTerms);
+ out.writeVLong(field.termsStart);
+ out.writeVLong(field.indexStart);
+ out.writeVLong(field.packedIndexStart);
+ out.writeVLong(field.packedOffsetsStart);
+ }
}
writeTrailer(dirStart);
out.close();
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java?rev=1055405&r1=1055404&r2=1055405&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java Wed Jan 5 10:52:04 2011
@@ -140,12 +140,10 @@ public class PrefixCodedTermsReader exte
final long numTerms = in.readLong();
assert numTerms >= 0;
final long termsStartPointer = in.readLong();
- final TermsIndexReaderBase.FieldReader fieldIndexReader;
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
- fieldIndexReader = indexReader.getField(fieldInfo);
if (numTerms > 0) {
assert !fields.containsKey(fieldInfo.name);
- fields.put(fieldInfo.name, new FieldReader(fieldIndexReader, fieldInfo, numTerms, termsStartPointer));
+ fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer));
}
}
success = true;
@@ -251,14 +249,12 @@ public class PrefixCodedTermsReader exte
final long numTerms;
final FieldInfo fieldInfo;
final long termsStartPointer;
- final TermsIndexReaderBase.FieldReader fieldIndexReader;
- FieldReader(TermsIndexReaderBase.FieldReader fieldIndexReader, FieldInfo fieldInfo, long numTerms, long termsStartPointer) {
+ FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer) {
assert numTerms > 0;
this.fieldInfo = fieldInfo;
this.numTerms = numTerms;
this.termsStartPointer = termsStartPointer;
- this.fieldIndexReader = fieldIndexReader;
}
@Override
@@ -281,18 +277,25 @@ public class PrefixCodedTermsReader exte
return numTerms;
}
- // Iterates through terms in this field
+ // Iterates through terms in this field, not supporting ord()
private class SegmentTermsEnum extends TermsEnum {
private final IndexInput in;
private final DeltaBytesReader bytesReader;
private final TermState state;
private boolean seekPending;
- private final TermsIndexReaderBase.TermsIndexResult indexResult = new TermsIndexReaderBase.TermsIndexResult();
private final FieldAndTerm fieldTerm = new FieldAndTerm();
+ private final TermsIndexReaderBase.FieldIndexEnum indexEnum;
+ private boolean positioned;
+ private boolean didIndexNext;
+ private BytesRef nextIndexTerm;
+ private boolean isIndexTerm;
+ private final boolean doOrd;
SegmentTermsEnum() throws IOException {
in = (IndexInput) PrefixCodedTermsReader.this.in.clone();
in.seek(termsStartPointer);
+ indexEnum = indexReader.getFieldEnum(fieldInfo);
+ doOrd = indexReader.supportsOrd();
bytesReader = new DeltaBytesReader(in);
fieldTerm.field = fieldInfo.name;
state = postingsReader.newTermState();
@@ -312,12 +315,41 @@ public class PrefixCodedTermsReader exte
stateCopy);
}
+ // called only from assert
+ private boolean first;
+ private int indexTermCount;
+
+ private boolean startSeek() {
+ first = true;
+ indexTermCount = 0;
+ return true;
+ }
+
+ private boolean checkSeekScan() {
+ if (!first && isIndexTerm) {
+ indexTermCount++;
+ if (indexTermCount >= indexReader.getDivisor()) {
+ //System.out.println("now fail count=" + indexTermCount);
+ return false;
+ }
+ }
+ first = false;
+ return true;
+ }
+
/** Seeks until the first term that's >= the provided
* text; returns SeekStatus.FOUND if the exact term
* is found, SeekStatus.NOT_FOUND if a different term
* was found, SeekStatus.END if we hit EOF */
@Override
public SeekStatus seek(BytesRef term, boolean useCache) throws IOException {
+
+ if (indexEnum == null) {
+ throw new IllegalStateException("terms index was not loaded");
+ }
+
+ //System.out.println("te.seek term=" + fieldInfo.name + ":" + term.utf8ToString() + " current=" + term().utf8ToString() + " useCache=" + useCache + " this=" + this);
+
// Check cache
fieldTerm.term = term;
TermState cachedState;
@@ -326,7 +358,9 @@ public class PrefixCodedTermsReader exte
if (cachedState != null) {
state.copy(cachedState);
seekPending = true;
+ positioned = false;
bytesReader.term.copy(term);
+ //System.out.println(" cached!");
return SeekStatus.FOUND;
}
} else {
@@ -335,36 +369,54 @@ public class PrefixCodedTermsReader exte
boolean doSeek = true;
- if (state.ord != -1) {
- // we are positioned
+ if (positioned) {
final int cmp = termComp.compare(bytesReader.term, term);
if (cmp == 0) {
// already at the requested term
return SeekStatus.FOUND;
- }
+ } else if (cmp < 0) {
+
+ if (seekPending) {
+ seekPending = false;
+ in.seek(state.filePointer);
+ indexEnum.seek(bytesReader.term);
+ didIndexNext = false;
+ }
+
+ // Target term is after current term
+ if (!didIndexNext) {
+ if (indexEnum.next() == -1) {
+ nextIndexTerm = null;
+ } else {
+ nextIndexTerm = indexEnum.term();
+ }
+ //System.out.println(" now do index next() nextIndexTerm=" + (nextIndexTerm == null ? "null" : nextIndexTerm.utf8ToString()));
+ didIndexNext = true;
+ }
- if (cmp < 0 &&
- fieldIndexReader.nextIndexTerm(state.ord, indexResult) &&
- termComp.compare(indexResult.term, term) > 0) {
- // Optimization: requested term is within the
- // same index block we are now in; skip seeking
- // (but do scanning):
- doSeek = false;
+ if (nextIndexTerm == null || termComp.compare(term, nextIndexTerm) < 0) {
+ // Optimization: requested term is within the
+ // same index block we are now in; skip seeking
+ // (but do scanning):
+ doSeek = false;
+ //System.out.println(" skip seek: nextIndexTerm=" + nextIndexTerm);
+ }
}
}
- // Used only for assert:
- final long startOrd;
-
if (doSeek) {
- // As index to find biggest index term that's <=
- // our text:
- fieldIndexReader.getIndexOffset(term, indexResult);
+ positioned = true;
- in.seek(indexResult.offset);
+ // Ask terms index to find biggest index term that's <=
+ // our text:
+ in.seek(indexEnum.seek(term));
+ didIndexNext = false;
+ if (doOrd) {
+ state.ord = indexEnum.ord()-1;
+ }
seekPending = false;
// NOTE: the first next() after an index seek is
@@ -373,22 +425,20 @@ public class PrefixCodedTermsReader exte
// those bytes in the primary file, but then when
// scanning over an index term we'd have to
// special case it:
- bytesReader.reset(indexResult.term);
-
- state.ord = indexResult.position-1;
- assert state.ord >= -1: "ord=" + state.ord + " pos=" + indexResult.position;
-
- startOrd = indexResult.position;
+ bytesReader.reset(indexEnum.term());
+ //System.out.println(" doSeek term=" + indexEnum.term().utf8ToString() + " vs target=" + term.utf8ToString());
} else {
- startOrd = -1;
+ //System.out.println(" skip seek");
}
+ assert startSeek();
+
// Now scan:
- while(next() != null) {
+ while (next() != null) {
final int cmp = termComp.compare(bytesReader.term, term);
if (cmp == 0) {
-
- if (doSeek && useCache) {
+ // Done!
+ if (useCache) {
// Store in cache
FieldAndTerm entryKey = new FieldAndTerm(fieldTerm);
cachedState = (TermState) state.clone();
@@ -396,94 +446,62 @@ public class PrefixCodedTermsReader exte
cachedState.filePointer = in.getFilePointer();
termsCache.put(entryKey, cachedState);
}
-
+
return SeekStatus.FOUND;
} else if (cmp > 0) {
return SeekStatus.NOT_FOUND;
}
+
// The purpose of the terms dict index is to seek
// the enum to the closest index term before the
// term we are looking for. So, we should never
// cross another index term (besides the first
// one) while we are scanning:
- assert state.ord == startOrd || !fieldIndexReader.isIndexTerm(state.ord, state.docFreq, true): "state.ord=" + state.ord + " startOrd=" + startOrd + " ir.isIndexTerm=" + fieldIndexReader.isIndexTerm(state.ord, state.docFreq, true) + " state.docFreq=" + state.docFreq;
+ assert checkSeekScan();
}
+ positioned = false;
return SeekStatus.END;
}
@Override
- public SeekStatus seek(long ord) throws IOException {
-
- // TODO: should we cache term lookup by ord as well...?
-
- if (ord >= numTerms) {
- state.ord = numTerms-1;
- return SeekStatus.END;
- }
-
- fieldIndexReader.getIndexOffset(ord, indexResult);
- in.seek(indexResult.offset);
- seekPending = false;
-
- // NOTE: the first next() after an index seek is
- // wasteful, since it redundantly reads the same
- // bytes into the buffer
- bytesReader.reset(indexResult.term);
-
- state.ord = indexResult.position-1;
- assert state.ord >= -1: "ord=" + state.ord;
-
- // Now, scan:
- int left = (int) (ord - state.ord);
- while(left > 0) {
- final BytesRef term = next();
- assert term != null;
- left--;
- }
-
- // always found
- return SeekStatus.FOUND;
- }
-
- @Override
public BytesRef term() {
return bytesReader.term;
}
@Override
- public long ord() {
- return state.ord;
- }
-
- @Override
public BytesRef next() throws IOException {
if (seekPending) {
seekPending = false;
in.seek(state.filePointer);
+ indexEnum.seek(bytesReader.term);
+ didIndexNext = false;
}
- if (state.ord >= numTerms-1) {
+ if (!bytesReader.read()) {
+ //System.out.println("te.next end!");
+ positioned = false;
return null;
}
- bytesReader.read();
- state.docFreq = in.readVInt();
+ final byte b = in.readByte();
+ isIndexTerm = (b & 0x80) != 0;
+
+ if ((b & 0x40) == 0) {
+ // Fast case -- docFreq fits in 6 bits
+ state.docFreq = b & 0x3F;
+ } else {
+ state.docFreq = (in.readVInt() << 6) | (b & 0x3F);
+ }
- // TODO: would be cleaner, but space-wasting, to
- // simply record a bit into each index entry as to
- // whether it's an index entry or not, rather than
- // re-compute that information... or, possibly store
- // a "how many terms until next index entry" in each
- // index entry, but that'd require some tricky
- // lookahead work when writing the index
postingsReader.readTerm(in,
fieldInfo, state,
- fieldIndexReader.isIndexTerm(1+state.ord, state.docFreq, false));
-
+ isIndexTerm);
state.ord++;
+ positioned = true;
+ //System.out.println("te.next term=" + bytesReader.term.utf8ToString());
return bytesReader.term;
}
@@ -507,6 +525,50 @@ public class PrefixCodedTermsReader exte
return postingsReader.docsAndPositions(fieldInfo, state, skipDocs, reuse);
}
}
+
+ @Override
+ public SeekStatus seek(long ord) throws IOException {
+
+ if (indexEnum == null) {
+ throw new IllegalStateException("terms index was not loaded");
+ }
+
+ if (ord >= numTerms) {
+ state.ord = numTerms-1;
+ return SeekStatus.END;
+ }
+
+ in.seek(indexEnum.seek(ord));
+ seekPending = false;
+ positioned = true;
+
+ // NOTE: the first next() after an index seek is
+ // wasteful, since it redundantly reads the same
+ // bytes into the buffer
+ bytesReader.reset(indexEnum.term());
+
+ state.ord = indexEnum.ord()-1;
+ assert state.ord >= -1: "ord=" + state.ord;
+
+ // Now, scan:
+ int left = (int) (ord - state.ord);
+ while(left > 0) {
+ final BytesRef term = next();
+ assert term != null;
+ left--;
+ }
+
+ // always found
+ return SeekStatus.FOUND;
+ }
+
+ @Override
+ public long ord() {
+ if (!doOrd) {
+ throw new UnsupportedOperationException();
+ }
+ return state.ord;
+ }
}
}
}
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java?rev=1055405&r1=1055404&r2=1055405&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java Wed Jan 5 10:52:04 2011
@@ -93,7 +93,7 @@ public class PrefixCodedTermsWriter exte
}
@Override
- public TermsConsumer addField(FieldInfo field) {
+ public TermsConsumer addField(FieldInfo field) throws IOException {
assert currentField == null || currentField.name.compareTo(field.name) < 0;
currentField = field;
TermsIndexWriterBase.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field);
@@ -173,12 +173,25 @@ public class PrefixCodedTermsWriter exte
public void finishTerm(BytesRef text, int numDocs) throws IOException {
assert numDocs > 0;
+ //System.out.println("finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " fp=" + out.getFilePointer());
final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, numDocs);
termWriter.write(text);
- out.writeVInt(numDocs);
+ final int highBit = isIndexTerm ? 0x80 : 0;
+ //System.out.println(" isIndex=" + isIndexTerm);
+ // This is like a vInt, except we steal the top bit of the
+ // first byte to record whether this was an indexed term:
+ if ((numDocs & ~0x3F) == 0) {
+ // Fast case -- docFreq fits in 6 bits
+ out.writeByte((byte) (highBit | numDocs));
+ } else {
+ // Write bottom 6 bits of docFreq, then write the
+ // remainder as vInt:
+ out.writeByte((byte) (highBit | 0x40 | (numDocs & 0x3F)));
+ out.writeVInt(numDocs >>> 6);
+ }
postingsWriter.finishTerm(numDocs, isIndexTerm);
numTerms++;
}
@@ -186,6 +199,8 @@ public class PrefixCodedTermsWriter exte
// Finishes all terms in this field
@Override
public void finish() throws IOException {
+ // EOF marker:
+ out.writeVInt(DeltaBytesWriter.TERM_EOF);
fieldIndexWriter.finish();
}
}
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexReaderBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexReaderBase.java?rev=1055405&r1=1055404&r2=1055405&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexReaderBase.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexReaderBase.java Wed Jan 5 10:52:04 2011
@@ -21,6 +21,7 @@ import org.apache.lucene.index.FieldInfo
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
+import java.io.Closeable;
import java.util.Collection;
@@ -38,39 +39,37 @@ import java.util.Collection;
* text.
* @lucene.experimental */
-public abstract class TermsIndexReaderBase {
+public abstract class TermsIndexReaderBase implements Closeable {
- static class TermsIndexResult {
- long position;
- final BytesRef term = new BytesRef();
- long offset;
- };
-
- public abstract class FieldReader {
- /** Returns position of "largest" index term that's <=
- * text. Returned TermsIndexResult may be reused
- * across calls. This resets internal state, and
- * expects that you'll then scan the file and
- * sequentially call isIndexTerm for each term
- * encountered. */
- public abstract void getIndexOffset(BytesRef term, TermsIndexResult result) throws IOException;
-
- public abstract void getIndexOffset(long ord, TermsIndexResult result) throws IOException;
-
- /** Call this sequentially for each term encountered,
- * after calling {@link #getIndexOffset}. */
- public abstract boolean isIndexTerm(long ord, int docFreq, boolean onlyLoaded) throws IOException;
-
- /** Finds the next index term, after the specified
- * ord. Returns true if one exists. */
- public abstract boolean nextIndexTerm(long ord, TermsIndexResult result) throws IOException;
- }
-
- public abstract FieldReader getField(FieldInfo fieldInfo);
+ public abstract FieldIndexEnum getFieldEnum(FieldInfo fieldInfo);
public abstract void loadTermsIndex(int indexDivisor) throws IOException;
public abstract void close() throws IOException;
public abstract void getExtensions(Collection<String> extensions);
-}
\ No newline at end of file
+
+ public abstract boolean supportsOrd();
+
+ public abstract int getDivisor();
+
+ // Similar to TermsEnum, except, the only "metadata" it
+ // reports for a given indexed term is the long fileOffset
+ // into the main terms dict (_X.tis) file:
+ public static abstract class FieldIndexEnum {
+
+ /** Seeks to "largest" indexed term that's <=
+ * term; returns the file pointer (into the main
+ * terms dict file) for that term */
+ public abstract long seek(BytesRef term) throws IOException;
+
+ /** Returns -1 at end */
+ public abstract long next() throws IOException;
+
+ public abstract BytesRef term();
+
+ // Only impl'd if supportsOrd() returns true!
+ public abstract long seek(long ord) throws IOException;
+ public abstract long ord();
+ }
+}
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java?rev=1055405&r1=1055404&r2=1055405&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java Wed Jan 5 10:52:04 2011
@@ -32,7 +32,7 @@ public abstract class TermsIndexWriterBa
public abstract void finish() throws IOException;
}
- public abstract FieldWriter addField(FieldInfo fieldInfo);
+ public abstract FieldWriter addField(FieldInfo fieldInfo) throws IOException;
public abstract void close() throws IOException;
-}
\ No newline at end of file
+}
Added: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java?rev=1055405&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java (added)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java Wed Jan 5 10:52:04 2011
@@ -0,0 +1,256 @@
+package org.apache.lucene.index.codecs;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.automaton.fst.Builder;
+import org.apache.lucene.util.automaton.fst.BytesRefFSTEnum;
+import org.apache.lucene.util.automaton.fst.FST;
+import org.apache.lucene.util.automaton.fst.PositiveIntOutputs;
+
+/** See {@link VariableGapTermsIndexWriter}
+ *
+ * @lucene.experimental */
+public class VariableGapTermsIndexReader extends TermsIndexReaderBase {
+
+ private final PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton(true);
+ private int indexDivisor;
+
+ // Closed if indexLoaded is true:
+ private IndexInput in;
+ private volatile boolean indexLoaded;
+
+ final HashMap<FieldInfo,FieldIndexData> fields = new HashMap<FieldInfo,FieldIndexData>();
+
+ // start of the field info data
+ protected long dirOffset;
+
+ public VariableGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, String codecId)
+ throws IOException {
+
+ in = dir.openInput(IndexFileNames.segmentFileName(segment, codecId, VariableGapTermsIndexWriter.TERMS_INDEX_EXTENSION));
+
+ boolean success = false;
+
+ try {
+
+ readHeader(in);
+ this.indexDivisor = indexDivisor;
+
+ seekDir(in, dirOffset);
+
+ // Read directory
+ final int numFields = in.readVInt();
+
+ for(int i=0;i<numFields;i++) {
+ final int field = in.readVInt();
+ final long indexStart = in.readVLong();
+ final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+ fields.put(fieldInfo, new FieldIndexData(fieldInfo, indexStart));
+ }
+ success = true;
+ } finally {
+ if (indexDivisor > 0) {
+ in.close();
+ in = null;
+ if (success) {
+ indexLoaded = true;
+ }
+ }
+ }
+ }
+
+ @Override
+ public int getDivisor() {
+ return indexDivisor;
+ }
+
+ protected void readHeader(IndexInput input) throws IOException {
+ CodecUtil.checkHeader(input, VariableGapTermsIndexWriter.CODEC_NAME,
+ VariableGapTermsIndexWriter.VERSION_START, VariableGapTermsIndexWriter.VERSION_START);
+ dirOffset = input.readLong();
+ }
+
+ private static class IndexEnum extends FieldIndexEnum {
+ private final BytesRefFSTEnum<Long> fstEnum;
+ private BytesRefFSTEnum.InputOutput<Long> current;
+
+ public IndexEnum(FST<Long> fst) {
+ fstEnum = new BytesRefFSTEnum<Long>(fst);
+ }
+
+ @Override
+ public BytesRef term() {
+ if (current == null) {
+ return null;
+ } else {
+ return current.input;
+ }
+ }
+
+ @Override
+ public long seek(BytesRef target) throws IOException {
+ //System.out.println("VGR: seek field=" + fieldInfo.name + " target=" + target);
+ current = fstEnum.seekFloor(target);
+ //System.out.println(" got input=" + current.input + " output=" + current.output);
+ return current.output;
+ }
+
+ @Override
+ public long next() throws IOException {
+ //System.out.println("VGR: next field=" + fieldInfo.name);
+ current = fstEnum.next();
+ if (current == null) {
+ //System.out.println(" eof");
+ return -1;
+ } else {
+ return current.output;
+ }
+ }
+
+ @Override
+ public long ord() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public long seek(long ord) {
+ throw new UnsupportedOperationException();
+ }
+ }
+
+ @Override
+ public boolean supportsOrd() {
+ return false;
+ }
+
+ private final class FieldIndexData {
+
+ private final FieldInfo fieldInfo;
+ private final long indexStart;
+
+ // Set only if terms index is loaded:
+ private volatile FST<Long> fst;
+
+ public FieldIndexData(FieldInfo fieldInfo, long indexStart) throws IOException {
+
+ this.fieldInfo = fieldInfo;
+ this.indexStart = indexStart;
+
+ // We still create this FieldIndexData when indexDivisor
+ // is -1; the FST is only read here when the divisor is
+ // positive, else later via loadTermsIndex(int):
+ if (indexDivisor > 0) {
+ loadTermsIndex();
+ }
+ }
+
+ public void loadTermsIndex() throws IOException {
+ if (fst == null) {
+ IndexInput clone = (IndexInput) in.clone();
+ clone.seek(indexStart);
+ fst = new FST<Long>(clone, fstOutputs);
+ clone.close();
+
+ if (indexDivisor > 1) {
+ // subsample
+ final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
+ final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
+ final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst);
+ BytesRefFSTEnum.InputOutput<Long> result;
+ int count = indexDivisor;
+ while((result = fstEnum.next()) != null) {
+ if (count == indexDivisor) {
+ builder.add(result.input, result.output);
+ count = 0;
+ }
+ count++;
+ }
+ fst = builder.finish();
+ }
+ }
+ }
+ }
+
+ // Externally synced in IndexWriter
+ @Override
+ public void loadTermsIndex(int indexDivisor) throws IOException {
+ if (!indexLoaded) {
+
+ if (indexDivisor < 0) {
+ this.indexDivisor = -indexDivisor;
+ } else {
+ this.indexDivisor = indexDivisor;
+ }
+
+ Iterator<FieldIndexData> it = fields.values().iterator();
+ while(it.hasNext()) {
+ it.next().loadTermsIndex();
+ }
+
+ indexLoaded = true;
+ in.close();
+ }
+ }
+
+ @Override
+ public FieldIndexEnum getFieldEnum(FieldInfo fieldInfo) {
+ final FieldIndexData fieldData = fields.get(fieldInfo);
+ if (fieldData.fst == null) {
+ return null;
+ } else {
+ return new IndexEnum(fieldData.fst);
+ }
+ }
+
+ public static void files(Directory dir, SegmentInfo info, String id, Collection<String> files) {
+ files.add(IndexFileNames.segmentFileName(info.name, id, VariableGapTermsIndexWriter.TERMS_INDEX_EXTENSION));
+ }
+
+ public static void getIndexExtensions(Collection<String> extensions) {
+ extensions.add(VariableGapTermsIndexWriter.TERMS_INDEX_EXTENSION);
+ }
+
+ @Override
+ public void getExtensions(Collection<String> extensions) {
+ getIndexExtensions(extensions);
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (in != null && !indexLoaded) {
+ in.close();
+ }
+ }
+
+ protected void seekDir(IndexInput input, long dirOffset) throws IOException {
+ input.seek(dirOffset);
+ }
+}
Added: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java?rev=1055405&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java (added)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java Wed Jan 5 10:52:04 2011
@@ -0,0 +1,276 @@
+package org.apache.lucene.index.codecs;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.automaton.fst.Builder;
+import org.apache.lucene.util.automaton.fst.FST;
+import org.apache.lucene.util.automaton.fst.PositiveIntOutputs;
+
+/**
+ * Selects index terms according to provided pluggable
+ * IndexTermSelector, and stores them in a prefix trie that's
+ * loaded entirely in RAM, stored as an FST. This terms
+ * index only supports unsigned byte term sort order
+ * (unicode codepoint order when the bytes are UTF8).
+ *
+ * @lucene.experimental */
+public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
+ protected final IndexOutput out;
+
+ /** Extension of terms index file */
+ static final String TERMS_INDEX_EXTENSION = "tiv";
+
+ final static String CODEC_NAME = "VARIABLE_GAP_TERMS_INDEX";
+ final static int VERSION_START = 0;
+ final static int VERSION_CURRENT = VERSION_START;
+
+ private final List<FSTFieldWriter> fields = new ArrayList<FSTFieldWriter>();
+ private final FieldInfos fieldInfos; // unread
+ private IndexOutput termsOut;
+ private final IndexTermSelector policy;
+
+ /** @lucene.experimental */
+ public static abstract class IndexTermSelector {
+ // Called sequentially on every term being written,
+ // returning true if this term should be indexed
+ public abstract boolean isIndexTerm(BytesRef term, int docFreq);
+ }
+
+ /** Same policy as {@link FixedGapTermsIndexWriter} */
+ public static final class EveryNTermSelector extends IndexTermSelector {
+ private int count;
+ private final int interval;
+
+ public EveryNTermSelector(int interval) {
+ this.interval = interval;
+ // First term is first indexed term:
+ count = interval;
+ }
+
+ @Override
+ public boolean isIndexTerm(BytesRef term, int docFreq) {
+ if (count >= interval) {
+ count = 0;
+ return true;
+ } else {
+ count++;
+ return false;
+ }
+ }
+ }
+
+ /** Sets an index term when docFreq >= docFreqThresh, or
+ * every interval terms. This should reduce seek time
+ * to high docFreq terms. */
+ public static final class EveryNOrDocFreqTermSelector extends IndexTermSelector {
+ private int count;
+ private final int docFreqThresh;
+ private final int interval;
+
+ public EveryNOrDocFreqTermSelector(int docFreqThresh, int interval) {
+ this.interval = interval;
+ this.docFreqThresh = docFreqThresh;
+ }
+
+ @Override
+ public boolean isIndexTerm(BytesRef term, int docFreq) {
+ if (docFreq >= docFreqThresh || count >= interval) {
+ count = 0;
+ return true;
+ } else {
+ count++;
+ return false;
+ }
+ }
+ }
+
+ // TODO: it'd be nice to let the FST builder prune based
+ // on term count of each node (the prune1/prune2 that it
+ // accepts), and build the index based on that. This
+ // should result in a more compact terms index, more like
+ // a prefix trie than the other selectors, because it
+ // only stores enough leading bytes to get down to N
+ // terms that may complete that prefix. It becomes
+ // "deeper" when terms are dense, and "shallow" when they
+ // are less dense.
+ //
+ // However, it's not easy to make that work with this
+ // API, because that pruning doesn't immediately know on
+ // seeing each term whether that term will be a seek point
+ // or not. It requires some non-causality in the API, ie
+ // only on seeing some number of future terms will the
+ // builder decide which past terms are seek points.
+ // Somehow the API'd need to be able to return a "I don't
+ // know" value, eg like a Future, which only later on is
+ // flipped (frozen) to true or false.
+ //
+ // We could solve this with a 2-pass approach, where the
+ // first pass would build an FSA (no outputs) solely to
+ // determine which prefixes are the 'leaves' in the
+ // pruning. The 2nd pass would then look at this prefix
+ // trie to mark the seek points and build the FST mapping
+ // to the true output.
+ //
+ // But, one downside to this approach is that it'd result
+ // in uneven index term selection. EG with prune1=10, the
+ // resulting index terms could be as frequent as every 10
+ // terms or as rare as every <maxArcCount> * 10 (eg 2560),
+ // in the extremes.
+
+ public VariableGapTermsIndexWriter(SegmentWriteState state, IndexTermSelector policy) throws IOException {
+ final String indexFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, TERMS_INDEX_EXTENSION);
+ state.flushedFiles.add(indexFileName);
+ out = state.directory.createOutput(indexFileName);
+ fieldInfos = state.fieldInfos;
+ this.policy = policy;
+ writeHeader(out);
+ }
+
+ protected void writeHeader(IndexOutput out) throws IOException {
+ CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
+ // Placeholder for dir offset
+ out.writeLong(0);
+ }
+
+ @Override
+ public void setTermsOutput(IndexOutput termsOut) {
+ this.termsOut = termsOut;
+ }
+
+ @Override
+ public FieldWriter addField(FieldInfo field) throws IOException {
+ //System.out.println("VGW: field=" + field.name);
+ FSTFieldWriter writer = new FSTFieldWriter(field);
+ fields.add(writer);
+ return writer;
+ }
+
+ /** NOTE: if your codec does not sort in unicode code
+ * point order, you must override this method, to simply
+ * return indexedTerm.length. */
+ protected int indexedTermPrefixLength(final BytesRef priorTerm, final BytesRef indexedTerm) {
+ // As long as codec sorts terms in unicode codepoint
+ // order, we can safely strip off the non-distinguishing
+ // suffix to save RAM in the loaded terms index.
+ final int idxTermOffset = indexedTerm.offset;
+ final int priorTermOffset = priorTerm.offset;
+ final int limit = Math.min(priorTerm.length, indexedTerm.length);
+ for(int byteIdx=0;byteIdx<limit;byteIdx++) {
+ if (priorTerm.bytes[priorTermOffset+byteIdx] != indexedTerm.bytes[idxTermOffset+byteIdx]) {
+ return byteIdx+1;
+ }
+ }
+ return Math.min(1+priorTerm.length, indexedTerm.length);
+ }
+
+ private class FSTFieldWriter extends FieldWriter {
+ private final Builder<Long> fstBuilder;
+ private final PositiveIntOutputs fstOutputs;
+
+ final FieldInfo fieldInfo;
+ int numIndexTerms;
+ FST<Long> fst;
+ final long indexStart;
+
+ private final BytesRef lastTerm = new BytesRef();
+ private boolean first = true;
+
+ public FSTFieldWriter(FieldInfo fieldInfo) throws IOException {
+ this.fieldInfo = fieldInfo;
+ fstOutputs = PositiveIntOutputs.getSingleton(true);
+ fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE1,
+ 0, 0, true,
+ fstOutputs);
+ indexStart = out.getFilePointer();
+ //System.out.println("VGW: field=" + fieldInfo.name);
+
+ // Always put empty string in
+ fstBuilder.add(new BytesRef(), fstOutputs.get(termsOut.getFilePointer()));
+ }
+
+ @Override
+ public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
+ if (policy.isIndexTerm(text, docFreq) || first) {
+ first = false;
+ //System.out.println("VGW: index term=" + text.utf8ToString() + " fp=" + termsOut.getFilePointer());
+ final int lengthSave = text.length;
+ text.length = indexedTermPrefixLength(lastTerm, text);
+ try {
+ fstBuilder.add(text, fstOutputs.get(termsOut.getFilePointer()));
+ } finally {
+ text.length = lengthSave;
+ }
+ lastTerm.copy(text);
+ return true;
+ } else {
+ //System.out.println("VGW: not index term=" + text.utf8ToString() + " fp=" + termsOut.getFilePointer());
+ lastTerm.copy(text);
+ return false;
+ }
+ }
+
+ @Override
+ public void finish() throws IOException {
+ fst = fstBuilder.finish();
+ if (fst != null) {
+ fst.save(out);
+ }
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ final long dirStart = out.getFilePointer();
+ final int fieldCount = fields.size();
+
+ int nonNullFieldCount = 0;
+ for(int i=0;i<fieldCount;i++) {
+ FSTFieldWriter field = fields.get(i);
+ if (field.fst != null) {
+ nonNullFieldCount++;
+ }
+ }
+
+ out.writeVInt(nonNullFieldCount);
+ for(int i=0;i<fieldCount;i++) {
+ FSTFieldWriter field = fields.get(i);
+ if (field.fst != null) {
+ out.writeVInt(field.fieldInfo.number);
+ out.writeVLong(field.indexStart);
+ }
+ }
+ writeTrailer(dirStart);
+ out.close();
+ }
+
+ protected void writeTrailer(long dirStart) throws IOException {
+ out.seek(CodecUtil.headerLength(CODEC_NAME));
+ out.writeLong(dirStart);
+ }
+}
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java?rev=1055405&r1=1055404&r2=1055405&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java Wed Jan 5 10:52:04 2011
@@ -594,6 +594,7 @@ public class SepPostingsReaderImpl exten
}
final int code = posReader.next();
+ assert code >= 0;
if (storePayloads) {
if ((code & 1) != 0) {
// Payload length has changed
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java?rev=1055405&r1=1055404&r2=1055405&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java Wed Jan 5 10:52:04 2011
@@ -211,6 +211,7 @@ public final class SepPostingsWriterImpl
assert !omitTF;
final int delta = position - lastPosition;
+ assert delta > 0 || position == 0: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it)
lastPosition = position;
if (storePayloads) {
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java?rev=1055405&r1=1055404&r2=1055405&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java Wed Jan 5 10:52:04 2011
@@ -130,9 +130,8 @@ class SimpleTextFieldsReader extends Fie
public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException {
- fstEnum.reset();
//System.out.println("seek to text=" + text.utf8ToString());
- final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.advance(text);
+ final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.seekCeil(text);
if (result == null) {
//System.out.println(" end");
return SeekStatus.END;
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java?rev=1055405&r1=1055404&r2=1055405&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java Wed Jan 5 10:52:04 2011
@@ -31,8 +31,8 @@ import org.apache.lucene.index.codecs.Po
import org.apache.lucene.index.codecs.PostingsReaderBase;
import org.apache.lucene.index.codecs.TermsIndexWriterBase;
import org.apache.lucene.index.codecs.TermsIndexReaderBase;
-import org.apache.lucene.index.codecs.FixedGapTermsIndexWriter;
-import org.apache.lucene.index.codecs.FixedGapTermsIndexReader;
+import org.apache.lucene.index.codecs.VariableGapTermsIndexWriter;
+import org.apache.lucene.index.codecs.VariableGapTermsIndexReader;
import org.apache.lucene.index.codecs.PrefixCodedTermsWriter;
import org.apache.lucene.index.codecs.PrefixCodedTermsReader;
import org.apache.lucene.store.Directory;
@@ -56,7 +56,7 @@ public class StandardCodec extends Codec
TermsIndexWriterBase indexWriter;
boolean success = false;
try {
- indexWriter = new FixedGapTermsIndexWriter(state);
+ indexWriter = new VariableGapTermsIndexWriter(state, new VariableGapTermsIndexWriter.EveryNTermSelector(state.termIndexInterval));
success = true;
} finally {
if (!success) {
@@ -89,12 +89,11 @@ public class StandardCodec extends Codec
boolean success = false;
try {
- indexReader = new FixedGapTermsIndexReader(state.dir,
- state.fieldInfos,
- state.segmentInfo.name,
- state.termsIndexDivisor,
- BytesRef.getUTF8SortedAsUnicodeComparator(),
- state.codecId);
+ indexReader = new VariableGapTermsIndexReader(state.dir,
+ state.fieldInfos,
+ state.segmentInfo.name,
+ state.termsIndexDivisor,
+ state.codecId);
success = true;
} finally {
if (!success) {
@@ -136,7 +135,7 @@ public class StandardCodec extends Codec
public void files(Directory dir, SegmentInfo segmentInfo, String id, Set<String> files) throws IOException {
StandardPostingsReader.files(dir, segmentInfo, id, files);
PrefixCodedTermsReader.files(dir, segmentInfo, id, files);
- FixedGapTermsIndexReader.files(dir, segmentInfo, id, files);
+ VariableGapTermsIndexReader.files(dir, segmentInfo, id, files);
}
@Override
@@ -148,6 +147,6 @@ public class StandardCodec extends Codec
extensions.add(FREQ_EXTENSION);
extensions.add(PROX_EXTENSION);
PrefixCodedTermsReader.getExtensions(extensions);
- FixedGapTermsIndexReader.getIndexExtensions(extensions);
+ VariableGapTermsIndexReader.getIndexExtensions(extensions);
}
}
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java?rev=1055405&r1=1055404&r2=1055405&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java Wed Jan 5 10:52:04 2011
@@ -157,7 +157,7 @@ public final class StandardPostingsWrite
final int delta = position - lastPosition;
- assert delta > 0 || position == 0 || position == -1: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it)
+ assert delta > 0 || position == 0: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it)
lastPosition = position;
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java?rev=1055405&r1=1055404&r2=1055405&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java Wed Jan 5 10:52:04 2011
@@ -256,7 +256,7 @@ public class Builder<T> {
}
public void add(IntsRef input, T output) throws IOException {
- //System.out.println("\nADD: " + input.utf8ToString());
+ //System.out.println("\nFST ADD: input=" + input + " output=" + fst.outputs.outputToString(output));
assert lastInput.length == 0 || input.compareTo(lastInput) > 0: "inputs are added out of order lastInput=" + lastInput + " vs input=" + input;
assert validOutput(output);
@@ -361,7 +361,7 @@ public class Builder<T> {
compilePrevTail(1);
//System.out.println("finish: inputCount=" + frontier[0].inputCount);
if (frontier[0].inputCount < minSuffixCount1 || frontier[0].inputCount < minSuffixCount2 || frontier[0].numArcs == 0) {
- if (fst.getEmptyOutput() == null) {
+ if (fst.emptyOutput == null) {
return null;
} else if (minSuffixCount1 > 0 || minSuffixCount2 > 0) {
// empty string got pruned