You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/02/14 18:15:47 UTC
svn commit: r1070580 [1/2] - in /lucene/dev/branches/bulkpostings/lucene/src:
java/org/apache/lucene/index/codecs/bulkvint/
java/org/apache/lucene/index/codecs/fixed/
java/org/apache/lucene/index/codecs/intblock/
java/org/apache/lucene/index/codecs/pfo...
Author: rmuir
Date: Mon Feb 14 17:15:47 2011
New Revision: 1070580
URL: http://svn.apache.org/viewvc?rev=1070580&view=rev
Log:
LUCENE-2905: Interleaved layout for fixed block codecs
Added:
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedIntStreamFactory.java (with props)
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedPostingsReaderImpl.java (with props)
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedPostingsWriterImpl.java (with props)
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedSkipListReader.java (with props)
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedSkipListWriter.java (with props)
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/InterleavedIntBlockIndexOutput.java (with props)
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/package.html (with props)
Modified:
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/bulkvint/BulkVIntCodec.java
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pfordelta/FORFactory.java
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pfordelta/FORIndexInput.java
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pfordelta/FORIndexOutput.java
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pfordelta/FrameOfRefCodec.java
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pfordelta/PForDeltaFactory.java
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pfordelta/PForDeltaIndexInput.java
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pfordelta/PForDeltaIndexOutput.java
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pfordelta/PatchedFrameOfRefCodec.java
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pfordelta2/PForDeltaFixedIntBlockCodec.java
lucene/dev/branches/bulkpostings/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java
lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/bulkvint/BulkVIntCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/bulkvint/BulkVIntCodec.java?rev=1070580&r1=1070579&r2=1070580&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/bulkvint/BulkVIntCodec.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/bulkvint/BulkVIntCodec.java Mon Feb 14 17:15:47 2011
@@ -27,11 +27,9 @@ import org.apache.lucene.index.SegmentRe
import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.FieldsProducer;
-import org.apache.lucene.index.codecs.sep.IntStreamFactory;
-import org.apache.lucene.index.codecs.sep.IntIndexInput;
-import org.apache.lucene.index.codecs.sep.IntIndexOutput;
-import org.apache.lucene.index.codecs.sep.SepPostingsReaderImpl;
-import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl;
+import org.apache.lucene.index.codecs.fixed.FixedIntStreamFactory;
+import org.apache.lucene.index.codecs.fixed.FixedPostingsReaderImpl;
+import org.apache.lucene.index.codecs.fixed.FixedPostingsWriterImpl;
import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexInput;
import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexOutput;
import org.apache.lucene.index.codecs.PostingsWriterBase;
@@ -67,21 +65,26 @@ public class BulkVIntCodec extends Codec
}
// only for testing
- public IntStreamFactory getIntFactory() {
+ public FixedIntStreamFactory getIntFactory() {
return new BulkVIntFactory();
}
- private class BulkVIntFactory extends IntStreamFactory {
+ private class BulkVIntFactory extends FixedIntStreamFactory {
@Override
- public IntIndexInput openInput(Directory dir, String fileName, int readBufferSize) throws IOException {
- return new FixedIntBlockIndexInput(dir.openInput(fileName, readBufferSize)) {
+ public FixedIntBlockIndexInput openInput(IndexInput in, String fileName, boolean isChild) throws IOException {
+ return new FixedIntBlockIndexInput(in) {
@Override
protected BlockReader getBlockReader(final IndexInput in, final int[] buffer) throws IOException {
return new BlockReader() {
final byte bytes[] = new byte[blockSize*5]; // header * max(Vint)
+ public void skipBlock() throws IOException {
+ final int numBytes = in.readVInt(); // read header
+ in.seek(in.getFilePointer() + numBytes); // seek past block
+ }
+
public void readBlock() throws IOException {
final int numBytes = in.readVInt(); // read header
if (numBytes == 0) { // 1's
@@ -110,8 +113,8 @@ public class BulkVIntCodec extends Codec
}
@Override
- public IntIndexOutput createOutput(Directory dir, String fileName) throws IOException {
- return new FixedIntBlockIndexOutput(dir.createOutput(fileName), blockSize) {
+ public FixedIntBlockIndexOutput createOutput(IndexOutput out, String fileName, boolean isChild) throws IOException {
+ return new FixedIntBlockIndexOutput(out, blockSize) {
final byte bytes[] = new byte[blockSize*5]; // header * max(Vint)
@Override
@@ -149,7 +152,7 @@ public class BulkVIntCodec extends Codec
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
- PostingsWriterBase postingsWriter = new SepPostingsWriterImpl(state, new BulkVIntFactory());
+ PostingsWriterBase postingsWriter = new FixedPostingsWriterImpl(state, new BulkVIntFactory());
boolean success = false;
TermsIndexWriterBase indexWriter;
@@ -180,7 +183,7 @@ public class BulkVIntCodec extends Codec
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
- PostingsReaderBase postingsReader = new SepPostingsReaderImpl(state.dir,
+ PostingsReaderBase postingsReader = new FixedPostingsReaderImpl(state.dir,
state.segmentInfo,
state.readBufferSize,
new BulkVIntFactory(), state.codecId);
@@ -226,14 +229,14 @@ public class BulkVIntCodec extends Codec
@Override
public void files(Directory dir, SegmentInfo segmentInfo, String codecId, Set<String> files) {
- SepPostingsReaderImpl.files(segmentInfo, codecId, files);
+ FixedPostingsReaderImpl.files(segmentInfo, codecId, files);
BlockTermsReader.files(dir, segmentInfo, codecId, files);
VariableGapTermsIndexReader.files(dir, segmentInfo, codecId, files);
}
@Override
public void getExtensions(Set<String> extensions) {
- SepPostingsWriterImpl.getExtensions(extensions);
+ FixedPostingsWriterImpl.getExtensions(extensions);
BlockTermsReader.getExtensions(extensions);
VariableGapTermsIndexReader.getIndexExtensions(extensions);
}
Added: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedIntStreamFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedIntStreamFactory.java?rev=1070580&view=auto
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedIntStreamFactory.java (added)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedIntStreamFactory.java Mon Feb 14 17:15:47 2011
@@ -0,0 +1,69 @@
+package org.apache.lucene.index.codecs.fixed;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexInput;
+import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexOutput;
+import org.apache.lucene.index.codecs.sep.IntStreamFactory;
+import org.apache.lucene.store.BufferedIndexInput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+
+/**
+ * Specialized factory for using fixed int block codecs.
+ * <p>
+ * You subclass this, and can then choose to use either the Sep or Fixed
+ * file formats.
+ * <p>
+ * The {@link Directory}-based methods overridden from {@link IntStreamFactory}
+ * are final here: each one opens or creates the named file and delegates to
+ * the corresponding abstract stream-based method with
+ * <code>isChild == false</code>. Subclasses only implement the two abstract
+ * stream-based methods.
+ */
+public abstract class FixedIntStreamFactory extends IntStreamFactory {
+
+ // nocommit?: the below three methods are dumb: they exist so your codec can easily support sep or interleaved.
+ @Override
+ public final FixedIntBlockIndexInput openInput(Directory dir, String fileName) throws IOException {
+ return openInput(dir, fileName, BufferedIndexInput.BUFFER_SIZE); // fall back to the default read buffer size
+ }
+
+ @Override
+ public final FixedIntBlockIndexInput openInput(Directory dir, String fileName, int readBufferSize) throws IOException {
+ return openInput(dir.openInput(fileName, readBufferSize), fileName, false); // a file opened on its own is never a child stream
+ }
+
+ @Override
+ public final FixedIntBlockIndexOutput createOutput(Directory dir, String fileName) throws IOException {
+ return createOutput(dir.createOutput(fileName), fileName, false); // a file created on its own is never a child stream
+ }
+
+ // nocommit: it's not good that we force the codecs to wrap II/IO's (IndexInput/IndexOutput), even though this is how they all work today...
+
+ /**
+ * Return a fixed block input, wrapping the underlying file.
+ * If <code>isChild</code> is true, then this is the codec for child blocks
+ * piggybacking upon a parent (e.g. freqs).
+ *
+ * @param in already-opened input to wrap
+ * @param fileName name of the file <code>in</code> was opened from
+ * @param isChild true if the returned input reads child blocks that share
+ * <code>in</code> with a parent input
+ * @return the wrapping fixed block input
+ * @throws IOException declared for implementations that touch the input
+ */
+ public abstract FixedIntBlockIndexInput openInput(IndexInput in, String fileName, boolean isChild) throws IOException;
+
+ /**
+ * Return a fixed block output, wrapping the underlying file.
+ * If <code>isChild</code> is true, then this is the codec for child blocks
+ * piggybacking upon a parent (e.g. freqs).
+ *
+ * @param out already-created output to wrap
+ * @param fileName name of the file <code>out</code> writes to
+ * @param isChild true if the returned output writes child blocks that share
+ * <code>out</code> with a parent output
+ * @return the wrapping fixed block output
+ * @throws IOException declared for implementations that touch the output
+ */
+ public abstract FixedIntBlockIndexOutput createOutput(IndexOutput out, String fileName, boolean isChild) throws IOException;
+}
Added: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedPostingsReaderImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedPostingsReaderImpl.java?rev=1070580&view=auto
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedPostingsReaderImpl.java (added)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedPostingsReaderImpl.java Mon Feb 14 17:15:47 2011
@@ -0,0 +1,1539 @@
+package org.apache.lucene.index.codecs.fixed;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collection;
+
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.BulkPostingsEnum;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.codecs.BlockTermState;
+import org.apache.lucene.index.codecs.PostingsReaderBase;
+import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexInput;
+import org.apache.lucene.index.codecs.sep.IntIndexInput;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CodecUtil;
+
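+/** Concrete class that reads the current postings format: a doc file
+ * holding doc deltas (interleaved with freq blocks when freqs are present),
+ * plus separate skip (skp), position (pos) and payload (pyl) files.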
+ *
+ * @lucene.experimental
+ */
+public class FixedPostingsReaderImpl extends PostingsReaderBase {
+
+ final FixedIntBlockIndexInput freqIn;
+ final FixedIntBlockIndexInput docIn;
+ final FixedIntBlockIndexInput posIn;
+ final IndexInput payloadIn;
+ final IndexInput skipIn;
+ final int blocksize;
+
+ int skipInterval;
+ int maxSkipLevels;
+ int skipMinimum;
+ /**
+ * Opens the postings files of one segment.
+ * <p>
+ * The doc file is opened once: <code>docIn</code> wraps it as the parent
+ * stream and, when the segment has prox data, <code>freqIn</code> wraps the
+ * very same underlying input as a child stream. The position and payload
+ * files are only opened when the segment has prox data; otherwise
+ * <code>freqIn</code>, <code>posIn</code> and <code>payloadIn</code> are null.
+ * If anything fails part-way, {@link #close()} is invoked before the
+ * exception propagates.
+ *
+ * @param dir directory holding the segment files
+ * @param segmentInfo segment whose postings are read
+ * @param readBufferSize buffer size passed along whenever a file is opened
+ * @param intFactory factory that wraps raw inputs as fixed int block inputs
+ * @param codecId codec id used when building the file names
+ * @throws IOException propagated from opening or wrapping any of the files
+ */
+ public FixedPostingsReaderImpl(Directory dir, SegmentInfo segmentInfo, int readBufferSize, FixedIntStreamFactory intFactory, String codecId) throws IOException {
+
+ boolean success = false;
+ try {
+
+ final String docFileName = IndexFileNames.segmentFileName(segmentInfo.name, codecId, FixedPostingsWriterImpl.DOC_EXTENSION);
+ final IndexInput docsAndFreqsIn = dir.openInput(docFileName, readBufferSize); // shared by docIn and (if present) freqIn
+
+ // NOTE(review): if this wrap throws, docsAndFreqsIn is not yet held by any field,
+ // so the close() in the finally block cannot release it -- confirm whether that matters.
+ docIn = intFactory.openInput(docsAndFreqsIn, docFileName, false);
+ blocksize = docIn.blockSize;
+ skipIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, codecId, FixedPostingsWriterImpl.SKIP_EXTENSION), readBufferSize);
+
+ if (segmentInfo.getHasProx()) {
+ freqIn = intFactory.openInput(docsAndFreqsIn, docFileName, true); // isChild=true: freqs piggyback on the doc file
+ posIn = intFactory.openInput(dir, IndexFileNames.segmentFileName(segmentInfo.name, codecId, FixedPostingsWriterImpl.POS_EXTENSION), readBufferSize);
+ payloadIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, codecId, FixedPostingsWriterImpl.PAYLOAD_EXTENSION), readBufferSize);
+ } else {
+ posIn = null;
+ payloadIn = null;
+ freqIn = null;
+ }
+ success = true;
+ } finally {
+ if (!success) {
+ close(); // release whatever was opened before the failure
+ }
+ }
+ }
+ /**
+ * Adds to <code>files</code> the names of the postings files for the given
+ * segment: always the doc and skip files, plus the position and payload
+ * files when the segment has prox data.
+ *
+ * @param segmentInfo segment whose file names are built
+ * @param codecId codec id used when building the file names
+ * @param files collection that receives the file names
+ */
+ public static void files(SegmentInfo segmentInfo, String codecId, Collection<String> files) {
+ files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, FixedPostingsWriterImpl.DOC_EXTENSION));
+ files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, FixedPostingsWriterImpl.SKIP_EXTENSION));
+
+ if (segmentInfo.getHasProx()) { // same condition the constructor uses before opening these two files
+ files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, FixedPostingsWriterImpl.POS_EXTENSION));
+ files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, FixedPostingsWriterImpl.PAYLOAD_EXTENSION));
+ }
+ }
+ /**
+ * Checks the codec header at the current position of <code>termsIn</code>
+ * and then reads the three skip settings, in this order:
+ * <code>skipInterval</code>, <code>maxSkipLevels</code>,
+ * <code>skipMinimum</code> (one int each).
+ *
+ * @param termsIn terms input positioned at this reader's header
+ * @throws IOException propagated from the header check or the reads
+ */
+ @Override
+ public void init(IndexInput termsIn) throws IOException {
+ // Make sure we are talking to the matching past writer
+ // (VERSION_START is passed for both version arguments)
+ CodecUtil.checkHeader(termsIn, FixedPostingsWriterImpl.CODEC,
+ FixedPostingsWriterImpl.VERSION_START, FixedPostingsWriterImpl.VERSION_START);
+ skipInterval = termsIn.readInt();
+ maxSkipLevels = termsIn.readInt();
+ skipMinimum = termsIn.readInt();
+ }
+ /**
+ * Closes <code>docIn</code>, <code>skipIn</code>, <code>posIn</code> and
+ * <code>payloadIn</code>, in that order, skipping any that is null.
+ * The nested try/finally blocks make sure the later inputs are still
+ * closed when closing an earlier one throws.
+ * <p>
+ * NOTE(review): <code>freqIn</code> is not closed here -- presumably because
+ * it wraps the same underlying input as <code>docIn</code>; confirm.
+ *
+ * @throws IOException propagated from closing any of the inputs
+ */
+ @Override
+ public void close() throws IOException {
+ try {
+ if (docIn != null)
+ docIn.close();
+ } finally {
+ try {
+ if (skipIn != null)
+ skipIn.close();
+ } finally {
+ try {
+ if (posIn != null) {
+ posIn.close();
+ }
+ } finally {
+ if (payloadIn != null) {
+ payloadIn.close();
+ }
+ }
+ }
+ }
+ }
+ /**
+ * Per-term state for this postings format: seek points into the doc and
+ * position streams, file pointers into the payload and skip files, and the
+ * raw metadata bytes of the current terms block.
+ */
+ private static final class FixedTermState extends BlockTermState {
+ // We store no seek point to freqs because
+ // any freqs are interleaved blocks in the doc file.
+ IntIndexInput.Index docIndex;
+ IntIndexInput.Index posIndex; // null when the reader has no position input
+ long payloadFP;
+ long skipFP;
+
+ // Only used for "primary" term state; these are never
+ // copied on clone:
+ byte[] bytes; // metadata bytes of the current terms block (filled by readTermsBlock)
+ ByteArrayDataInput bytesReader; // reads term metadata out of bytes (consumed by nextTerm)
+
+ @Override
+ public Object clone() {
+ FixedTermState other = (FixedTermState) super.clone();
+ other.docIndex = (IntIndexInput.Index) docIndex.clone(); // give the copy its own index objects
+ if (posIndex != null) {
+ other.posIndex = (IntIndexInput.Index) posIndex.clone();
+ }
+ return other;
+ }
+
+ @Override
+ public void copyFrom(TermState _other) {
+ super.copyFrom(_other);
+ FixedTermState other = (FixedTermState) _other;
+ docIndex.set(other.docIndex);
+ if (posIndex != null && other.posIndex != null) { // positions are copied only when both sides have them
+ posIndex.set(other.posIndex);
+ }
+ payloadFP = other.payloadFP;
+ skipFP = other.skipFP;
+ }
+
+ @Override
+ public String toString() {
+ return super.toString() + " docIndex=" + docIndex + " posIndex=" + posIndex + " payloadFP=" + payloadFP + " skipFP=" + skipFP;
+ }
+ }
+ /**
+ * Creates a fresh term state whose <code>docIndex</code> comes from
+ * <code>docIn</code> and whose <code>posIndex</code> comes from
+ * <code>posIn</code> (left null when there is no position input).
+ *
+ * @return the new, not yet positioned term state
+ */
+ @Override
+ public BlockTermState newTermState() throws IOException {
+ final FixedTermState state = new FixedTermState();
+ state.docIndex = docIn.index();
+ if (posIn != null) {
+ state.posIndex = posIn.index();
+ }
+ return state;
+ }
+ /**
+ * Loads the metadata bytes of one terms block: reads a VInt length from
+ * <code>termsIn</code>, then that many bytes into the term state's buffer
+ * (allocating or growing it as needed), and points the term state's
+ * <code>bytesReader</code> at them. {@link #nextTerm} decodes from that reader.
+ *
+ * @param termsIn terms input positioned at the block's metadata
+ * @param fieldInfo field being read (not used here)
+ * @param _termState must be a <code>FixedTermState</code>
+ * @throws IOException propagated from reading <code>termsIn</code>
+ */
+ @Override
+ public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState _termState) throws IOException {
+ final FixedTermState termState = (FixedTermState) _termState;
+ final int len = termsIn.readVInt();
+ if (termState.bytes == null) {
+ // first block for this term state: create buffer and reader together
+ termState.bytes = new byte[ArrayUtil.oversize(len, 1)];
+ termState.bytesReader = new ByteArrayDataInput(termState.bytes);
+ } else if (termState.bytes.length < len) {
+ // buffer too small: replace it; the reader is re-pointed by reset() below
+ termState.bytes = new byte[ArrayUtil.oversize(len, 1)];
+ }
+ termState.bytesReader.reset(termState.bytes, 0, len);
+ termsIn.readBytes(termState.bytes, 0, len);
+ }
+ /**
+ * Decodes the metadata of the next term from the term state's
+ * <code>bytesReader</code>, in this order: doc index; then, unless the field
+ * omits freqs and positions, the position index and (if the field stores
+ * payloads) the payload file pointer; finally the skip file pointer.
+ * File pointers are read as VLongs: the value is taken as-is for the first
+ * term of a block (<code>termCount == 0</code>) and added to the previous
+ * value otherwise.
+ *
+ * @param fieldInfo field the term belongs to
+ * @param _termState must be a <code>FixedTermState</code>
+ * @throws IOException propagated from the index reads
+ */
+ @Override
+ public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState) throws IOException {
+ final FixedTermState termState = (FixedTermState) _termState;
+
+ final boolean isFirstTerm = termState.termCount == 0;
+ termState.docIndex.read(termState.bytesReader, isFirstTerm);
+
+ if (!fieldInfo.omitTermFreqAndPositions) {
+ termState.posIndex.read(termState.bytesReader, isFirstTerm);
+
+ if (fieldInfo.storePayloads) {
+ if (isFirstTerm) {
+ termState.payloadFP = termState.bytesReader.readVLong();
+ } else {
+ termState.payloadFP += termState.bytesReader.readVLong();
+ }
+ }
+ }
+
+ if (termState.docFreq >= skipMinimum) {
+ if (isFirstTerm) {
+ termState.skipFP = termState.bytesReader.readVLong();
+ } else {
+ termState.skipFP += termState.bytesReader.readVLong();
+ }
+ } else if (isFirstTerm) {
+ // the first term of a block carries a skip pointer even when it is below skipMinimum
+ termState.skipFP = termState.bytesReader.readVLong();
+ }
+ }
+ /**
+ * Returns a {@link DocsEnum} over the term's postings. A
+ * <code>FixedDocsEnum</code> is used when the field omits freqs and
+ * positions, a <code>FixedDocsAndFreqsEnum</code> otherwise. The
+ * <code>reuse</code> instance is recycled only when it is of the matching
+ * class and was created over this reader's <code>docIn</code>.
+ *
+ * @param fieldInfo field the term belongs to
+ * @param _termState must be a <code>FixedTermState</code>
+ * @param skipDocs docs to filter out, or null for none
+ * @param reuse candidate enum to recycle, may be null
+ * @return the initialized enum (either <code>reuse</code> or a new instance)
+ */
+ @Override
+ public DocsEnum docs(FieldInfo fieldInfo, BlockTermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException {
+ final FixedTermState termState = (FixedTermState) _termState;
+
+ // we separate the omitTF case (docs only) versus the docs+freqs case.
+ // when omitTF is enabled, the blocks are structured differently (D, D, D, D, ...)
+ // versus when frequencies are available (D, F, D, F, ...)
+
+ if (fieldInfo.omitTermFreqAndPositions) {
+ FixedDocsEnum docsEnum;
+ if (reuse == null || !(reuse instanceof FixedDocsEnum) || !((FixedDocsEnum) reuse).canReuse(docIn)) {
+ docsEnum = new FixedDocsEnum();
+ } else {
+ docsEnum = (FixedDocsEnum) reuse;
+ }
+
+ return docsEnum.init(fieldInfo, termState, skipDocs);
+ } else {
+ FixedDocsAndFreqsEnum docsEnum;
+ if (reuse == null || !(reuse instanceof FixedDocsAndFreqsEnum) || !((FixedDocsAndFreqsEnum) reuse).canReuse(docIn)) {
+ docsEnum = new FixedDocsAndFreqsEnum();
+ } else {
+ docsEnum = (FixedDocsAndFreqsEnum) reuse;
+ }
+
+ return docsEnum.init(fieldInfo, termState, skipDocs);
+ }
+ }
+ /** The enum most recently created or recycled by the slow path of bulkPostings; enables its fast path. */
+ private FixedBulkPostingsEnum lastBulkEnum;
+ /**
+ * Returns a bulk postings enum for the term. When <code>reuse</code> is the
+ * very instance remembered in <code>lastBulkEnum</code>, it is re-initialized
+ * directly; otherwise <code>reuse</code> is recycled only if it passes
+ * <code>canReuse</code>, else a new enum is created, and the result is
+ * remembered for the next call.
+ * <p>
+ * NOTE(review): the fast path does not re-check <code>canReuse</code>, so it
+ * trusts that the same instance is passed back with a compatible field and
+ * <code>doFreqs</code>/<code>doPositions</code> flags -- confirm with callers.
+ *
+ * @param fieldInfo field the term belongs to
+ * @param _termState must be a <code>FixedTermState</code>
+ * @param reuse candidate enum to recycle, may be null
+ * @param doFreqs passed to the new enum / the reuse check
+ * @param doPositions passed to the new enum / the reuse check
+ * @return the initialized enum
+ */
+ @Override
+ public BulkPostingsEnum bulkPostings(FieldInfo fieldInfo, BlockTermState _termState, BulkPostingsEnum reuse, boolean doFreqs, boolean doPositions) throws IOException {
+ final FixedTermState termState = (FixedTermState) _termState;
+ final FixedBulkPostingsEnum lastBulkEnum = this.lastBulkEnum; // local copy of the field
+ if (lastBulkEnum != null && reuse == lastBulkEnum) {
+ // fastpath
+ return lastBulkEnum.init(termState);
+ } else {
+ FixedBulkPostingsEnum postingsEnum;
+ if (reuse == null || !(reuse instanceof FixedBulkPostingsEnum) || !((FixedBulkPostingsEnum) reuse).canReuse(fieldInfo, docIn, doFreqs, doPositions, fieldInfo.omitTermFreqAndPositions)) {
+ postingsEnum = new FixedBulkPostingsEnum(fieldInfo, doFreqs, doPositions);
+ } else {
+ postingsEnum = (FixedBulkPostingsEnum) reuse;
+ }
+ this.lastBulkEnum = postingsEnum;
+ return postingsEnum.init(termState);
+ }
+ }
+ /**
+ * Returns a {@link DocsAndPositionsEnum} over the term's postings. The
+ * field must not omit freqs and positions (asserted). A
+ * <code>FixedDocsAndPositionsEnum</code> is used when the field stores no
+ * payloads, a <code>FixedDocsAndPositionsAndPayloadsEnum</code> otherwise.
+ * The <code>reuse</code> instance is recycled only when it is of the
+ * matching class and was created over this reader's <code>docIn</code>.
+ *
+ * @param fieldInfo field the term belongs to
+ * @param _termState must be a <code>FixedTermState</code>
+ * @param skipDocs docs to filter out, or null for none
+ * @param reuse candidate enum to recycle, may be null
+ * @return the initialized enum (either <code>reuse</code> or a new instance)
+ */
+ public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
+ assert !fieldInfo.omitTermFreqAndPositions;
+ final FixedTermState termState = (FixedTermState) _termState;
+
+ // we separate the no-payloads case (positions only) versus the case where payload
+ // lengths are encoded into the position deltas.
+ // when positions are pure, we can skip entire blocks of pending positions efficiently
+ // without decode, and some operations are more efficient.
+
+ if (!fieldInfo.storePayloads) {
+ FixedDocsAndPositionsEnum postingsEnum;
+ if (reuse == null || !(reuse instanceof FixedDocsAndPositionsEnum) || !((FixedDocsAndPositionsEnum) reuse).canReuse(docIn)) {
+ postingsEnum = new FixedDocsAndPositionsEnum();
+ } else {
+ postingsEnum = (FixedDocsAndPositionsEnum) reuse;
+ }
+
+ return postingsEnum.init(fieldInfo, termState, skipDocs);
+ } else {
+ FixedDocsAndPositionsAndPayloadsEnum postingsEnum;
+ if (reuse == null || !(reuse instanceof FixedDocsAndPositionsAndPayloadsEnum) || !((FixedDocsAndPositionsAndPayloadsEnum) reuse).canReuse(docIn)) {
+ postingsEnum = new FixedDocsAndPositionsAndPayloadsEnum();
+ } else {
+ postingsEnum = (FixedDocsAndPositionsAndPayloadsEnum) reuse;
+ }
+
+ return postingsEnum.init(fieldInfo, termState, skipDocs);
+ }
+ }
+
+ /** This docsenum class is used when the field omits frequencies (omitTFAP).
+ * In this case, the underlying file format is different, as it is just
+ * a pure stream of doc deltas.
+ * <p>
+ * {@link #freq()} always returns 1. {@link #advance(int)} consults the skip
+ * data only when the target is at least <code>blocksize</code> doc ids past
+ * the current doc and the term's docFreq is at least
+ * <code>skipMinimum</code>; the remainder is a linear scan via
+ * {@link #nextDoc()}.
+ */
+ final class FixedDocsEnum extends DocsEnum {
+ int docFreq; // docFreq of the current term, copied from the term state in init()
+ int doc; // current doc id (doc deltas are accumulated into it)
+ int count; // number of postings consumed so far, including filtered-out docs
+
+ private boolean storePayloads; // only forwarded to the skipper
+ private Bits skipDocs;
+ private final FixedIntBlockIndexInput.Reader docReader;
+ private final int[] docDeltaBuffer; // docReader's buffer of doc deltas
+ private int upto; // next position to read in docDeltaBuffer
+ private long skipFP;
+
+ private final IntIndexInput.Index docIndex;
+ private final IntIndexInput.Index posIndex;
+ private final IntIndexInput startDocIn; // the docIn this enum was built over; checked by canReuse()
+
+ boolean skipped; // true once the skipper has been initialized for the current term
+ FixedSkipListReader skipper; // created lazily on the first advance() that skips
+
+ public FixedDocsEnum() throws IOException {
+ startDocIn = docIn;
+ docReader = docIn.reader();
+ docDeltaBuffer = docReader.getBuffer();
+ docIndex = docIn.index();
+ if (posIn != null) {
+ posIndex = posIn.index(); // only init this so skipper can read it
+ } else {
+ posIndex = null;
+ }
+ }
+
+ // nocommit -- somehow we have to prevent re-decode of
+ // the same block if we have just .next()'d to next term
+ // in the terms dict -- this is an O(N^2) cost to eg
+ // TermRangeQuery when it steps through low freq terms!!
+ FixedDocsEnum init(FieldInfo fieldInfo, FixedTermState termState, Bits skipDocs) throws IOException {
+ this.skipDocs = skipDocs;
+ assert fieldInfo.omitTermFreqAndPositions == true;
+ storePayloads = fieldInfo.storePayloads;
+
+ // TODO: can't we only do this if consumer
+ // skipped consuming the previous docs?
+ docIndex.set(termState.docIndex);
+
+ docIndex.seek(docReader);
+
+ upto = docReader.offset(); // resume at the reader's offset after the seek
+
+ docFreq = termState.docFreq;
+ assert docFreq > 0;
+ // NOTE: unused if docFreq < skipMinimum:
+ skipFP = termState.skipFP;
+ count = 0;
+ doc = 0;
+ skipped = false;
+
+ return this;
+ }
+
+ public boolean canReuse(IntIndexInput docsIn) {
+ return startDocIn == docsIn; // identity check: same reader-level doc input
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+
+ while(true) {
+ if (count == docFreq) {
+ return doc = NO_MORE_DOCS;
+ }
+
+ assert upto <= blocksize: "docDeltaUpto=" + upto + " docDeltaLimit=" + blocksize;
+
+ if (upto == blocksize) {
+ // refill
+ docReader.fill();
+ upto = 0;
+ }
+
+ count++;
+
+ // Decode next doc
+ doc += docDeltaBuffer[upto++];
+
+ if (skipDocs == null || !skipDocs.get(doc)) {
+ break; // doc is not filtered out: return it
+ }
+ }
+ return doc;
+ }
+
+ @Override
+ public int freq() {
+ return 1; // freqs are omitted for this field
+ }
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ if ((target - blocksize) >= doc && docFreq >= skipMinimum) {
+
+ // There are enough docs in the posting to have
+ // skip data, and it's not too close
+
+ if (skipper == null) {
+ // This DocsEnum has never done any skipping
+ skipper = new FixedSkipListReader((IndexInput) skipIn.clone(),
+ docIn,
+ posIn,
+ maxSkipLevels, skipInterval);
+
+ }
+
+ if (!skipped) {
+ // We haven't yet skipped for this posting
+ skipper.init(skipFP,
+ docIndex,
+ posIndex,
+ 0,
+ docFreq,
+ storePayloads);
+ skipper.setOmitTF(true);
+
+ skipped = true;
+ }
+
+ final int newCount = skipper.skipTo(target);
+
+ if (newCount > count) {
+ // Skipper did move
+ skipper.getDocIndex().seek(docReader);
+
+ upto = docReader.offset();
+
+ count = newCount;
+ doc = skipper.getDoc();
+ }
+ }
+
+ // Now, linear scan for the rest:
+ do {
+ if (nextDoc() == NO_MORE_DOCS) {
+ return NO_MORE_DOCS;
+ }
+ } while (target > doc);
+
+ return doc;
+ }
+ }
+
+ /** This docsenum class is used when the field has frequencies.
+ * In this case, the underlying file format is interleaved
+ * blocks of doc deltas and freqs.
+ * <p>
+ * {@link #nextDoc()} fills the doc block and the freq block together, while
+ * {@link #advance(int)} finishes with {@link #scan(int)}, which fills doc
+ * blocks first and only decodes the freq block of the block it stops in.
+ */
+ // nocommit: keep freqs buffer "one step behind" docs buffer
+ // and lazily decode it when freq() is called?
+ // this could help something like conjunctions that next()/advance()
+ // often, but evaluate freq() less often, but this
+ // api is not used by anything serious anymore?!
+ final class FixedDocsAndFreqsEnum extends DocsEnum {
+ int docFreq; // docFreq of the current term, copied from the term state in init()
+ int doc; // current doc id (doc deltas are accumulated into it)
+ int count; // number of postings consumed so far, including filtered-out docs
+ int freq; // freq of the current doc
+
+ private boolean storePayloads; // only forwarded to the skipper
+ private Bits skipDocs;
+ private final FixedIntBlockIndexInput.Reader docReader;
+ private final int[] docDeltaBuffer; // docReader's buffer of doc deltas
+ private final FixedIntBlockIndexInput.Reader freqReader; // created from docReader via freqIn.reader(docReader)
+ private final int[] freqBuffer; // freqReader's buffer, read at the same position as docDeltaBuffer
+ private int upto; // next position to read in docDeltaBuffer / freqBuffer
+ private long skipFP;
+
+ private final IntIndexInput.Index docIndex;
+ private final IntIndexInput.Index posIndex;
+ private final IntIndexInput startDocIn; // the docIn this enum was built over; checked by canReuse()
+
+ boolean skipped; // true once the skipper has been initialized for the current term
+ FixedSkipListReader skipper; // created lazily on the first advance() that skips
+
+ public FixedDocsAndFreqsEnum() throws IOException {
+ startDocIn = docIn;
+ docReader = docIn.reader();
+ docDeltaBuffer = docReader.getBuffer();
+ docIndex = docIn.index();
+ if (freqIn != null) {
+ freqReader = freqIn.reader(docReader);
+ freqBuffer = freqReader.getBuffer();
+ } else {
+ freqReader = null;
+ freqBuffer = null;
+ }
+ if (posIn != null) {
+ posIndex = posIn.index(); // only init this so skipper can read it
+ } else {
+ posIndex = null;
+ }
+ }
+
+ // nocommit -- somehow we have to prevent re-decode of
+ // the same block if we have just .next()'d to next term
+ // in the terms dict -- this is an O(N^2) cost to eg
+ // TermRangeQuery when it steps through low freq terms!!
+ FixedDocsAndFreqsEnum init(FieldInfo fieldInfo, FixedTermState termState, Bits skipDocs) throws IOException {
+ this.skipDocs = skipDocs;
+ assert fieldInfo.omitTermFreqAndPositions == false;
+ storePayloads = fieldInfo.storePayloads;
+
+ // TODO: can't we only do this if consumer
+ // skipped consuming the previous docs?
+ docIndex.set(termState.docIndex);
+
+ docReader.seek(docIndex, freqReader, true);
+
+ upto = docReader.offset(); // resume at the reader's offset after the seek
+
+ docFreq = termState.docFreq;
+ assert docFreq > 0;
+ // NOTE: unused if docFreq < skipMinimum:
+ skipFP = termState.skipFP;
+ count = 0;
+ doc = 0;
+ skipped = false;
+
+ return this;
+ }
+
+ public boolean canReuse(IntIndexInput docsIn) {
+ return startDocIn == docsIn; // identity check: same reader-level doc input
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ while(true) {
+ if (count == docFreq) {
+ return doc = NO_MORE_DOCS;
+ }
+
+ assert upto <= blocksize: "docDeltaUpto=" + upto + " docDeltaLimit=" + blocksize;
+
+ if (upto == blocksize) {
+ // refill
+ docReader.fill();
+ upto = 0;
+ freqReader.fill(); // the doc block and its freq block are filled together here
+ }
+
+ count++;
+
+ freq = freqBuffer[upto];
+ doc += docDeltaBuffer[upto++];
+
+ if (skipDocs == null || !skipDocs.get(doc)) {
+ break; // doc is not filtered out: return it
+ }
+ }
+ return doc;
+ }
+
+ @Override
+ public int freq() {
+ return freq;
+ }
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+
+ // nocommit: with the current skip settings we probably
+ // are skipping too much. instead it would be better to only
+ // actually try to skip on higher differences, since we have
+ // an optimized block-skipping scan(). a successful skip,
+ // as currently implemented, is going to populate both doc and freq
+ // blocks, but scan() can scan just docs... so maybe 8*blocksize
+ // or similar would work better... need to benchmark
+
+ if ((target - blocksize) >= doc && docFreq >= skipMinimum) {
+
+ // There are enough docs in the posting to have
+ // skip data, and it's not too close
+
+ if (skipper == null) {
+ // This DocsEnum has never done any skipping
+ skipper = new FixedSkipListReader((IndexInput) skipIn.clone(),
+ docIn,
+ posIn,
+ maxSkipLevels, skipInterval);
+
+ }
+
+ if (!skipped) {
+ // We haven't yet skipped for this posting
+ skipper.init(skipFP,
+ docIndex,
+ posIndex,
+ 0,
+ docFreq,
+ storePayloads);
+ skipper.setOmitTF(false);
+
+ skipped = true;
+ }
+
+ final int newCount = skipper.skipTo(target);
+
+ if (newCount > count) {
+ // Skipper did move
+ final IntIndexInput.Index idx = skipper.getDocIndex();
+
+ docReader.seek(idx, freqReader, true);
+
+ upto = docReader.offset();
+
+ count = newCount;
+ doc = skipper.getDoc();
+ }
+ }
+
+ // Now, linear scan for the rest:
+ return scan(target);
+ }
+
+ /**
+ * optimized scan, it reads docbuffer one step ahead of freqs,
+ * so that it can skipBlock() whenever possible over freqs
+ * <p>
+ * <code>freqsPending</code> is true while the current doc block has been
+ * filled but its freq block has not been read yet.
+ *
+ * @param target doc id to scan to
+ * @return the first doc id &gt;= target that is not filtered out by
+ * skipDocs, or NO_MORE_DOCS when the postings are exhausted
+ */
+ int scan(int target) throws IOException {
+ boolean freqsPending = false;
+
+ while (true) {
+ if (count == docFreq) {
+ // nocommit: can we skipBlock / must we do this?
+ if (freqsPending) {
+ freqReader.fill();
+ }
+ return (doc = NO_MORE_DOCS);
+ }
+
+ if (upto == blocksize) {
+ // leapfrog
+ if (freqsPending) {
+ freqReader.skipBlock(); // previous block's freqs were never needed: skip them
+ }
+ // refill
+ docReader.fill();
+ upto = 0;
+ freqsPending = true;
+ }
+
+ count++;
+
+ doc += docDeltaBuffer[upto++];
+
+ if (doc >= target && (skipDocs == null || !skipDocs.get(doc))) {
+ if (freqsPending) {
+ freqReader.fill(); // catch the freqs up with the current doc block
+ }
+ freq = freqBuffer[upto-1]; // upto was already advanced past the current doc
+ return doc;
+ }
+ }
+ }
+ }
+
+ /** This dp enum class is used when the field has no payloads:
+ * positions are just pure deltas and no payloads ever exist.
+ *
+ * Because our blocksizes are in sync, we can skip over pending
+ * positions very efficiently: skipping over N positions means
+ * calling skipBlock N / blocksize times and positioning the offset
+ * to N % blocksize.
+ */
+ final class FixedDocsAndPositionsEnum extends DocsAndPositionsEnum {
+ int docFreq;
+ int doc;
+ int count;
+ int freq;
+
+ private Bits skipDocs;
+ private final FixedIntBlockIndexInput.Reader docReader;
+ private final int[] docDeltaBuffer;
+ private final FixedIntBlockIndexInput.Reader freqReader;
+ private final int[] freqBuffer;
+ private int upto;
+ private final FixedIntBlockIndexInput.Reader posReader;
+ private final int[] posBuffer;
+ private int posUpto;
+ private long skipFP;
+
+ private final IndexInput payloadIn;
+
+ private final IntIndexInput.Index docIndex;
+ private final IntIndexInput.Index posIndex;
+ private final IntIndexInput startDocIn;
+
+ private long payloadFP;
+
+ private int pendingPosCount;
+ private int position;
+ private boolean posSeekPending;
+
+ boolean skipped;
+ FixedSkipListReader skipper;
+
+ public FixedDocsAndPositionsEnum() throws IOException {
+ startDocIn = docIn;
+ docReader = docIn.reader();
+ docDeltaBuffer = docReader.getBuffer();
+ docIndex = docIn.index();
+ freqReader = freqIn.reader(docReader);
+ freqBuffer = freqReader.getBuffer();
+ posReader = posIn.reader();
+ posBuffer = posReader.getBuffer();
+ posIndex = posIn.index();
+ payloadIn = (IndexInput) FixedPostingsReaderImpl.this.payloadIn.clone();
+ }
+
+ // nocommit -- somehow we have to prevent re-decode of
+ // the same block if we have just .next()'d to next term
+ // in the terms dict -- this is an O(N^2) cost to eg
+ // TermRangeQuery when it steps through low freq terms!!
+ FixedDocsAndPositionsEnum init(FieldInfo fieldInfo, FixedTermState termState, Bits skipDocs) throws IOException {
+ this.skipDocs = skipDocs;
+ assert !fieldInfo.omitTermFreqAndPositions;
+ assert !fieldInfo.storePayloads;
+
+ // TODO: can't we only do this if consumer
+ // skipped consuming the previous docs?
+ docIndex.set(termState.docIndex);
+ // nocommit -- verify, during merge, this seek is
+ // sometimes w/in block:
+ docReader.seek(docIndex, freqReader, true);
+ upto = docReader.offset();
+
+ posIndex.set(termState.posIndex);
+ posSeekPending = true;
+
+ payloadFP = termState.payloadFP;
+ skipFP = termState.skipFP;
+
+ docFreq = termState.docFreq;
+ assert docFreq > 0;
+ count = 0;
+ doc = 0;
+ pendingPosCount = 0;
+ skipped = false;
+
+ return this;
+ }
+
+ public boolean canReuse(IntIndexInput docsIn) {
+ return startDocIn == docsIn;
+ }
+
+ /**
+ * Steps to the next non-deleted doc of the current term, refilling the doc
+ * and freq blocks together whenever the current block is used up. The freq
+ * of every doc stepped over (deleted or not) is added to pendingPosCount so
+ * that nextPosition() knows how many positions are outstanding.
+ *
+ * @return the new doc id, or NO_MORE_DOCS once docFreq docs have been read
+ */
+ @Override
+ public int nextDoc() throws IOException {
+   for (;;) {
+     if (count == docFreq) {
+       doc = NO_MORE_DOCS;
+       return doc;
+     }
+
+     if (upto == blocksize) {
+       // current block exhausted: refill docs and freqs
+       docReader.fill();
+       freqReader.fill();
+       upto = 0;
+     }
+
+     count++;
+
+     // decode the next doc and its freq from the same buffer slot
+     freq = freqBuffer[upto];
+     doc += docDeltaBuffer[upto];
+     upto++;
+
+     pendingPosCount += freq;
+
+     final boolean deleted = skipDocs != null && skipDocs.get(doc);
+     if (!deleted) {
+       position = 0;
+       return doc;
+     }
+   }
+ }
+
+ @Override
+ public int freq() {
+ return freq;
+ }
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ // Advances to the first non-deleted doc that is at or beyond target.
+ // The skip list is only consulted when the target is at least blocksize
+ // doc ids ahead of the current doc and the term has at least skipMinimum
+ // docs; the remaining distance is always covered by calling nextDoc().
+ // Because the final loop is a do/while, at least one nextDoc() call is
+ // always made.
+ @Override
+ public int advance(int target) throws IOException {
+
+ // nocommit: again we should likely use a higher distance,
+ // and avoid true skipping since we can block-skip pending positions.
+ // need to benchmark.
+
+ if ((target - blocksize) >= doc && docFreq >= skipMinimum) {
+
+ // There are enough docs in the posting to have
+ // skip data, and it's not too close
+
+ if (skipper == null) {
+ // This DocsEnum has never done any skipping
+ // (the skipper is created lazily, on its own clone of skipIn)
+ skipper = new FixedSkipListReader((IndexInput) skipIn.clone(),
+ docIn,
+ posIn,
+ maxSkipLevels, skipInterval);
+
+ }
+
+ if (!skipped) {
+ // We haven't yet skipped for this posting
+ // (skipped is reset by init(), so this runs at most once per term)
+ skipper.init(skipFP,
+ docIndex,
+ posIndex,
+ payloadFP,
+ docFreq,
+ false);
+ skipped = true;
+ }
+ final int newCount = skipper.skipTo(target);
+
+ if (newCount > count) {
+
+ // Skipper did move
+ final IntIndexInput.Index idx = skipper.getDocIndex();
+ docReader.seek(idx, freqReader, true);
+ upto = docReader.offset();
+
+ // NOTE: don't seek pos here; do it lazily
+ // instead. Eg a PhraseQuery may skip to many
+ // docs before finally asking for positions...
+ posIndex.set(skipper.getPosIndex());
+ posSeekPending = true;
+ count = newCount;
+ doc = skipper.getDoc();
+
+ payloadFP = skipper.getPayloadPointer();
+ // the position index now points at the skip entry, so positions of
+ // docs consumed before the skip are no longer outstanding
+ pendingPosCount = 0;
+ }
+ }
+
+ // Now, linear scan for the rest:
+ do {
+ if (nextDoc() == NO_MORE_DOCS) {
+ return NO_MORE_DOCS;
+ }
+ } while (target > doc);
+
+ return doc;
+ }
+
+ /**
+ * Returns the next position of the current doc. Positions that belong to
+ * docs which were iterated without reading their positions are stepped over
+ * first: inside the current block by moving the offset, otherwise by
+ * skipBlock()ing whole blocks and landing on the remainder.
+ */
+ @Override
+ public int nextPosition() throws IOException {
+   if (posSeekPending) {
+     // catch up with the seek that init()/advance() postponed
+     posIndex.seek(posReader);
+     posUpto = posReader.offset();
+     payloadIn.seek(payloadFP);
+     posSeekPending = false;
+   }
+
+   if (pendingPosCount > freq) {
+     // positions still owed to earlier docs
+     final int stale = pendingPosCount - freq;
+     final int leftInBlock = blocksize - posUpto;
+     if (stale > leftInBlock) {
+       // drop the rest of this block, skipBlock() over every block that is
+       // stale in full, then fill the block that holds our first position
+       final int beyond = stale - leftInBlock;
+       int wholeBlocks = beyond / blocksize;
+       while (wholeBlocks > 0) {
+         posReader.skipBlock();
+         wholeBlocks--;
+       }
+       posReader.fill();
+       posUpto = beyond % blocksize;
+     } else {
+       // still within the current block: only the offset moves
+       posUpto += stale;
+     }
+     pendingPosCount = freq;
+   }
+
+   if (posUpto == blocksize) {
+     posReader.fill();
+     posUpto = 0;
+   }
+
+   final int delta = posBuffer[posUpto];
+   posUpto++;
+   position += delta;
+
+   pendingPosCount--;
+   assert pendingPosCount >= 0;
+   return position;
+ }
+
+ @Override
+ public BytesRef getPayload() throws IOException {
+ throw new IOException("no payload exists for this field.");
+ }
+
+ @Override
+ public boolean hasPayload() {
+ return false;
+ }
+ }
+
+ /**
+ * This class is only used when the field has payloads
+ * it would be better for payload-users to separate the payload lengths
+ * somehow from the positions.
+ */
+ final class FixedDocsAndPositionsAndPayloadsEnum extends DocsAndPositionsEnum {
+ int docFreq;
+ int doc;
+ int count;
+ int freq;
+
+ private Bits skipDocs;
+ private final FixedIntBlockIndexInput.Reader docReader;
+ private final int[] docDeltaBuffer;
+ private final FixedIntBlockIndexInput.Reader freqReader;
+ private final int[] freqBuffer;
+ private int upto;
+ private final BulkPostingsEnum.BlockReader posReader;
+ private final int[] posBuffer;
+ private int posUpto;
+ private long skipFP;
+
+ private final IndexInput payloadIn;
+
+ private final IntIndexInput.Index docIndex;
+ private final IntIndexInput.Index posIndex;
+ private final IntIndexInput startDocIn;
+
+ private long payloadFP;
+
+ private int pendingPosCount;
+ private int position;
+ private int payloadLength;
+ private long pendingPayloadBytes;
+ private boolean payloadPending;
+ private boolean posSeekPending;
+
+ boolean skipped;
+ FixedSkipListReader skipper;
+
+ public FixedDocsAndPositionsAndPayloadsEnum() throws IOException {
+ startDocIn = docIn;
+ docReader = docIn.reader();
+ docDeltaBuffer = docReader.getBuffer();
+ docIndex = docIn.index();
+ freqReader = freqIn.reader(docReader);
+ freqBuffer = freqReader.getBuffer();
+ posReader = posIn.reader();
+ posBuffer = posReader.getBuffer();
+ posIndex = posIn.index();
+ payloadIn = (IndexInput) FixedPostingsReaderImpl.this.payloadIn.clone();
+ }
+
+ // nocommit -- somehow we have to prevent re-decode of
+ // the same block if we have just .next()'d to next term
+ // in the terms dict -- this is an O(N^2) cost to eg
+ // TermRangeQuery when it steps through low freq terms!!
+ FixedDocsAndPositionsAndPayloadsEnum init(FieldInfo fieldInfo, FixedTermState termState, Bits skipDocs) throws IOException {
+ this.skipDocs = skipDocs;
+
+ assert !fieldInfo.omitTermFreqAndPositions;
+ assert fieldInfo.storePayloads == true;
+
+ // TODO: can't we only do this if consumer
+ // skipped consuming the previous docs?
+ docIndex.set(termState.docIndex);
+ // nocommit -- verify, during merge, this seek is
+ // sometimes w/in block:
+ docReader.seek(docIndex, freqReader, true);
+ upto = docReader.offset();
+
+ posIndex.set(termState.posIndex);
+ posSeekPending = true;
+ payloadPending = false;
+
+ payloadFP = termState.payloadFP;
+ skipFP = termState.skipFP;
+
+ docFreq = termState.docFreq;
+ assert docFreq > 0;
+ count = 0;
+ doc = 0;
+ pendingPosCount = 0;
+ pendingPayloadBytes = 0;
+ skipped = false;
+
+ return this;
+ }
+
+ public boolean canReuse(IntIndexInput docsIn) {
+ return startDocIn == docsIn;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ while(true) {
+ if (count == docFreq) {
+ return doc = NO_MORE_DOCS;
+ }
+
+ if (upto == blocksize) {
+ // refill
+ docReader.fill();
+ freqReader.fill();
+ upto = 0;
+ }
+
+ count++;
+
+ // Decode next doc
+ freq = freqBuffer[upto];
+ doc += docDeltaBuffer[upto++];
+
+ pendingPosCount += freq;
+
+ if (skipDocs == null || !skipDocs.get(doc)) {
+ break;
+ }
+ }
+
+ position = 0;
+ return doc;
+ }
+
+ @Override
+ public int freq() {
+ return freq;
+ }
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ if ((target - blocksize) >= doc && docFreq >= skipMinimum) {
+
+ // There are enough docs in the posting to have
+ // skip data, and its not too close
+
+ if (skipper == null) {
+ // This DocsEnum has never done any skipping
+ skipper = new FixedSkipListReader((IndexInput) skipIn.clone(),
+ docIn,
+ posIn,
+ maxSkipLevels, skipInterval);
+
+ }
+
+ if (!skipped) {
+ // We haven't yet skipped for this posting
+ skipper.init(skipFP,
+ docIndex,
+ posIndex,
+ payloadFP,
+ docFreq,
+ true);
+ skipped = true;
+ }
+ final int newCount = skipper.skipTo(target);
+
+ if (newCount > count) {
+
+ // Skipper did move
+ final IntIndexInput.Index idx = skipper.getDocIndex();
+ docReader.seek(idx, freqReader, true);
+ upto = docReader.offset();
+
+ // NOTE: don't seek pos here; do it lazily
+ // instead. Eg a PhraseQuery may skip to many
+ // docs before finally asking for positions...
+ posIndex.set(skipper.getPosIndex());
+ posSeekPending = true;
+ count = newCount;
+ doc = skipper.getDoc();
+
+ payloadFP = skipper.getPayloadPointer();
+ pendingPosCount = 0;
+ pendingPayloadBytes = 0;
+ payloadPending = false;
+ payloadLength = skipper.getPayloadLength();
+ }
+ }
+
+ // Now, linear scan for the rest:
+ do {
+ if (nextDoc() == NO_MORE_DOCS) {
+ return NO_MORE_DOCS;
+ }
+ } while (target > doc);
+
+ return doc;
+ }
+
+ /**
+ * Returns the next position of the current doc and records whether a
+ * payload is available for it.
+ *
+ * Each position is stored as a code: the low bit flags that the payload
+ * length changed (the new length then follows as its own int), the
+ * remaining bits hold the position delta. Positions of docs that were
+ * iterated without reading their positions are decoded and discarded first,
+ * while their payload lengths are accumulated in pendingPayloadBytes so
+ * that getPayload() can step over the unread payload bytes.
+ */
+ @Override
+ public int nextPosition() throws IOException {
+   if (posSeekPending) {
+     // catch up with the seek that init()/advance() postponed
+     posIndex.seek(posReader);
+     posUpto = posReader.offset();
+     payloadIn.seek(payloadFP);
+     posSeekPending = false;
+   }
+
+   // scan over any docs that were iterated without their
+   // positions
+   while (pendingPosCount > freq) {
+
+     final int code = nextPosInt();
+
+     if ((code & 1) != 0) {
+       // Payload length has changed
+       payloadLength = nextPosInt();
+       assert payloadLength >= 0;
+     }
+     pendingPosCount--;
+     position = 0;
+     pendingPayloadBytes += payloadLength;
+   }
+
+   final int code = nextPosInt();
+
+   assert code >= 0;
+   if ((code & 1) != 0) {
+     // Payload length has changed
+     payloadLength = nextPosInt();
+     assert payloadLength >= 0;
+   }
+   // Unsigned shift, same as PosPayloadReader.fill(): the code is
+   // (delta << 1) | flag, so the top bit belongs to the delta, not a sign.
+   position += code >>> 1;
+   pendingPayloadBytes += payloadLength;
+   payloadPending = payloadLength > 0;
+
+   pendingPosCount--;
+   assert pendingPosCount >= 0;
+   return position;
+ }
+
+ private int nextPosInt() throws IOException {
+ if (posUpto == blocksize) {
+ posReader.fill();
+ posUpto = 0;
+ }
+ return posBuffer[posUpto++];
+ }
+
+ private BytesRef payload;
+
+ /**
+ * Loads the payload of the current position into a BytesRef that is reused
+ * across calls. Payload bytes of positions that were passed without loading
+ * their payload are stepped over first. May be called only once per
+ * position.
+ *
+ * @throws IOException if no payload is pending for the current position
+ */
+ @Override
+ public BytesRef getPayload() throws IOException {
+   if (!payloadPending) {
+     throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
+   }
+
+   assert pendingPayloadBytes >= payloadLength;
+
+   // everything pending beyond the current payload was never loaded
+   final long unreadBytes = pendingPayloadBytes - payloadLength;
+   if (unreadBytes > 0) {
+     payloadIn.seek(payloadIn.getFilePointer() + unreadBytes);
+   }
+
+   if (payload != null) {
+     if (payload.bytes.length < payloadLength) {
+       payload.grow(payloadLength);
+     }
+   } else {
+     payload = new BytesRef();
+     payload.bytes = new byte[payloadLength];
+   }
+
+   payloadIn.readBytes(payload.bytes, 0, payloadLength);
+   payload.length = payloadLength;
+   pendingPayloadBytes = 0;
+   payloadPending = false;
+   return payload;
+ }
+
+ @Override
+ public boolean hasPayload() {
+ return payloadPending && payloadLength > 0;
+ }
+ }
+
+ /**
+ * This class is returned to the caller as a docs reader when they don't want freqs,
+ * but freqs are actually in the file. We skipBlock() in tandem with doc delta fills.
+ * Every other call is forwarded unchanged to the wrapped doc reader.
+ */
+ static final class FreqSkippingDocReader extends BulkPostingsEnum.BlockReader {
+ // reader that supplies the doc deltas
+ final BulkPostingsEnum.BlockReader docs;
+ // freq reader that is skipped by one block on every fill()
+ final FixedIntBlockIndexInput.Reader freqs;
+
+ FreqSkippingDocReader(BulkPostingsEnum.BlockReader docs, FixedIntBlockIndexInput.Reader freqs) {
+ this.docs = docs;
+ this.freqs = freqs;
+ }
+
+ @Override
+ public int[] getBuffer() {
+ return docs.getBuffer();
+ }
+
+ // Fills the next doc delta block, then skips one block on the freq reader.
+ // Returns whatever the doc reader's fill() returned.
+ @Override
+ public int fill() throws IOException {
+ final int ret = docs.fill();
+ freqs.skipBlock();
+ return ret;
+ }
+
+ @Override
+ public int end() {
+ return docs.end();
+ }
+
+ @Override
+ public int offset() {
+ return docs.offset();
+ }
+ }
+
+ final class FixedBulkPostingsEnum extends BulkPostingsEnum {
+ private int docFreq;
+
+ private final BulkPostingsEnum.BlockReader docReader;
+ // nocommit: confusing. just a pointer to the above, when doFreq = true
+ // when doFreq = false and omitTF is false, freqReader is null,
+ // but this one automatically fills() the child.
+ private final FixedIntBlockIndexInput.Reader parentDocReader;
+
+ private final IntIndexInput.Index docIndex;
+
+ private final FixedIntBlockIndexInput.Reader freqReader;
+ // nocommit: confusing. just a pointer to the above, when doFreq = true
+ // when doFreq = false and omitTF is false, freqReader is null,
+ // but this one is still valid so it can skipBlocks
+ private final FixedIntBlockIndexInput.Reader childFreqReader;
+
+ private final BulkPostingsEnum.BlockReader posReader;
+ private final IntIndexInput.Index posIndex;
+
+ private final boolean storePayloads;
+ private final boolean omitTF;
+ private long skipFP;
+
+ private final IntIndexInput startDocIn;
+
+ private boolean skipped;
+ private FixedSkipListReader skipper;
+
+ /**
+ * Wires up the block readers for one field.
+ *
+ * @param fieldInfo supplies storePayloads and omitTermFreqAndPositions
+ * @param doFreq whether the caller wants a freqs reader
+ * @param doPos whether the caller wants a positions reader
+ */
+ public FixedBulkPostingsEnum(FieldInfo fieldInfo, boolean doFreq, boolean doPos) throws IOException {
+   storePayloads = fieldInfo.storePayloads;
+   omitTF = fieldInfo.omitTermFreqAndPositions;
+   startDocIn = docIn;
+
+   parentDocReader = docIn.reader();
+
+   if (omitTF) {
+     // no freqs in the file at all: plain doc reader
+     childFreqReader = null;
+     freqReader = null;
+     docReader = parentDocReader;
+   } else {
+     // freqs are in the file, so a child reader is needed either way
+     childFreqReader = freqIn.reader(parentDocReader);
+     if (doFreq) {
+       freqReader = childFreqReader;
+       docReader = parentDocReader;
+     } else {
+       // caller does not want freqs: the child reader is only there so
+       // that the doc reader handed out can skip its blocks
+       freqReader = null;
+       docReader = new FreqSkippingDocReader(parentDocReader, childFreqReader);
+     }
+   }
+
+   docIndex = docIn.index();
+
+   if (omitTF) {
+     posReader = null;
+     posIndex = null;
+   } else {
+     if (!doPos) {
+       posReader = null;
+     } else if (storePayloads) {
+       // payload lengths are mixed into the position stream, so every
+       // posDelta has to be rewritten
+       posReader = new PosPayloadReader(posIn.reader());
+     } else {
+       // pass through
+       posReader = posIn.reader();
+     }
+     // the position index is pulled whenever term freqs are not omitted,
+     // regardless of doFreq/doPos
+     posIndex = posIn.index();
+   }
+ }
+
+ /**
+ * Tells whether this enum was built for the same doc input and with the
+ * same payload / omitTF / freq / position configuration as requested.
+ */
+ public boolean canReuse(FieldInfo fieldInfo, IntIndexInput docIn, boolean doFreq, boolean doPos, boolean omitTF) {
+   if (fieldInfo.storePayloads != storePayloads || startDocIn != docIn) {
+     return false;
+   }
+   final boolean hasFreqs = freqReader != null;
+   final boolean hasPositions = posReader != null;
+   return doFreq == hasFreqs && omitTF == this.omitTF && doPos == hasPositions;
+ }
+
+ // nocommit -- make sure this is tested!!
+
+ // Only used when payloads were stored -- we cannot do
+ // pass-through read for this since the payload lengths
+ // are also encoded into the position deltas
+ private final class PosPayloadReader extends BulkPostingsEnum.BlockReader {
+ final BulkPostingsEnum.BlockReader other;
+ private boolean fillPending;
+ private int pendingOffset;
+ private int limit;
+ private boolean skipNext;
+
+ public PosPayloadReader(BulkPostingsEnum.BlockReader other) {
+ this.other = other;
+ }
+
+ void doAfterSeek() {
+ limit = 0;
+ skipNext = false;
+ fillPending = false;
+ }
+
+ @Override
+ public int[] getBuffer() {
+ return other.getBuffer();
+ }
+
+ // nocommit -- make sure this works correctly in the
+ // "reuse"/seek case
+ @Override
+ public int offset() {
+ pendingOffset = other.offset();
+ return 0;
+ }
+
+ // Produces the next batch of pure position deltas in the wrapped reader's
+ // own buffer and returns how many there are (also reported by end()).
+ @Override
+ public int fill() throws IOException {
+ // Translate code back to pos deltas, and filter out
+ // any changes in payload length. NOTE: this is a
+ // perf hit on indices that encode payloads, even if
+ // they use "normal" positional queries
+ limit = 0;
+ boolean skippedLast = false;
+ do {
+ // The first fill() after doAfterSeek() takes the wrapped reader's current
+ // end() instead of calling its fill(); presumably the seek has already
+ // left a block in the buffer -- TODO confirm
+ final int otherLimit = fillPending ? other.fill() : other.end();
+ fillPending = true;
+ assert otherLimit > pendingOffset;
+ final int[] buffer = other.getBuffer();
+ // Compacts in place: limit never runs ahead of i, so no unread entry is overwritten
+ for(int i=pendingOffset;i<otherLimit;i++) {
+ if (skipNext) {
+ // this entry is a payload length announced by the previous code; drop it.
+ // skipNext is a field, so this also works when the length is the first
+ // entry of the next block
+ skipNext = false;
+ skippedLast = true;
+ } else {
+ skippedLast = false;
+ final int code = buffer[i];
+ buffer[limit++] = code >>> 1;
+ if ((code & 1) != 0) {
+ // skip the payload length
+ skipNext = true;
+ }
+ }
+ }
+ pendingOffset = 0;
+ /*
+ * some readers will only fill a single element of the buffer
+ * if that single element is skipped we need to do another round.
+ */
+ }while(limit == 0 && skippedLast);
+ return limit;
+ }
+
+ @Override
+ public int end() {
+ return limit;
+ }
+ }
+
+ /**
+ * Position readers to the specified term.
+ *
+ * @param termState supplies the doc index, position index, skip file pointer
+ * and docFreq of the term
+ * @return this enum
+ */
+ FixedBulkPostingsEnum init(FixedTermState termState) throws IOException {
+
+ // NOTE(review): an earlier comment here described sep-style term metadata
+ // (freq index, pos index, skip offset) stuffed into the front of the
+ // docDeltas and decoded in this method. Nothing of the kind is decoded
+ // here: every offset used below is read from termState. Confirm the old
+ // description no longer applies to this layout.
+
+ // nocommit -- make sure seek w/in buffer is efficient
+ // here:
+
+ // TODO: can't we only do this if consumer
+ // skipped consuming the previous docs?
+ docIndex.set(termState.docIndex);
+
+ if (!omitTF) {
+ // nocommit -- would be better (fewer bytes used) to
+ // make this a relative index read (pass false not
+ // true), eg relative to first term in the terms
+ // index block
+ // the last argument is true only when the caller asked for freqs
+ parentDocReader.seek(docIndex, childFreqReader, freqReader != null);
+
+ posIndex.set(termState.posIndex);
+ } else {
+ docIndex.seek(parentDocReader);
+ }
+
+ skipFP = termState.skipFP;
+
+ if (posReader != null) {
+ if (storePayloads) {
+ // seek the wrapped reader, then reset the wrapper's filtering state
+ PosPayloadReader posPayloadReader = (PosPayloadReader) posReader;
+ posIndex.seek(posPayloadReader.other);
+ posPayloadReader.doAfterSeek();
+ } else {
+ posIndex.seek(posReader);
+ }
+ }
+
+ docFreq = termState.docFreq;
+ // forces jump() to re-init the skipper for this term
+ skipped = false;
+
+ return this;
+ }
+
+ @Override
+ public BulkPostingsEnum.BlockReader getDocDeltasReader() {
+ // Maximize perf -- just pass through the underlying
+ // intblock reader:
+ return docReader;
+ }
+
+ @Override
+ public BulkPostingsEnum.BlockReader getFreqsReader() {
+ // Maximize perf -- just pass through the underlying
+ // intblock reader:
+ return freqReader;
+ }
+
+ @Override
+ public BulkPostingsEnum.BlockReader getPositionDeltasReader() {
+ // Maximize perf -- just pass through the underlying
+ // intblock reader (if payloads were not indexed):
+ return posReader;
+ }
+
+ private final JumpResult jumpResult = new JumpResult();
+
+ // Tries to use the skip list to get closer to target. When the skipper
+ // ends up beyond curCount, all readers are re-seeked and the shared
+ // jumpResult instance is returned with the new count and doc id; otherwise
+ // (no skip data for this term, or the skipper did not get past curCount)
+ // null is returned and the readers are left where they were.
+ @Override
+ public JumpResult jump(int target, int curCount) throws IOException {
+
+ // TODO: require jump to take current docid and prevent skipping for close jumps?
+ // we can do all kinds of cool stuff here.
+
+ if (docFreq >= skipMinimum) {
+
+ // There are enough docs in the posting to have
+ // skip data
+
+ if (skipper == null) {
+ // This enum has never done any skipping
+ skipper = new FixedSkipListReader((IndexInput) skipIn.clone(),
+ docIn,
+ posIn,
+ maxSkipLevels, skipInterval);
+ }
+
+ if (!skipped) {
+ // We haven't yet skipped for this particular posting
+ // (skipped is reset by init(), so this runs at most once per term)
+ skipper.init(skipFP,
+ docIndex,
+ posIndex,
+ 0,
+ docFreq,
+ storePayloads);
+ skipper.setOmitTF(omitTF);
+ skipped = true;
+ }
+
+ final int newCount = skipper.skipTo(target);
+
+ if (newCount > curCount) {
+
+ // Skipper did move -- seek all readers:
+ final IntIndexInput.Index idx = skipper.getDocIndex();
+
+ if (!omitTF) {
+ // the last argument is true only when the caller asked for freqs
+ parentDocReader.seek(idx, childFreqReader, freqReader != null);
+ } else {
+ idx.seek(parentDocReader);
+ }
+
+ if (posReader != null) {
+ if (storePayloads) {
+ // seek the wrapped reader, then reset the wrapper's filtering state
+ PosPayloadReader posPayloadReader = (PosPayloadReader) posReader;
+ skipper.getPosIndex().seek(posPayloadReader.other);
+ posPayloadReader.doAfterSeek();
+ } else {
+ skipper.getPosIndex().seek(posReader);
+ }
+ }
+ // jumpResult is a single reused instance; the next jump() overwrites it
+ jumpResult.count = newCount;
+ jumpResult.docID = skipper.getDoc();
+ return jumpResult;
+ }
+ }
+ return null;
+ }
+ }
+}
Added: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedPostingsWriterImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedPostingsWriterImpl.java?rev=1070580&view=auto
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedPostingsWriterImpl.java (added)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedPostingsWriterImpl.java Mon Feb 14 17:15:47 2011
@@ -0,0 +1,342 @@
+package org.apache.lucene.index.codecs.fixed;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Set;
+
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.codecs.PostingsWriterBase;
+import org.apache.lucene.index.codecs.TermStats;
+import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexOutput;
+import org.apache.lucene.index.codecs.sep.IntIndexOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CodecUtil;
+
+/**
+ * Writes frq and doc blocks in interleaved fashion to .doc, pos to .pos, payloads
+ * to .pyl, skip data to .skp
+ *
+ * NOTE: when omitTF is enabled, only pure doc blocks are written.
+ *
+ * The doc, freq, and positions codecs must have the same fixed blocksize to use
+ * this layout: however they can be different compression algorithms.
+ *
+ * @lucene.experimental */
+public final class FixedPostingsWriterImpl extends PostingsWriterBase {
+ final static String CODEC = "FixedDocFreqSkip";
+
+ final static String DOC_EXTENSION = "doc";
+ final static String SKIP_EXTENSION = "skp";
+ final static String POS_EXTENSION = "pos";
+ final static String PAYLOAD_EXTENSION = "pyl";
+
+ // Increment version to change it:
+ final static int VERSION_START = 0;
+ final static int VERSION_CURRENT = VERSION_START;
+
+ final FixedIntBlockIndexOutput posOut;
+ final IntIndexOutput.Index posIndex;
+
+ IntIndexOutput docOut; // pointer to either docio or docfreqio
+ final InterleavedIntBlockIndexOutput docfreqio; // for !omitTF fields, doc+freq stream
+ final FixedIntBlockIndexOutput docio; // for omitTF fields, the underlying doc-only stream
+ final IntIndexOutput.Index docIndex;
+
+ final IndexOutput payloadOut;
+
+ final IndexOutput skipOut;
+ IndexOutput termsOut;
+
+ final FixedSkipListWriter skipListWriter;
+ /** Expert: The fraction of TermDocs entries stored in skip tables,
+ * used to accelerate {@link DocsEnum#advance(int)}. Larger values result in
+ * smaller indexes, greater acceleration, but fewer accelerable cases, while
+ * smaller values result in bigger indexes, less acceleration and more
+ * accelerable cases. More detailed experiments would be useful here. */
+ final int skipInterval;
+
+ /**
+ * Expert: minimum docFreq to write any skip data at all
+ */
+ final int skipMinimum;
+
+ /** Expert: The maximum number of skip levels. Smaller values result in
+ * slightly smaller indexes, but slower skipping in big posting lists.
+ */
+ final int maxSkipLevels = 10;
+
+ final int totalNumDocs;
+
+ boolean storePayloads;
+ boolean omitTF;
+
+ long lastSkipFP;
+
+ FieldInfo fieldInfo;
+
+ int lastPayloadLength;
+ int lastPosition;
+ long payloadStart;
+ long lastPayloadStart;
+ int lastDocID;
+ int df;
+ private int pendingTermCount;
+
+ // Holds pending byte[] blob for the current terms block
+ private final RAMOutputStream indexBytesWriter = new RAMOutputStream();
+
+ public FixedPostingsWriterImpl(SegmentWriteState state, FixedIntStreamFactory factory) throws IOException {
+ super();
+
+ final String docFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, DOC_EXTENSION);
+ IndexOutput stream = state.directory.createOutput(docFileName);
+ docio = factory.createOutput(stream, docFileName, false);
+ // nocommit: hardcode to more reasonable like 64?
+ skipMinimum = skipInterval = docio.blockSize;
+ if (state.fieldInfos.hasProx()) {
+ FixedIntBlockIndexOutput freqio = factory.createOutput(stream, docFileName, true);
+ docOut = docfreqio = new InterleavedIntBlockIndexOutput(docio, freqio);
+
+ final String posFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, POS_EXTENSION);
+ posOut = factory.createOutput(state.directory, posFileName);
+ posIndex = posOut.index();
+
+ // nocommit: clean up?
+ if (posOut.blockSize != docio.blockSize) {
+ throw new IllegalArgumentException("positions blocksize must be equal to docs and freqs blocksize");
+ }
+
+ // TODO: -- only if at least one field stores payloads?
+ final String payloadFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, PAYLOAD_EXTENSION);
+ payloadOut = state.directory.createOutput(payloadFileName);
+
+ } else {
+ docOut = docio; // docOut is just a pure doc stream only
+ docfreqio = null;
+ posOut = null;
+ posIndex = null;
+ payloadOut = null;
+ }
+
+ docIndex = docOut.index();
+ final String skipFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, SKIP_EXTENSION);
+ skipOut = state.directory.createOutput(skipFileName);
+
+ totalNumDocs = state.numDocs;
+
+ skipListWriter = new FixedSkipListWriter(skipInterval,
+ maxSkipLevels,
+ state.numDocs,
+ docOut,
+ posOut, payloadOut);
+ }
+
+ @Override
+ public void start(IndexOutput termsOut) throws IOException {
+ this.termsOut = termsOut;
+ CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
+ // TODO: -- just ask skipper to "start" here
+ termsOut.writeInt(skipInterval); // write skipInterval
+ termsOut.writeInt(maxSkipLevels); // write maxSkipLevels
+ termsOut.writeInt(skipMinimum); // write skipMinimum
+ }
+
+ @Override
+ public void startTerm() throws IOException {
+ docIndex.mark();
+ if (!omitTF) {
+ posIndex.mark();
+ payloadStart = payloadOut.getFilePointer();
+ lastPayloadLength = -1;
+ }
+ skipListWriter.resetSkip(docIndex, posIndex);
+ }
+
+ // Currently, this instance is re-used across fields, so
+ // our parent calls setField whenever the field changes.
+ // Switches docOut between the doc-only stream (omitTF fields) and the
+ // interleaved doc+freq stream, and updates omitTF / storePayloads.
+ @Override
+ public void setField(FieldInfo fieldInfo) {
+ this.fieldInfo = fieldInfo;
+
+ // omitTF's differ, we must flush any buffered docs/freqs
+ // nocommit: ugly!
+ // (the flush goes to the stream of the PREVIOUS field, before docOut is re-pointed below)
+ if (omitTF != fieldInfo.omitTermFreqAndPositions) {
+ try {
+ if (docOut instanceof InterleavedIntBlockIndexOutput) {
+ ((InterleavedIntBlockIndexOutput) docOut).flush();
+ } else {
+ ((FixedIntBlockIndexOutput) docOut).flush();
+ }
+ // setField declares no checked exception, so the IOException is rethrown unchecked with its cause kept
+ } catch (IOException e) { throw new RuntimeException(e); }
+ }
+
+ omitTF = fieldInfo.omitTermFreqAndPositions;
+ docOut = omitTF ? docio : docfreqio;
+ skipListWriter.setOmitTF(omitTF);
+ // payloads are only stored for fields that also store freqs/positions
+ storePayloads = !omitTF && fieldInfo.storePayloads;
+ }
+
+ /** Adds a new doc in this term: writes the doc delta and, unless term
+ * freqs are omitted for the field, the term freq right after it. Skip data
+ * is buffered on every skipInterval-th doc, before that doc is written.
+ *
+ * @param docID id of the doc, must be greater than the previous doc of this term
+ * @param termDocFreq freq of the term within the doc
+ * @throws CorruptIndexException if docID is negative or not increasing */
+ @Override
+ public void startDoc(int docID, int termDocFreq) throws IOException {
+
+ final int delta = docID - lastDocID;
+
+ if (docID < 0 || (df > 0 && delta <= 0)) {
+ throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )");
+ }
+
+ if ((++df % skipInterval) == 0) {
+ // TODO: -- awkward we have to make these two
+ // separate calls to skipper
+ // the skip entry records the PREVIOUS doc id; lastDocID is only updated below
+ skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
+ skipListWriter.bufferSkip(df);
+ }
+
+ lastDocID = docID;
+ docOut.write(delta);
+ if (!omitTF) {
+ docOut.write(termDocFreq);
+ }
+ }
+
+ // Writes the per-term metadata buffered by finishTerm() to the terms
+ // output, prefixed with its length in bytes, then resets the buffer and the
+ // pending term count so the next term is treated as first of a block.
+ @Override
+ public void flushTermsBlock() throws IOException {
+ // NOTE(review): the length is narrowed to int before being written as a
+ // VLong; presumably the reader reads it back as an int -- confirm
+ termsOut.writeVLong((int) indexBytesWriter.getFilePointer());
+ indexBytesWriter.writeTo(termsOut);
+ indexBytesWriter.reset();
+ pendingTermCount = 0;
+ }
+
+ /**
+ * Add a new position & payload.
+ *
+ * Without payloads the plain position delta is written. With payloads the
+ * delta is shifted left by one and the low bit flags a change of payload
+ * length, in which case the new length is written as the next int; the
+ * payload bytes themselves go to the payload output.
+ *
+ * @param position position within the current doc
+ * @param payload payload of the position, may be null
+ */
+ @Override
+ public void addPosition(int position, BytesRef payload) throws IOException {
+   assert !omitTF;
+
+   final int delta = position - lastPosition;
+   assert delta > 0 || position == 0: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it)
+   // single update; the original stored this a second time at the end of the method
+   lastPosition = position;
+
+   if (storePayloads) {
+     final int payloadLength = payload == null ? 0 : payload.length;
+     if (payloadLength != lastPayloadLength) {
+       lastPayloadLength = payloadLength;
+       // TODO: explore whether we get better compression
+       // by not storing payloadLength into prox stream?
+       posOut.write((delta<<1)|1);
+       posOut.write(payloadLength);
+     } else {
+       posOut.write(delta << 1);
+     }
+
+     if (payloadLength > 0) {
+       payloadOut.writeBytes(payload.bytes, payload.offset, payloadLength);
+     }
+   } else {
+     posOut.write(delta);
+   }
+ }
+
+ /** Called when we are done adding positions & payloads */
+ @Override
+ public void finishDoc() {
+ lastPosition = 0;
+ }
+
+ /** Called when we are done adding docs to this term */
+ @Override
+ public void finishTerm(TermStats stats) throws IOException {
+ // TODO: -- wasteful we are counting this in two places?
+ assert stats.docFreq > 0;
+ assert stats.docFreq == df;
+
+ final boolean isFirstTerm = pendingTermCount == 0;
+
+ docIndex.write(indexBytesWriter, isFirstTerm);
+
+ if (!omitTF) {
+ posIndex.write(indexBytesWriter, isFirstTerm);
+ if (storePayloads) {
+ if (isFirstTerm) {
+ indexBytesWriter.writeVLong(payloadStart);
+ } else {
+ indexBytesWriter.writeVLong(payloadStart - lastPayloadStart);
+ }
+ lastPayloadStart = payloadStart;
+ }
+ }
+
+ if (df >= skipMinimum) {
+ final long skipFP = skipOut.getFilePointer();
+ skipListWriter.writeSkip(skipOut);
+ if (isFirstTerm) {
+ indexBytesWriter.writeVLong(skipFP);
+ } else {
+ indexBytesWriter.writeVLong(skipFP - lastSkipFP);
+ }
+ lastSkipFP = skipFP;
+ } else if (isFirstTerm) {
+ // TODO: this is somewhat wasteful; eg if no terms in
+ // this block will use skip data, we don't need to
+ // write this:
+ final long skipFP = skipOut.getFilePointer();
+ indexBytesWriter.writeVLong(skipFP);
+ lastSkipFP = skipFP;
+ }
+
+ lastDocID = 0;
+ df = 0;
+ pendingTermCount++;
+ }
+
+ // Closes the doc, skip, position and payload outputs. The nested
+ // try/finally blocks make sure each later close is still attempted when an
+ // earlier one throws.
+ @Override
+ public void close() throws IOException {
+ try {
+ docOut.close();
+ } finally {
+ try {
+ skipOut.close();
+ } finally {
+ // posOut and payloadOut are both null when the segment has no prox data
+ if (posOut != null) {
+ try {
+ posOut.close();
+ } finally {
+ payloadOut.close();
+ }
+ }
+ }
+ }
+ }
+
+ /** Adds the file extensions used by this postings format to the given set. */
+ public static void getExtensions(Set<String> extensions) {
+   final String[] all = {DOC_EXTENSION, SKIP_EXTENSION, POS_EXTENSION, PAYLOAD_EXTENSION};
+   for (String extension : all) {
+     extensions.add(extension);
+   }
+ }
+}
Added: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedSkipListReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedSkipListReader.java?rev=1070580&view=auto
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedSkipListReader.java (added)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedSkipListReader.java Mon Feb 14 17:15:47 2011
@@ -0,0 +1,175 @@
+package org.apache.lucene.index.codecs.fixed;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.index.codecs.MultiLevelSkipListReader;
+import org.apache.lucene.index.codecs.sep.IntIndexInput;
+
+/**
+ * Skip list reader for the fixed block postings format of this package.
+ * <p>
+ * Every skip entry is decoded by {@link #readSkipData}: a doc delta (which,
+ * for fields storing payloads, carries a "payload length follows" flag in its
+ * low bit), the doc stream index and, unless term freqs are omitted, the pos
+ * stream index plus the payload file pointer delta.
+ *
+ * @lucene.experimental
+ */
+
+// TODO: rewrite this as recursive classes?
+class FixedSkipListReader extends MultiLevelSkipListReader {
+  private boolean currentFieldStoresPayloads;
+
+  // Per-level state, advanced each time a skip entry of that level is read.
+  private final IntIndexInput.Index[] docIndex;
+  // null when the reader was created without a pos stream
+  private final IntIndexInput.Index[] posIndex;
+  private final long[] payloadPointer;
+  private final int[] payloadLength;
+
+  // State captured by setLastSkipData; this is what the getters expose.
+  private final IntIndexInput.Index lastDocIndex;
+  // TODO: -- make private again
+  final IntIndexInput.Index lastPosIndex;
+
+  private long lastPayloadPointer;
+  private int lastPayloadLength;
+
+  boolean omitTF;
+
+  /**
+   * Creates a reader with one doc index (and, when {@code posIn} is non-null,
+   * one pos index) per skip level.
+   *
+   * @param skipStream    stream holding the skip data
+   * @param docIn         doc stream; supplies the per-level doc indexes
+   * @param posIn         pos stream, or null when there are no positions
+   * @param maxSkipLevels maximum number of skip levels
+   * @param skipInterval  skip interval, passed through to the superclass
+   */
+  FixedSkipListReader(IndexInput skipStream,
+                      IntIndexInput docIn,
+                      IntIndexInput posIn,
+                      int maxSkipLevels,
+                      int skipInterval)
+    throws IOException {
+    super(skipStream, maxSkipLevels, skipInterval);
+    final boolean hasPositions = posIn != null;
+
+    // All per-level arrays share the same size.
+    docIndex = new IntIndexInput.Index[maxSkipLevels];
+    posIndex = hasPositions ? new IntIndexInput.Index[maxSkipLevels] : null;
+    for (int i = 0; i < maxSkipLevels; i++) {
+      docIndex[i] = docIn.index();
+      if (hasPositions) {
+        posIndex[i] = posIn.index();
+      }
+    }
+    payloadPointer = new long[maxSkipLevels];
+    payloadLength = new int[maxSkipLevels];
+
+    lastDocIndex = docIn.index();
+    lastPosIndex = hasPositions ? posIn.index() : null;
+  }
+
+  void setOmitTF(boolean v) {
+    omitTF = v;
+  }
+
+  /**
+   * Prepares this reader for a new term.
+   *
+   * @param skipPointer        start of the term's skip data, passed to the superclass
+   * @param docBaseIndex       doc stream index every level starts from
+   * @param posBaseIndex       pos stream index every level starts from; may be null
+   * @param payloadBasePointer payload file pointer every level starts from
+   * @param df                 document frequency of the term, passed to the superclass
+   * @param storesPayloads     whether the current field stores payloads
+   */
+  void init(long skipPointer,
+            IntIndexInput.Index docBaseIndex,
+            IntIndexInput.Index posBaseIndex,
+            long payloadBasePointer,
+            int df,
+            boolean storesPayloads) {
+
+    super.init(skipPointer, df);
+    this.currentFieldStoresPayloads = storesPayloads;
+
+    lastPayloadPointer = payloadBasePointer;
+
+    for (int level = 0; level < maxNumberOfSkipLevels; level++) {
+      docIndex[level].set(docBaseIndex);
+      if (posBaseIndex != null) {
+        posIndex[level].set(posBaseIndex);
+      }
+    }
+    Arrays.fill(payloadPointer, payloadBasePointer);
+    Arrays.fill(payloadLength, 0);
+  }
+
+  long getPayloadPointer() {
+    return lastPayloadPointer;
+  }
+
+  /** Returns the payload length of the payload stored just before
+   * the doc to which the last call of {@link MultiLevelSkipListReader#skipTo(int)}
+   * has skipped. */
+  int getPayloadLength() {
+    return lastPayloadLength;
+  }
+
+  @Override
+  protected void seekChild(int level) throws IOException {
+    super.seekChild(level);
+    // The child level resumes from the most recently captured payload state.
+    payloadPointer[level] = lastPayloadPointer;
+    payloadLength[level] = lastPayloadLength;
+  }
+
+  @Override
+  protected void setLastSkipData(int level) {
+    super.setLastSkipData(level);
+
+    // Capture this level's state as the "last" skip state.
+    lastPayloadPointer = payloadPointer[level];
+    lastPayloadLength = payloadLength[level];
+    lastDocIndex.set(docIndex[level]);
+    if (lastPosIndex != null) {
+      lastPosIndex.set(posIndex[level]);
+    }
+
+    // Propagate the stream indexes to the next lower level.
+    if (level > 0) {
+      docIndex[level - 1].set(docIndex[level]);
+      if (posIndex != null) {
+        posIndex[level - 1].set(posIndex[level]);
+      }
+    }
+  }
+
+  IntIndexInput.Index getPosIndex() {
+    return lastPosIndex;
+  }
+
+  IntIndexInput.Index getDocIndex() {
+    return lastDocIndex;
+  }
+
+  /**
+   * Reads one skip entry of the given level and updates that level's state.
+   *
+   * @return the doc delta stored in the entry
+   */
+  @Override
+  protected int readSkipData(int level, IndexInput skipStream) throws IOException {
+    int delta = skipStream.readVInt();
+    if (currentFieldStoresPayloads) {
+      // the current field stores payloads.
+      // if the doc delta is odd then we have
+      // to read the current payload length
+      // because it differs from the length of the
+      // previous payload
+      if ((delta & 1) != 0) {
+        payloadLength[level] = skipStream.readVInt();
+      }
+      delta >>>= 1;
+    }
+
+    docIndex[level].read(skipStream, false);
+    if (!omitTF) {
+      posIndex[level].read(skipStream, false);
+      // Decode as a VLong: it reads VInt-encoded values identically, but a
+      // delta of 2^31 or more is not misread as a negative int.
+      payloadPointer[level] += skipStream.readVLong();
+    }
+
+    return delta;
+  }
+}
Added: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedSkipListWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedSkipListWriter.java?rev=1070580&view=auto
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedSkipListWriter.java (added)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/fixed/FixedSkipListWriter.java Mon Feb 14 17:15:47 2011
@@ -0,0 +1,183 @@
+package org.apache.lucene.index.codecs.fixed;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.index.codecs.MultiLevelSkipListWriter;
+import org.apache.lucene.index.codecs.sep.IntIndexOutput;
+
+// TODO: -- skip data should somehow be more local to the
+// particular stream (doc, freq, pos, payload)
+
+/**
+ * Implements the skip list writer for the default posting list format
+ * that stores positions and payloads.
+ *
+ * @lucene.experimental
+ */
+class FixedSkipListWriter extends MultiLevelSkipListWriter {
+ private int[] lastSkipDoc;
+ private int[] lastSkipPayloadLength;
+ private long[] lastSkipPayloadPointer;
+
+ private IntIndexOutput.Index[] docIndex;
+ private IntIndexOutput.Index[] posIndex;
+
+ // TODO: -- private again
+ IntIndexOutput posOutput;
+ // TODO: -- private again
+ IndexOutput payloadOutput;
+
+ private int curDoc;
+ private boolean curStorePayloads;
+ private int curPayloadLength;
+ private long curPayloadPointer;
+
+ FixedSkipListWriter(int skipInterval, int numberOfSkipLevels, int docCount,
+ IntIndexOutput docOutput,
+ IntIndexOutput posOutput,
+ IndexOutput payloadOutput)
+ throws IOException {
+ super(skipInterval, numberOfSkipLevels, docCount);
+
+ this.posOutput = posOutput;
+ this.payloadOutput = payloadOutput;
+
+ lastSkipDoc = new int[numberOfSkipLevels];
+ lastSkipPayloadLength = new int[numberOfSkipLevels];
+ // TODO: -- also cutover normal IndexOutput to use getIndex()?
+ lastSkipPayloadPointer = new long[numberOfSkipLevels];
+
+ docIndex = new IntIndexOutput.Index[numberOfSkipLevels];
+ posIndex = new IntIndexOutput.Index[numberOfSkipLevels];
+
+ for(int i=0;i<numberOfSkipLevels;i++) {
+ docIndex[i] = docOutput.index();
+ if (posOutput != null) {
+ posIndex[i] = posOutput.index();
+ }
+ }
+ }
+
+ boolean omitTF;
+
+ void setOmitTF(boolean v) {
+ omitTF = v;
+ }
+
+ void setPosOutput(IntIndexOutput posOutput) throws IOException {
+ this.posOutput = posOutput;
+ for(int i=0;i<numberOfSkipLevels;i++) {
+ posIndex[i] = posOutput.index();
+ }
+ }
+
+ void setPayloadOutput(IndexOutput payloadOutput) {
+ this.payloadOutput = payloadOutput;
+ }
+
+ /**
+ * Sets the values for the current skip data.
+ */
+ // Called @ every index interval (every 128th (by default)
+ // doc)
+ void setSkipData(int doc, boolean storePayloads, int payloadLength) {
+ this.curDoc = doc;
+ this.curStorePayloads = storePayloads;
+ this.curPayloadLength = payloadLength;
+ if (payloadOutput != null) {
+ this.curPayloadPointer = payloadOutput.getFilePointer();
+ }
+ }
+
+ // Called @ start of new term
+ protected void resetSkip(IntIndexOutput.Index topDocIndex, IntIndexOutput.Index topPosIndex)
+ throws IOException {
+ super.resetSkip();
+
+ Arrays.fill(lastSkipDoc, 0);
+ Arrays.fill(lastSkipPayloadLength, -1); // we don't have to write the first length in the skip list
+ for(int i=0;i<numberOfSkipLevels;i++) {
+ docIndex[i].set(topDocIndex);
+ if (posOutput != null) {
+ posIndex[i].set(topPosIndex);
+ }
+ }
+ if (payloadOutput != null) {
+ Arrays.fill(lastSkipPayloadPointer, payloadOutput.getFilePointer());
+ }
+ }
+
+ @Override
+ protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException {
+ // To efficiently store payloads in the posting lists we do not store the length of
+ // every payload. Instead we omit the length for a payload if the previous payload had
+ // the same length.
+ // However, in order to support skipping the payload length at every skip point must be known.
+ // So we use the same length encoding that we use for the posting lists for the skip data as well:
+ // Case 1: current field does not store payloads
+ // SkipDatum --> DocSkip, FreqSkip, ProxSkip
+ // DocSkip,FreqSkip,ProxSkip --> VInt
+ // DocSkip records the document number before every SkipInterval th document in TermFreqs.
+ // Document numbers are represented as differences from the previous value in the sequence.
+ // Case 2: current field stores payloads
+ // SkipDatum --> DocSkip, PayloadLength?, FreqSkip,ProxSkip
+ // DocSkip,FreqSkip,ProxSkip --> VInt
+ // PayloadLength --> VInt
+ // In this case DocSkip/2 is the difference between
+ // the current and the previous value. If DocSkip
+ // is odd, then a PayloadLength encoded as VInt follows,
+ // if DocSkip is even, then it is assumed that the
+ // current payload length equals the length at the previous
+ // skip point
+
+ assert !omitTF || !curStorePayloads;
+
+ if (curStorePayloads) {
+ int delta = curDoc - lastSkipDoc[level];
+ if (curPayloadLength == lastSkipPayloadLength[level]) {
+ // the current payload length equals the length at the previous skip point,
+ // so we don't store the length again
+ skipBuffer.writeVInt(delta << 1);
+ } else {
+ // the payload length is different from the previous one. We shift the DocSkip,
+ // set the lowest bit and store the current payload length as VInt.
+ skipBuffer.writeVInt(delta << 1 | 1);
+ skipBuffer.writeVInt(curPayloadLength);
+ lastSkipPayloadLength[level] = curPayloadLength;
+ }
+ } else {
+ // current field does not store payloads
+ skipBuffer.writeVInt(curDoc - lastSkipDoc[level]);
+ }
+
+ docIndex[level].mark();
+ docIndex[level].write(skipBuffer, false);
+ if (!omitTF) {
+ posIndex[level].mark();
+ posIndex[level].write(skipBuffer, false);
+ skipBuffer.writeVInt((int) (curPayloadPointer - lastSkipPayloadPointer[level]));
+ }
+
+ lastSkipDoc[level] = curDoc;
+ lastSkipPayloadPointer[level] = curPayloadPointer;
+ }
+}