Posted to java-commits@lucene.apache.org by mi...@apache.org on 2010/01/30 11:18:25 UTC
svn commit: r904750 [2/4] - in /lucene/java/branches/flex_1458: ./
contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/
contrib/misc/src/java/org/apache/lucene/index/
contrib/queries/src/java/org/apache/lucene/search/
src/java/org/apache/luce...
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java?rev=904750&r1=904749&r2=904750&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java Sat Jan 30 10:16:35 2010
@@ -24,19 +24,19 @@
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.codecs.Codec;
-import org.apache.lucene.index.codecs.standard.StandardDocsConsumer;
-import org.apache.lucene.index.codecs.standard.StandardDocsProducer;
+import org.apache.lucene.index.codecs.standard.StandardPostingsWriter;
+import org.apache.lucene.index.codecs.standard.StandardPostingsWriterImpl;
+import org.apache.lucene.index.codecs.standard.StandardPostingsReader;
+import org.apache.lucene.index.codecs.standard.StandardPostingsReaderImpl;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexReader;
import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexWriter;
-import org.apache.lucene.index.codecs.standard.StandardCodec;
-import org.apache.lucene.index.codecs.standard.StandardDocsReader;
-import org.apache.lucene.index.codecs.standard.StandardDocsWriter;
import org.apache.lucene.index.codecs.standard.StandardTermsDictReader;
import org.apache.lucene.index.codecs.standard.StandardTermsDictWriter;
import org.apache.lucene.index.codecs.standard.StandardTermsIndexReader;
import org.apache.lucene.index.codecs.standard.StandardTermsIndexWriter;
+import org.apache.lucene.index.codecs.standard.StandardCodec;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
@@ -55,14 +55,14 @@
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
- // We wrap StandardDocsWriter, but any DocsConsumer
+ // We wrap StandardPostingsWriterImpl, but any StandardPostingsWriter
// will work:
- StandardDocsConsumer docsWriter = new StandardDocsWriter(state);
+ StandardPostingsWriter docsWriter = new StandardPostingsWriterImpl(state);
// Terms that have <= freqCutoff number of docs are
// "pulsed" (inlined):
final int freqCutoff = 1;
- StandardDocsConsumer pulsingWriter = new PulsingDocsWriter(state, freqCutoff, docsWriter);
+ StandardPostingsWriter pulsingWriter = new PulsingPostingsWriterImpl(freqCutoff, docsWriter);
// Terms dict index
StandardTermsIndexWriter indexWriter;
@@ -96,10 +96,10 @@
@Override
public FieldsProducer fieldsProducer(Directory dir, FieldInfos fieldInfos, SegmentInfo si, int readBufferSize, int indexDivisor) throws IOException {
- // We wrap StandardDocsReader, but any DocsProducer
+ // We wrap StandardPostingsReaderImpl, but any StandardPostingsReader
// will work:
- StandardDocsProducer docs = new StandardDocsReader(dir, si, readBufferSize);
- StandardDocsProducer docsReader = new PulsingDocsReader(dir, si, readBufferSize, docs);
+ StandardPostingsReader docsReader = new StandardPostingsReaderImpl(dir, si, readBufferSize);
+ StandardPostingsReader pulsingReader = new PulsingPostingsReaderImpl(docsReader);
// Terms dict index reader
StandardTermsIndexReader indexReader;
@@ -114,7 +114,7 @@
success = true;
} finally {
if (!success) {
- docs.close();
+ pulsingReader.close();
}
}
@@ -123,15 +123,16 @@
try {
FieldsProducer ret = new StandardTermsDictReader(indexReader,
dir, fieldInfos, si.name,
- docsReader,
+ pulsingReader,
readBufferSize,
- BytesRef.getUTF8SortedAsUTF16Comparator());
+ BytesRef.getUTF8SortedAsUTF16Comparator(),
+ StandardCodec.TERMS_CACHE_SIZE);
success = true;
return ret;
} finally {
if (!success) {
try {
- docs.close();
+ pulsingReader.close();
} finally {
indexReader.close();
}
@@ -141,7 +142,7 @@
@Override
public void files(Directory dir, SegmentInfo segmentInfo, Collection<String> files) throws IOException {
- StandardDocsReader.files(dir, segmentInfo, files);
+ StandardPostingsReaderImpl.files(dir, segmentInfo, files);
StandardTermsDictReader.files(dir, segmentInfo, files);
SimpleStandardTermsIndexReader.files(dir, segmentInfo, files);
}
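A condensed sketch of the wiring the two methods above now build: plain decorator chains around the standard postings implementations, restated from the diff itself (freqCutoff stays hard-wired to 1):

    // Write path: the pulsing writer wraps the standard writer.
    StandardPostingsWriter docsWriter = new StandardPostingsWriterImpl(state);
    StandardPostingsWriter pulsingWriter = new PulsingPostingsWriterImpl(1, docsWriter);

    // Read path: the pulsing reader wraps the standard reader. The error
    // paths now close pulsingReader, whose close() forwards to the wrapped
    // reader (see PulsingPostingsReaderImpl.close() below).
    StandardPostingsReader docsReader = new StandardPostingsReaderImpl(dir, si, readBufferSize);
    StandardPostingsReader pulsingReader = new PulsingPostingsReaderImpl(docsReader);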
Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java?rev=904750&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java Sat Jan 30 10:16:35 2010
@@ -0,0 +1,396 @@
+package org.apache.lucene.index.codecs.pulsing;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.codecs.Codec;
+import org.apache.lucene.index.codecs.standard.TermState;
+import org.apache.lucene.index.codecs.standard.StandardPostingsReader;
+import org.apache.lucene.index.codecs.pulsing.PulsingPostingsWriterImpl.Document;
+import org.apache.lucene.index.codecs.pulsing.PulsingPostingsWriterImpl.Position;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+
+/** Concrete class that reads the current doc/freq/skip
+ * postings format */
+
+// nocommit -- should we switch "hasProx" higher up? and
+// create two separate docs readers, one that also reads
+// prox and one that doesn't?
+
+public class PulsingPostingsReaderImpl extends StandardPostingsReader {
+
+ // Fallback reader for non-pulsed terms:
+ final StandardPostingsReader wrappedPostingsReader;
+ int maxPulsingDocFreq;
+
+ public PulsingPostingsReaderImpl(StandardPostingsReader wrappedPostingsReader) throws IOException {
+ this.wrappedPostingsReader = wrappedPostingsReader;
+ }
+
+ @Override
+ public void init(IndexInput termsIn) throws IOException {
+ Codec.checkHeader(termsIn, PulsingPostingsWriterImpl.CODEC, PulsingPostingsWriterImpl.VERSION_START);
+ maxPulsingDocFreq = termsIn.readVInt();
+ wrappedPostingsReader.init(termsIn);
+ }
+
+ private static class PulsingTermState extends TermState {
+ private Document docs[];
+ private TermState wrappedTermState;
+ private boolean pendingIndexTerm;
+
+ public Object clone() {
+ PulsingTermState clone;
+ clone = (PulsingTermState) super.clone();
+ clone.docs = (Document[]) docs.clone();
+ for(int i=0;i<clone.docs.length;i++) {
+ final Document doc = clone.docs[i];
+ if (doc != null) {
+ clone.docs[i] = (Document) doc.clone();
+ }
+ }
+ clone.wrappedTermState = (TermState) wrappedTermState.clone();
+ return clone;
+ }
+
+ public void copy(TermState _other) {
+ super.copy(_other);
+ PulsingTermState other = (PulsingTermState) _other;
+ pendingIndexTerm = other.pendingIndexTerm;
+ wrappedTermState.copy(other.wrappedTermState);
+ for(int i=0;i<docs.length;i++) {
+ if (other.docs[i] != null) {
+ docs[i] = (Document) other.docs[i].clone();
+ }
+ }
+ }
+ }
+
+ @Override
+ public TermState newTermState() throws IOException {
+ PulsingTermState state = new PulsingTermState();
+ state.wrappedTermState = wrappedPostingsReader.newTermState();
+ state.docs = new Document[maxPulsingDocFreq];
+ return state;
+ }
+
+ @Override
+ public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState _termState, boolean isIndexTerm) throws IOException {
+
+ PulsingTermState termState = (PulsingTermState) _termState;
+
+ if (Codec.DEBUG) {
+ System.out.println("pulsr.readTerm docFreq=" + termState.docFreq + " indexTerm=" + isIndexTerm);
+ }
+
+ termState.pendingIndexTerm |= isIndexTerm;
+
+ if (termState.docFreq <= maxPulsingDocFreq) {
+
+ if (Codec.DEBUG) {
+ System.out.println(" pulsed");
+ }
+
+ // Inlined into terms dict -- read everything in
+
+ // TODO: maybe only read everything in lazily? But
+ // then we'd need to store length so we could seek
+ // over it when docs/pos enum was not requested
+
+ // TODO: it'd be better to share this encoding logic
+ // in some inner codec that knows how to write a
+ // single doc / single position, etc. This way if a
+ // given codec wants to store other interesting
+ // stuff, it could use this pulsing codec to do so
+
+ int docID = 0;
+ for(int i=0;i<termState.docFreq;i++) {
+ Document doc = termState.docs[i];
+ if (doc == null) {
+ doc = termState.docs[i] = new Document();
+ }
+ final int code = termsIn.readVInt();
+ if (fieldInfo.omitTermFreqAndPositions) {
+ docID += code;
+ doc.numPositions = 1;
+ if (Codec.DEBUG) {
+ System.out.println(" doc=" + docID);
+ }
+ } else {
+ docID += code>>>1;
+ if ((code & 1) != 0) {
+ doc.numPositions = 1;
+ } else {
+ doc.numPositions = termsIn.readVInt();
+ }
+
+ if (Codec.DEBUG) {
+ System.out.println(" doc=" + docID + " numPos=" + doc.numPositions);
+ }
+
+ if (doc.numPositions > doc.positions.length) {
+ doc.reallocPositions(doc.numPositions);
+ }
+
+ int position = 0;
+ int payloadLength = -1;
+
+ for(int j=0;j<doc.numPositions;j++) {
+ final Position pos = doc.positions[j];
+ final int code2 = termsIn.readVInt();
+ if (fieldInfo.storePayloads) {
+ position += code2 >>> 1;
+ if ((code2 & 1) != 0) {
+ payloadLength = termsIn.readVInt();
+ }
+
+ if (payloadLength > 0) {
+ if (pos.payload == null) {
+ pos.payload = new BytesRef();
+ pos.payload.bytes = new byte[payloadLength];
+ } else if (payloadLength > pos.payload.bytes.length) {
+ pos.payload.grow(payloadLength);
+ }
+ pos.payload.length = payloadLength;
+ termsIn.readBytes(pos.payload.bytes, 0, payloadLength);
+ } else if (pos.payload != null) {
+ pos.payload.length = 0;
+ }
+ } else {
+ position += code2;
+ }
+ pos.pos = position;
+ }
+ }
+ doc.docID = docID;
+ }
+ } else {
+ if (Codec.DEBUG) {
+ System.out.println(" not pulsed pass isIndex=" + termState.pendingIndexTerm);
+ }
+ termState.wrappedTermState.docFreq = termState.docFreq;
+ wrappedPostingsReader.readTerm(termsIn, fieldInfo, termState.wrappedTermState, termState.pendingIndexTerm);
+ termState.pendingIndexTerm = false;
+ }
+ }
+
+ // nocommit -- not great that we can't always reuse
+ @Override
+ public DocsEnum docs(FieldInfo field, TermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException {
+ PulsingTermState termState = (PulsingTermState) _termState;
+ if (termState.docFreq <= maxPulsingDocFreq) {
+ if (reuse instanceof PulsingDocsEnum) {
+ return ((PulsingDocsEnum) reuse).reset(skipDocs, termState);
+ } else {
+ PulsingDocsEnum docsEnum = new PulsingDocsEnum();
+ return docsEnum.reset(skipDocs, termState);
+ }
+ } else {
+ if (reuse instanceof PulsingDocsEnum) {
+ return wrappedPostingsReader.docs(field, termState.wrappedTermState, skipDocs, null);
+ } else {
+ return wrappedPostingsReader.docs(field, termState.wrappedTermState, skipDocs, reuse);
+ }
+ }
+ }
+
+ // nocommit -- not great that we can't always reuse
+ @Override
+ public DocsAndPositionsEnum docsAndPositions(FieldInfo field, TermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
+ PulsingTermState termState = (PulsingTermState) _termState;
+ if (termState.docFreq <= maxPulsingDocFreq) {
+ if (reuse instanceof PulsingDocsAndPositionsEnum) {
+ return ((PulsingDocsAndPositionsEnum) reuse).reset(skipDocs, termState);
+ } else {
+ PulsingDocsAndPositionsEnum postingsEnum = new PulsingDocsAndPositionsEnum();
+ return postingsEnum.reset(skipDocs, termState);
+ }
+ } else {
+ if (reuse instanceof PulsingDocsAndPositionsEnum) {
+ return wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, skipDocs, null);
+ } else {
+ return wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, skipDocs, reuse);
+ }
+ }
+ }
+
+ static class PulsingDocsEnum extends DocsEnum {
+ private int nextRead;
+ private Bits skipDocs;
+ private Document doc;
+ private PulsingTermState state;
+
+ public void close() {}
+
+ PulsingDocsEnum reset(Bits skipDocs, PulsingTermState termState) {
+ // nocommit -- not great we have to clone here --
+ // merging is wasteful; TermRangeQuery too
+ state = (PulsingTermState) termState.clone();
+ this.skipDocs = skipDocs;
+ nextRead = 0;
+ return this;
+ }
+
+ @Override
+ public int nextDoc() {
+ while(true) {
+ if (nextRead >= state.docFreq) {
+ return NO_MORE_DOCS;
+ } else {
+ doc = state.docs[nextRead++];
+ if (skipDocs == null || !skipDocs.get(doc.docID)) {
+ return doc.docID;
+ }
+ }
+ }
+ }
+
+ @Override
+ public int read(int[] docs, int[] freqs) {
+ int i=0;
+      // nocommit -- ob1?
+      while(i < docs.length && nextRead < state.docFreq) {
+ doc = state.docs[nextRead++];
+ if (skipDocs == null || !skipDocs.get(doc.docID)) {
+ docs[i] = doc.docID;
+ freqs[i] = doc.numPositions;
+ i++;
+ }
+ }
+ return i;
+ }
+
+ @Override
+ public int freq() {
+ return doc.numPositions;
+ }
+
+ @Override
+ public int docID() {
+ return doc.docID;
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ int doc;
+ while((doc=nextDoc()) != NO_MORE_DOCS) {
+ if (doc >= target)
+ return doc;
+ }
+ return NO_MORE_DOCS;
+ }
+ }
+
+ static class PulsingDocsAndPositionsEnum extends DocsAndPositionsEnum {
+ private int nextRead;
+ private int nextPosRead;
+ private Bits skipDocs;
+ private Document doc;
+ private Position pos;
+ private PulsingTermState state;
+
+    // Only here to emulate a limitation of the standard codec,
+    // which allows retrieving the payload no more than once
+ private boolean payloadRetrieved;
+
+ public void close() {}
+
+ PulsingDocsAndPositionsEnum reset(Bits skipDocs, PulsingTermState termState) {
+ // nocommit -- not great we have to clone here --
+ // merging is wasteful; TermRangeQuery too
+ state = (PulsingTermState) termState.clone();
+ this.skipDocs = skipDocs;
+ nextRead = 0;
+ nextPosRead = 0;
+ return this;
+ }
+
+ @Override
+ public int nextDoc() {
+ while(true) {
+ if (nextRead >= state.docFreq) {
+ return NO_MORE_DOCS;
+ } else {
+ doc = state.docs[nextRead++];
+ if (skipDocs == null || !skipDocs.get(doc.docID)) {
+ nextPosRead = 0;
+ return doc.docID;
+ }
+ }
+ }
+ }
+
+ @Override
+ public int freq() {
+ return doc.numPositions;
+ }
+
+ @Override
+ public int docID() {
+ return doc.docID;
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ int doc;
+ while((doc=nextDoc()) != NO_MORE_DOCS) {
+ if (doc >= target) {
+ return doc;
+ }
+ }
+ return NO_MORE_DOCS;
+ }
+
+ @Override
+ public int nextPosition() {
+ assert nextPosRead < doc.numPositions;
+ pos = doc.positions[nextPosRead++];
+ payloadRetrieved = false;
+ return pos.pos;
+ }
+
+ @Override
+ public int getPayloadLength() {
+ return payloadRetrieved || pos.payload == null ? 0 : pos.payload.length;
+ }
+
+ @Override
+ public boolean hasPayload() {
+ return !payloadRetrieved && pos.payload != null && pos.payload.length > 0;
+ }
+
+ @Override
+ public BytesRef getPayload() {
+ payloadRetrieved = true;
+ return pos.payload;
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ wrappedPostingsReader.close();
+ }
+}
Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java
------------------------------------------------------------------------------
svn:eol-style = native
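The payloadRetrieved flag above emulates the standard codec's rule that a payload may be fetched at most once per position. A minimal consumption sketch against the flex enum API used in this file; it assumes a DocsAndPositionsEnum has already been obtained from the postings reader (NO_MORE_DOCS is inherited from DocIdSetIterator):

    // Illustrative sketch, not part of this commit: walks every doc,
    // position and payload of one term's postings.
    static void consume(DocsAndPositionsEnum postings) throws IOException {
      int doc;
      while ((doc = postings.nextDoc()) != DocsAndPositionsEnum.NO_MORE_DOCS) {
        final int freq = postings.freq();
        for (int i = 0; i < freq; i++) {
          final int position = postings.nextPosition();
          if (postings.hasPayload()) {
            final BytesRef payload = postings.getPayload(); // valid at most once
          }
        }
      }
    }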
Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java?rev=904750&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java Sat Jan 30 10:16:35 2010
@@ -0,0 +1,326 @@
+package org.apache.lucene.index.codecs.pulsing;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.codecs.Codec;
+import org.apache.lucene.index.codecs.standard.StandardPostingsWriter;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+
+// TODO: we now pulse entirely according to docFreq of the
+// term; it might be better to eg pulse by "net bytes used"
+// so that a term that has only 1 doc but zillions of
+// positions would not be inlined. Though this is
+// presumably rare in practice...
+
+//nocommit: public
+public final class PulsingPostingsWriterImpl extends StandardPostingsWriter {
+
+ final static String CODEC = "PulsedPostings";
+
+ // To add a new version, increment from the last one, and
+ // change VERSION_CURRENT to point to your new version:
+ final static int VERSION_START = 0;
+
+ final static int VERSION_CURRENT = VERSION_START;
+
+ IndexOutput termsOut;
+
+ boolean omitTF;
+ boolean storePayloads;
+
+ // Starts a new term
+ FieldInfo fieldInfo;
+
+ // nocommit
+ String desc;
+
+ // nocommit: public
+ public static class Document {
+ int docID;
+ int termDocFreq;
+ int numPositions;
+ Position[] positions;
+ Document() {
+ positions = new Position[1];
+ positions[0] = new Position();
+ }
+
+ @Override
+ public Object clone() {
+ Document doc = new Document();
+ doc.docID = docID;
+ doc.termDocFreq = termDocFreq;
+ doc.numPositions = numPositions;
+ doc.positions = new Position[positions.length];
+ for(int i = 0; i < positions.length; i++) {
+ doc.positions[i] = (Position) positions[i].clone();
+ }
+
+ return doc;
+ }
+
+ void reallocPositions(int minSize) {
+ final Position[] newArray = new Position[ArrayUtil.getNextSize(minSize)];
+ System.arraycopy(positions, 0, newArray, 0, positions.length);
+ for(int i=positions.length;i<newArray.length;i++) {
+ newArray[i] = new Position();
+ }
+ positions = newArray;
+ }
+ }
+
+ final Document[] pendingDocs;
+ int pendingDocCount = 0;
+ Document currentDoc;
+ boolean pulsed; // false if we've seen > maxPulsingDocFreq docs
+
+ static class Position {
+ BytesRef payload;
+ int pos;
+
+ @Override
+ public Object clone() {
+ Position position = new Position();
+ position.pos = pos;
+ if (payload != null) {
+ position.payload = new BytesRef(payload);
+ }
+ return position;
+ }
+ }
+
+ // nocommit -- lazy init this? ie, if every single term
+ // was pulsed then we never need to use this fallback?
+ // Fallback writer for non-pulsed terms:
+ final StandardPostingsWriter wrappedPostingsWriter;
+
+ /** If docFreq <= maxPulsingDocFreq, its postings are
+ * inlined into terms dict */
+ public PulsingPostingsWriterImpl(int maxPulsingDocFreq, StandardPostingsWriter wrappedPostingsWriter) throws IOException {
+ super();
+
+ pendingDocs = new Document[maxPulsingDocFreq];
+ for(int i=0;i<maxPulsingDocFreq;i++) {
+ pendingDocs[i] = new Document();
+ }
+
+ // We simply wrap another postings writer, but only call
+ // on it when doc freq is higher than our cutoff
+ this.wrappedPostingsWriter = wrappedPostingsWriter;
+ }
+
+ @Override
+ public void start(IndexOutput termsOut) throws IOException {
+ this.termsOut = termsOut;
+ Codec.writeHeader(termsOut, CODEC, VERSION_CURRENT);
+ termsOut.writeVInt(pendingDocs.length);
+ wrappedPostingsWriter.start(termsOut);
+ }
+
+ @Override
+ public void startTerm() {
+ assert pendingDocCount == 0;
+ pulsed = false;
+ }
+
+ // nocommit -- should we NOT reuse across fields? would
+ // be cleaner
+
+ // Currently, this instance is re-used across fields, so
+ // our parent calls setField whenever the field changes
+ @Override
+ public void setField(FieldInfo fieldInfo) {
+ this.fieldInfo = fieldInfo;
+ omitTF = fieldInfo.omitTermFreqAndPositions;
+ storePayloads = fieldInfo.storePayloads;
+ wrappedPostingsWriter.setField(fieldInfo);
+ }
+
+ @Override
+ public void addDoc(int docID, int termDocFreq) throws IOException {
+
+ assert docID >= 0: "got docID=" + docID;
+
+ if (Codec.DEBUG) {
+ System.out.println("PW.addDoc: docID=" + docID + " pendingDocCount=" + pendingDocCount + " vs " + pendingDocs.length + " pulsed=" + pulsed);
+ }
+
+ if (!pulsed && pendingDocCount == pendingDocs.length) {
+
+ // OK we just crossed the threshold, this term should
+ // now be written with our wrapped codec:
+ wrappedPostingsWriter.startTerm();
+
+ if (Codec.DEBUG) {
+ System.out.println(" now flush buffer");
+ }
+
+ // Flush all buffered docs
+ for(int i=0;i<pendingDocCount;i++) {
+ final Document doc = pendingDocs[i];
+ if (Codec.DEBUG)
+ System.out.println(" docID=" + doc.docID);
+
+ wrappedPostingsWriter.addDoc(doc.docID, doc.termDocFreq);
+
+ if (!omitTF) {
+ assert doc.termDocFreq == doc.numPositions;
+ for(int j=0;j<doc.termDocFreq;j++) {
+ final Position pos = doc.positions[j];
+ if (pos.payload != null && pos.payload.length > 0) {
+ assert storePayloads;
+ wrappedPostingsWriter.addPosition(pos.pos, pos.payload);
+ } else {
+ wrappedPostingsWriter.addPosition(pos.pos, null);
+ }
+ }
+ wrappedPostingsWriter.finishDoc();
+ }
+ }
+
+ pendingDocCount = 0;
+
+ pulsed = true;
+ }
+
+ if (pulsed) {
+ // We've already seen too many docs for this term --
+ // just forward to our fallback writer
+ wrappedPostingsWriter.addDoc(docID, termDocFreq);
+ } else {
+ currentDoc = pendingDocs[pendingDocCount++];
+ currentDoc.docID = docID;
+ // nocommit -- need not store in doc? only used for alloc & assert
+ currentDoc.termDocFreq = termDocFreq;
+ if (termDocFreq > currentDoc.positions.length) {
+ currentDoc.reallocPositions(termDocFreq);
+ }
+ currentDoc.numPositions = 0;
+ }
+ }
+
+ @Override
+ public void addPosition(int position, BytesRef payload) throws IOException {
+ if (pulsed) {
+ wrappedPostingsWriter.addPosition(position, payload);
+ } else {
+ // just buffer up
+ Position pos = currentDoc.positions[currentDoc.numPositions++];
+ pos.pos = position;
+ if (payload != null && payload.length > 0) {
+ if (pos.payload == null) {
+ pos.payload = new BytesRef(payload);
+ } else {
+ pos.payload.copy(payload);
+ }
+ } else if (pos.payload != null) {
+ pos.payload.length = 0;
+ }
+ }
+ }
+
+ @Override
+ public void finishDoc() {
+ assert currentDoc.numPositions == currentDoc.termDocFreq;
+ }
+
+ boolean pendingIsIndexTerm;
+
+ int pulsedCount;
+ int nonPulsedCount;
+
+ /** Called when we are done adding docs to this term */
+ @Override
+ public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
+
+ if (Codec.DEBUG) {
+ System.out.println("PW: finishTerm pendingDocCount=" + pendingDocCount);
+ }
+
+ pendingIsIndexTerm |= isIndexTerm;
+
+ if (pulsed) {
+ wrappedPostingsWriter.finishTerm(docCount, pendingIsIndexTerm);
+ pendingIsIndexTerm = false;
+ pulsedCount++;
+ } else {
+ nonPulsedCount++;
+ // OK, there were few enough occurrences for this
+ // term, so we fully inline our postings data into
+ // terms dict, now:
+ int lastDocID = 0;
+ for(int i=0;i<pendingDocCount;i++) {
+ final Document doc = pendingDocs[i];
+ final int delta = doc.docID - lastDocID;
+ lastDocID = doc.docID;
+ if (omitTF) {
+ termsOut.writeVInt(delta);
+ } else {
+ assert doc.numPositions == doc.termDocFreq;
+ if (doc.numPositions == 1)
+ termsOut.writeVInt((delta<<1)|1);
+ else {
+ termsOut.writeVInt(delta<<1);
+ termsOut.writeVInt(doc.numPositions);
+ }
+
+ // TODO: we could do better in encoding
+ // payloadLength, eg, if it's always the same
+ // across all terms
+ int lastPosition = 0;
+ int lastPayloadLength = -1;
+
+ for(int j=0;j<doc.numPositions;j++) {
+ final Position pos = doc.positions[j];
+ final int delta2 = pos.pos - lastPosition;
+ lastPosition = pos.pos;
+ if (storePayloads) {
+ final int payloadLength = pos.payload == null ? 0 : pos.payload.length;
+ if (payloadLength != lastPayloadLength) {
+ termsOut.writeVInt((delta2 << 1)|1);
+ termsOut.writeVInt(payloadLength);
+ lastPayloadLength = payloadLength;
+ } else {
+ termsOut.writeVInt(delta2 << 1);
+ }
+
+ if (payloadLength > 0) {
+ termsOut.writeBytes(pos.payload.bytes, 0, pos.payload.length);
+ }
+ } else {
+ termsOut.writeVInt(delta2);
+ }
+ }
+ }
+ }
+ }
+
+ pendingDocCount = 0;
+ }
+
+ @Override
+ public void close() throws IOException {
+ wrappedPostingsWriter.close();
+ }
+}
Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java
------------------------------------------------------------------------------
svn:eol-style = native
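For concreteness, the inlined encoding finishTerm writes (non-omitTF path, payloads off): each doc delta carries the freq==1 flag in its low bit, and positions follow as raw deltas. A self-contained, illustrative sketch using a hand-rolled Lucene-style vInt (not part of this commit):

    import java.io.ByteArrayOutputStream;

    public class PulsedEncodingSketch {
      // Lucene-style vInt: 7 data bits per byte, high bit means "more bytes".
      static void writeVInt(ByteArrayOutputStream out, int i) {
        while ((i & ~0x7F) != 0) {
          out.write((i & 0x7F) | 0x80);
          i >>>= 7;
        }
        out.write(i);
      }

      public static void main(String[] args) {
        // A term occurring in doc 5 (one position: 2) and doc 8 (positions 3, 7):
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        writeVInt(out, (5 << 1) | 1); // doc delta 5, low bit set: numPositions == 1
        writeVInt(out, 2);            // position 2 (no payloads, so raw delta)
        writeVInt(out, 3 << 1);       // doc delta 3, low bit clear...
        writeVInt(out, 2);            // ...so numPositions is written explicitly
        writeVInt(out, 3);            // position delta 3
        writeVInt(out, 4);            // position delta 7 - 3 = 4
        System.out.println(out.size() + " bytes inlined into the terms dict"); // 6
      }
    }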
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/IntIndexInput.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/IntIndexInput.java?rev=904750&r1=904749&r2=904750&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/IntIndexInput.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/IntIndexInput.java Sat Jan 30 10:16:35 2010
@@ -36,8 +36,7 @@
public abstract Index index() throws IOException;
- public class IndexState {};
-
+ // nocommit -- can we simplify this?
public abstract static class Index {
// nocommit
@@ -50,11 +49,7 @@
public abstract void set(Index other);
- // nocommit handle with set and/or clone?
- public abstract IndexState captureState();
-
- // nocommit handle with set and/or clone?
- public abstract void setState(IndexState state);
+ public abstract Object clone();
}
public abstract static class Reader {
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/IntIndexOutput.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/IntIndexOutput.java?rev=904750&r1=904749&r2=904750&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/IntIndexOutput.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/IntIndexOutput.java Sat Jan 30 10:16:35 2010
@@ -33,6 +33,7 @@
*
* @lucene.experimental */
public abstract class IntIndexOutput implements Closeable {
+
/** Write an int to the primary file */
public abstract void write(int v) throws IOException;
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java?rev=904750&r1=904749&r2=904750&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java Sat Jan 30 10:16:35 2010
@@ -28,12 +28,13 @@
import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexReader;
import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexWriter;
-import org.apache.lucene.index.codecs.standard.StandardDocsConsumer;
-import org.apache.lucene.index.codecs.standard.StandardDocsProducer;
+import org.apache.lucene.index.codecs.standard.StandardPostingsReader;
+import org.apache.lucene.index.codecs.standard.StandardPostingsWriter;
import org.apache.lucene.index.codecs.standard.StandardTermsDictReader;
import org.apache.lucene.index.codecs.standard.StandardTermsDictWriter;
import org.apache.lucene.index.codecs.standard.StandardTermsIndexReader;
import org.apache.lucene.index.codecs.standard.StandardTermsIndexWriter;
+import org.apache.lucene.index.codecs.standard.StandardCodec;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
@@ -47,7 +48,7 @@
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
- StandardDocsConsumer docsWriter = new SepDocsWriter(state, new SingleIntFactory());
+ StandardPostingsWriter postingsWriter = new SepPostingsWriterImpl(state, new SingleIntFactory());
boolean success = false;
StandardTermsIndexWriter indexWriter;
@@ -56,19 +57,19 @@
success = true;
} finally {
if (!success) {
- docsWriter.close();
+ postingsWriter.close();
}
}
success = false;
try {
- FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docsWriter, BytesRef.getUTF8SortedAsUTF16Comparator());
+ FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUTF16Comparator());
success = true;
return ret;
} finally {
if (!success) {
try {
- docsWriter.close();
+ postingsWriter.close();
} finally {
indexWriter.close();
}
@@ -85,7 +86,7 @@
@Override
public FieldsProducer fieldsProducer(Directory dir, FieldInfos fieldInfos, SegmentInfo si, int readBufferSize, int indexDivisor) throws IOException {
- StandardDocsProducer docsReader = new SepDocsReader(dir, si, readBufferSize, new SingleIntFactory());
+ StandardPostingsReader postingsReader = new SepPostingsReaderImpl(dir, si, readBufferSize, new SingleIntFactory());
StandardTermsIndexReader indexReader;
boolean success = false;
@@ -98,7 +99,7 @@
success = true;
} finally {
if (!success) {
- docsReader.close();
+ postingsReader.close();
}
}
@@ -106,15 +107,16 @@
try {
FieldsProducer ret = new StandardTermsDictReader(indexReader,
dir, fieldInfos, si.name,
- docsReader,
+ postingsReader,
readBufferSize,
- BytesRef.getUTF8SortedAsUTF16Comparator());
+ BytesRef.getUTF8SortedAsUTF16Comparator(),
+ StandardCodec.TERMS_CACHE_SIZE);
success = true;
return ret;
} finally {
if (!success) {
try {
- docsReader.close();
+ postingsReader.close();
} finally {
indexReader.close();
}
@@ -124,7 +126,7 @@
@Override
public void files(Directory dir, SegmentInfo segmentInfo, Collection<String> files) {
- SepDocsReader.files(segmentInfo, files);
+ SepPostingsReaderImpl.files(segmentInfo, files);
StandardTermsDictReader.files(dir, segmentInfo, files);
SimpleStandardTermsIndexReader.files(dir, segmentInfo, files);
}
Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java?rev=904750&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java Sat Jan 30 10:16:35 2010
@@ -0,0 +1,814 @@
+package org.apache.lucene.index.codecs.sep;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collection;
+
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.codecs.Codec;
+import org.apache.lucene.index.codecs.standard.StandardPostingsReader;
+import org.apache.lucene.index.codecs.standard.TermState;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+
+/** Concrete class that reads the current doc/freq/skip
+ * postings format.
+ *
+ * @lucene.experimental
+ */
+
+// nocommit -- should we switch "hasProx" higher up? and
+// create two separate docs readers, one that also reads
+// prox and one that doesn't?
+
+public class SepPostingsReaderImpl extends StandardPostingsReader {
+
+ final IntIndexInput freqIn;
+ final IntIndexInput docIn;
+ final IntIndexInput posIn;
+ final IndexInput payloadIn;
+ final IndexInput skipIn;
+
+ int skipInterval;
+ int maxSkipLevels;
+
+ public SepPostingsReaderImpl(Directory dir, SegmentInfo segmentInfo, int readBufferSize, IntStreamFactory intFactory) throws IOException {
+
+ boolean success = false;
+ try {
+
+ final String docFileName = IndexFileNames.segmentFileName(segmentInfo.name, SepCodec.DOC_EXTENSION);
+ docIn = intFactory.openInput(dir, docFileName);
+
+ skipIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, SepCodec.SKIP_EXTENSION), readBufferSize);
+
+ if (segmentInfo.getHasProx()) {
+ freqIn = intFactory.openInput(dir, IndexFileNames.segmentFileName(segmentInfo.name, SepCodec.FREQ_EXTENSION));
+ posIn = intFactory.openInput(dir, IndexFileNames.segmentFileName(segmentInfo.name, SepCodec.POS_EXTENSION), readBufferSize);
+ payloadIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, SepCodec.PAYLOAD_EXTENSION), readBufferSize);
+ } else {
+ posIn = null;
+ payloadIn = null;
+ freqIn = null;
+ }
+ success = true;
+ } finally {
+ if (!success) {
+ close();
+ }
+ }
+ }
+
+ public static void files(SegmentInfo segmentInfo, Collection<String> files) {
+ files.add(IndexFileNames.segmentFileName(segmentInfo.name, SepCodec.DOC_EXTENSION));
+ files.add(IndexFileNames.segmentFileName(segmentInfo.name, SepCodec.SKIP_EXTENSION));
+
+ if (segmentInfo.getHasProx()) {
+ files.add(IndexFileNames.segmentFileName(segmentInfo.name, SepCodec.FREQ_EXTENSION));
+ files.add(IndexFileNames.segmentFileName(segmentInfo.name, SepCodec.POS_EXTENSION));
+ files.add(IndexFileNames.segmentFileName(segmentInfo.name, SepCodec.PAYLOAD_EXTENSION));
+ }
+ }
+
+ @Override
+ public void init(IndexInput termsIn) throws IOException {
+ // Make sure we are talking to the matching past writer
+ Codec.checkHeader(termsIn, SepPostingsWriterImpl.CODEC, SepPostingsWriterImpl.VERSION_START);
+ skipInterval = termsIn.readInt();
+ maxSkipLevels = termsIn.readInt();
+ }
+
+ @Override
+ public void close() throws IOException {
+ try {
+ if (freqIn != null)
+ freqIn.close();
+ } finally {
+ try {
+ if (docIn != null)
+ docIn.close();
+ } finally {
+ try {
+ if (skipIn != null)
+ skipIn.close();
+ } finally {
+ try {
+ if (posIn != null) {
+ posIn.close();
+ }
+ } finally {
+ if (payloadIn != null) {
+ payloadIn.close();
+ }
+ }
+ }
+ }
+ }
+ }
+
+ private static class SepTermState extends TermState {
+ IntIndexInput.Index docIndex;
+ IntIndexInput.Index freqIndex;
+ IntIndexInput.Index posIndex;
+ long skipOffset;
+ long payloadOffset;
+
+ public Object clone() {
+ SepTermState other = (SepTermState) super.clone();
+ other.docIndex = (IntIndexInput.Index) docIndex.clone();
+ if (freqIndex != null) {
+ other.freqIndex = (IntIndexInput.Index) freqIndex.clone();
+ }
+ if (posIndex != null) {
+ other.posIndex = (IntIndexInput.Index) posIndex.clone();
+ }
+ return other;
+ }
+
+ public void copy(TermState _other) {
+ super.copy(_other);
+ SepTermState other = (SepTermState) _other;
+ docIndex.set(other.docIndex);
+ if (other.posIndex != null) {
+ if (posIndex == null) {
+ posIndex = (IntIndexInput.Index) other.posIndex.clone();
+ } else {
+ posIndex.set(other.posIndex);
+ }
+ }
+ if (other.freqIndex != null) {
+ if (freqIndex == null) {
+ freqIndex = (IntIndexInput.Index) other.freqIndex.clone();
+ } else {
+ freqIndex.set(other.freqIndex);
+ }
+ }
+ skipOffset = other.skipOffset;
+ payloadOffset = other.payloadOffset;
+ }
+ }
+
+ @Override
+ public TermState newTermState() throws IOException {
+ final SepTermState state = new SepTermState();
+ state.docIndex = docIn.index();
+ return state;
+ }
+
+ @Override
+ public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState _termState, boolean isIndexTerm) throws IOException {
+ final SepTermState termState = (SepTermState) _termState;
+
+ if (Codec.DEBUG) {
+ System.out.println(" dr.readTerm termsFP=" + termsIn.getFilePointer() + " df=" + termState.docFreq + " isIndex=" + isIndexTerm);
+ System.out.println(" start freqFP=" + termState.freqIndex + " docFP=" + termState.docIndex + " skipFP=" + termState.skipOffset);
+ }
+
+ // read freq index
+ if (!fieldInfo.omitTermFreqAndPositions) {
+ if (termState.freqIndex == null) {
+ assert isIndexTerm;
+ termState.freqIndex = freqIn.index();
+ termState.posIndex = posIn.index();
+ }
+ termState.freqIndex.read(termsIn, isIndexTerm);
+ }
+
+ // read doc index
+ termState.docIndex.read(termsIn, isIndexTerm);
+
+ // read skip index
+ if (isIndexTerm) {
+ termState.skipOffset = termsIn.readVLong();
+ } else if (termState.docFreq >= skipInterval) {
+ termState.skipOffset += termsIn.readVLong();
+ }
+
+ // read pos, payload index
+ if (!fieldInfo.omitTermFreqAndPositions) {
+ termState.posIndex.read(termsIn, isIndexTerm);
+ final long v = termsIn.readVLong();
+ if (isIndexTerm) {
+ termState.payloadOffset = v;
+ } else {
+ termState.payloadOffset += v;
+ }
+ }
+
+ if (Codec.DEBUG) {
+ System.out.println(" freqFP=" + termState.freqIndex + " docFP=" + termState.docIndex + " skipFP=" + termState.skipOffset);
+ }
+ }
+
+ @Override
+ public DocsEnum docs(FieldInfo fieldInfo, TermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException {
+ final SepTermState termState = (SepTermState) _termState;
+ if (reuse == null) {
+ return (new SepDocsEnum()).init(fieldInfo, termState, skipDocs);
+ } else {
+ return ((SepDocsEnum) reuse).init(fieldInfo, termState, skipDocs);
+ }
+ }
+
+ @Override
+ public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
+ assert !fieldInfo.omitTermFreqAndPositions;
+ final SepTermState termState = (SepTermState) _termState;
+ if (reuse == null) {
+ return (new SepDocsAndPositionsEnum()).init(fieldInfo, termState, skipDocs);
+ } else {
+ return ((SepDocsAndPositionsEnum) reuse).init(fieldInfo, termState, skipDocs);
+ }
+ }
+
+ class SepDocsEnum extends DocsEnum {
+ int docFreq;
+ int doc;
+ int count;
+ int freq;
+ long freqStart;
+
+ // nocommit -- should we do omitTF with 2 different enum classes?
+ private boolean omitTF;
+ private boolean storePayloads;
+ private Bits skipDocs;
+ private final IntIndexInput.Reader docReader;
+ private final IntIndexInput.Reader freqReader;
+ private long skipOffset;
+
+ private final IntIndexInput.Index docIndex;
+ private final IntIndexInput.Index freqIndex;
+ private final IntIndexInput.Index posIndex;
+
+ // nocommit -- should we do hasProx with 2 different enum classes?
+
+ boolean skipped;
+ SepSkipListReader skipper;
+
+ SepDocsEnum() throws IOException {
+ if (Codec.DEBUG) {
+ Codec.debug("sep: new DocsEnum");
+ }
+ docReader = docIn.reader();
+ docIndex = docIn.index();
+ if (freqIn != null) {
+ freqReader = freqIn.reader();
+ freqIndex = freqIn.index();
+ } else {
+ freqReader = null;
+ freqIndex = null;
+ }
+ if (posIn != null) {
+ posIndex = posIn.index(); // only init this so skipper can read it
+ } else {
+ posIndex = null;
+ }
+ }
+
+ SepDocsEnum init(FieldInfo fieldInfo, SepTermState termState, Bits skipDocs) throws IOException {
+ if (Codec.DEBUG) {
+ System.out.println("[" + desc + "] dr.init freqIn seek " + freqIndex + " this=" + this + " (in=" + freqIn + "; this=" + this + ")");
+ }
+ this.skipDocs = skipDocs;
+ omitTF = fieldInfo.omitTermFreqAndPositions;
+ storePayloads = fieldInfo.storePayloads;
+
+ // nocommit: can't we only do this if consumer
+ // skipped consuming the previous docs?
+ docIndex.set(termState.docIndex);
+ docIndex.seek(docReader);
+
+ skipOffset = termState.skipOffset;
+
+ if (!omitTF) {
+ freqIndex.set(termState.freqIndex);
+ freqIndex.seek(freqReader);
+ } else {
+ freq = 1;
+ }
+ docFreq = termState.docFreq;
+ count = 0;
+ doc = 0;
+ skipped = false;
+
+ return this;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+
+ if (Codec.DEBUG) {
+ if (!omitTF) {
+ Codec.debug("sep.reader.docs.nextDoc count=" + count + " vs df=" + docFreq + " freqFP=" + freqReader.descFilePointer() + " docFP=" + docReader.descFilePointer() + " skipDocs?=" + (skipDocs != null), desc);
+ } else {
+ Codec.debug("sep.reader.docs.nextDoc count=" + count + " vs df=" + docFreq + " docFP=" + docReader.descFilePointer() + " skipDocs?=" + (skipDocs != null), desc);
+ }
+ }
+
+ while(true) {
+ if (count == docFreq) {
+ return doc = NO_MORE_DOCS;
+ }
+
+ count++;
+
+ // Decode next doc
+ doc += docReader.next();
+
+ if (!omitTF) {
+ freq = freqReader.next();
+ }
+
+ if (Codec.DEBUG) {
+ System.out.println(" decode doc=" + doc + " freq=" + freq);
+ }
+
+ if (skipDocs == null || !skipDocs.get(doc)) {
+ break;
+ } else if (Codec.DEBUG) {
+ System.out.println(" doc=" + doc + " is skipped");
+ }
+ }
+
+ // nocommit
+ if (Codec.DEBUG) {
+ System.out.println(" return doc=" + doc);
+ }
+ return doc;
+ }
+
+ @Override
+ public int read(int[] docs, int[] freqs) throws IOException {
+ // nocommit -- switch to bulk read api in IntIndexInput
+ int i = 0;
+ final int length = docs.length;
+ while (i < length && count < docFreq) {
+ count++;
+ // manually inlined call to next() for speed
+ doc += docReader.next();
+ if (!omitTF) {
+ freq = freqReader.next();
+ }
+
+ if (skipDocs == null || !skipDocs.get(doc)) {
+ docs[i] = doc;
+ freqs[i] = freq;
+ i++;
+ }
+ }
+
+ return i;
+ }
+
+ @Override
+ public int freq() {
+ return freq;
+ }
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+
+ // TODO: jump right to next() if target is < X away
+ // from where we are now?
+
+ if (Codec.DEBUG) {
+ Codec.debug("sep.reader.docs: advance target=" + target + " omitTF=" + omitTF, desc);
+ }
+
+ if (docFreq >= skipInterval) {
+
+ // There are enough docs in the posting to have
+ // skip data
+
+ if (skipper == null) {
+ // This DocsEnum has never done any skipping
+ if (Codec.DEBUG) {
+ System.out.println(" create skipper");
+ }
+
+ skipper = new SepSkipListReader((IndexInput) skipIn.clone(),
+ freqIn,
+ docIn,
+ posIn,
+ maxSkipLevels, skipInterval);
+
+ }
+
+ if (!skipped) {
+ // We haven't yet skipped for this posting
+ skipper.init(skipOffset,
+ docIndex,
+ freqIndex,
+ posIndex,
+ 0,
+ docFreq,
+ storePayloads);
+ skipper.setOmitTF(omitTF);
+
+ if (Codec.DEBUG) {
+ System.out.println(" init skipper: base skipFP=" + skipOffset + " docFP=" + docIndex + " freqFP=" + freqIndex);
+ }
+
+ skipped = true;
+ }
+
+ final int newCount = skipper.skipTo(target);
+
+ if (newCount > count) {
+
+ // Skipper did move
+ if (Codec.DEBUG) {
+ System.out.println("sdr [" + desc + "]: skipper moved to newCount=" + newCount +
+ " docFP=" + skipper.getDocIndex() +
+ " freqFP=" + skipper.getFreqIndex() +
+ " doc=" + skipper.getDoc());
+ }
+
+ if (!omitTF) {
+ skipper.getFreqIndex().seek(freqReader);
+ }
+ skipper.getDocIndex().seek(docReader);
+ count = newCount;
+ doc = skipper.getDoc();
+ } else if (Codec.DEBUG) {
+ System.out.println(" no skipping to be done");
+ }
+ }
+
+ // Now, linear scan for the rest:
+ do {
+ if (nextDoc() == NO_MORE_DOCS) {
+ return NO_MORE_DOCS;
+ }
+ } while (target > doc);
+
+ if (Codec.DEBUG) {
+ Codec.debug(" skip return doc=" + doc);
+ }
+
+ return doc;
+ }
+ }
+
+ class SepDocsAndPositionsEnum extends DocsAndPositionsEnum {
+ int docFreq;
+ int doc;
+ int count;
+ int freq;
+ long freqStart;
+
+ // nocommit -- should we do omitTF with 2 different enum classes?
+ private boolean omitTF;
+ private boolean storePayloads;
+ private Bits skipDocs;
+ private final IntIndexInput.Reader docReader;
+ private final IntIndexInput.Reader freqReader;
+ private final IntIndexInput.Reader posReader;
+ private final IndexInput payloadIn;
+ private long skipOffset;
+
+ private final IntIndexInput.Index docIndex;
+ private final IntIndexInput.Index freqIndex;
+ private final IntIndexInput.Index posIndex;
+ private long payloadOffset;
+
+ private int pendingPosCount;
+ private int position;
+ private int payloadLength;
+ private long pendingPayloadBytes;
+
+ private boolean skipped;
+ private SepSkipListReader skipper;
+ private boolean payloadPending;
+ private boolean posSeekPending;
+
+ SepDocsAndPositionsEnum() throws IOException {
+ if (Codec.DEBUG) {
+ Codec.debug("sep: new DocsAndPositionsEnum");
+ }
+ docReader = docIn.reader();
+ docIndex = docIn.index();
+ freqReader = freqIn.reader();
+ freqIndex = freqIn.index();
+ posReader = posIn.reader();
+ posIndex = posIn.index();
+ payloadIn = (IndexInput) SepPostingsReaderImpl.this.payloadIn.clone();
+ }
+
+ SepDocsAndPositionsEnum init(FieldInfo fieldInfo, SepTermState termState, Bits skipDocs) throws IOException {
+ if (Codec.DEBUG) {
+ Codec.debug("sep.reader.init freqIn seek " + termState.freqIndex);
+ }
+ this.skipDocs = skipDocs;
+ storePayloads = fieldInfo.storePayloads;
+
+ // nocommit: can't we only do this if consumer
+ // skipped consuming the previous docs?
+ docIndex.set(termState.docIndex);
+ docIndex.seek(docReader);
+
+ freqIndex.set(termState.freqIndex);
+ freqIndex.seek(freqReader);
+
+ posIndex.set(termState.posIndex);
+ posSeekPending = true;
+ //posIndex.seek(posReader);
+
+ skipOffset = termState.skipOffset;
+ payloadOffset = termState.payloadOffset;
+ //payloadIn.seek(payloadOffset);
+
+ docFreq = termState.docFreq;
+ count = 0;
+ doc = 0;
+ pendingPosCount = 0;
+ pendingPayloadBytes = 0;
+ skipped = false;
+
+ return this;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+
+ if (Codec.DEBUG) {
+ if (!omitTF) {
+ Codec.debug("sep.reader.nextDoc next count=" + count + " vs df=" + docFreq + " freqFP=" + freqReader.descFilePointer() + " docFP=" + docReader.descFilePointer() + " skipDocs?=" + (skipDocs != null), desc);
+ } else {
+ Codec.debug("sep.reader.nextDoc next count=" + count + " vs df=" + docFreq + " docFP=" + docReader.descFilePointer() + " skipDocs?=" + (skipDocs != null), desc);
+ }
+ }
+
+ while(true) {
+ if (count == docFreq) {
+ return doc = NO_MORE_DOCS;
+ }
+
+ count++;
+
+ // TODO: maybe we should do the 1-bit trick for encoding
+ // freq=1 case?
+
+ // Decode next doc
+ doc += docReader.next();
+
+ freq = freqReader.next();
+
+ pendingPosCount += freq;
+
+ if (Codec.DEBUG) {
+ System.out.println(" decode doc=" + doc + " freq=" + freq);
+ }
+
+ if (skipDocs == null || !skipDocs.get(doc)) {
+ break;
+ } else if (Codec.DEBUG) {
+ System.out.println(" doc=" + doc + " is skipped");
+ }
+ }
+
+ // nocommit
+ if (Codec.DEBUG) {
+ System.out.println(" return doc=" + doc);
+ }
+ position = 0;
+ return doc;
+ }
+
+ @Override
+ public int freq() {
+ return freq;
+ }
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+
+ // TODO: jump right to next() if target is < X away
+ // from where we are now?
+
+ if (Codec.DEBUG) {
+ Codec.debug("sep.reader.advance current doc=" + doc + " target=" + target, desc);
+ }
+
+ if (docFreq >= skipInterval) {
+
+ // There are enough docs in the posting to have
+ // skip data
+
+ if (skipper == null) {
+ // This DocsEnum has never done any skipping
+ if (Codec.DEBUG) {
+ System.out.println(" create skipper");
+ }
+
+ skipper = new SepSkipListReader((IndexInput) skipIn.clone(),
+ freqIn,
+ docIn,
+ posIn,
+ maxSkipLevels, skipInterval);
+ }
+
+ if (!skipped) {
+ // We haven't yet skipped for this posting
+ skipper.init(skipOffset,
+ docIndex,
+ freqIndex,
+ posIndex,
+ payloadOffset,
+ docFreq,
+ storePayloads);
+
+ if (Codec.DEBUG) {
+ System.out.println(" init skipper: base skipFP=" + skipOffset + " docFP=" + docIndex + " freqFP=" + freqIndex + " proxFP=" +
+ posIndex + " payloadFP=" + payloadOffset);
+ }
+
+ skipped = true;
+ }
+
+ final int newCount = skipper.skipTo(target);
+
+ if (newCount > count) {
+
+ // Skipper did move
+ if (Codec.DEBUG) {
+ Codec.debug(" skipper moved to newCount=" + newCount +
+ " docFP=" + skipper.getDocIndex() +
+ " freqFP=" + skipper.getFreqIndex() +
+ " doc=" + skipper.getDoc());
+ }
+
+ skipper.getFreqIndex().seek(freqReader);
+ skipper.getDocIndex().seek(docReader);
+ //skipper.getPosIndex().seek(posReader);
+ posIndex.set(skipper.getPosIndex());
+ posSeekPending = true;
+ count = newCount;
+ doc = skipper.getDoc();
+ //payloadIn.seek(skipper.getPayloadPointer());
+ payloadOffset = skipper.getPayloadPointer();
+ pendingPosCount = 0;
+ pendingPayloadBytes = 0;
+ payloadPending = false;
+ payloadLength = skipper.getPayloadLength();
+
+ } else if (Codec.DEBUG) {
+ System.out.println(" no skipping to be done");
+ }
+
+ } else {
+ if (Codec.DEBUG) {
+ Codec.debug("[" + desc + "]: no skip data");
+ }
+ }
+
+ // Now, linear scan for the rest:
+ do {
+ if (nextDoc() == NO_MORE_DOCS) {
+ return NO_MORE_DOCS;
+ }
+ } while (target > doc);
+
+ if (Codec.DEBUG) {
+ Codec.debug("advance done return doc=" + doc, desc);
+ }
+ return doc;
+ }
+
+ @Override
+ public int nextPosition() throws IOException {
+ if (Codec.DEBUG) {
+ Codec.debug("sep.reader.nextPos pendingPosCount=" + pendingPosCount + " freq=" + freq, desc);
+ }
+
+ if (posSeekPending) {
+ posIndex.seek(posReader);
+ payloadIn.seek(payloadOffset);
+ posSeekPending = false;
+ }
+
+ // scan over any docs that were iterated without their
+ // positions
+ while (pendingPosCount > freq) {
+ if (Codec.DEBUG) {
+ System.out.println(" skip position payloadBytesPending=" + pendingPayloadBytes);
+ }
+ final int code = posReader.next();
+ if (storePayloads) {
+ if ((code & 1) != 0) {
+ // Payload length has changed
+ payloadLength = posReader.next();
+ assert payloadLength >= 0;
+ if (Codec.DEBUG) {
+ System.out.println(" new payloadLen=" + payloadLength);
+ }
+ }
+ }
+ pendingPosCount--;
+ payloadPending = true;
+ position = 0;
+ pendingPayloadBytes += payloadLength;
+ }
+
+ final int code = posReader.next();
+ if (storePayloads) {
+ if ((code & 1) != 0) {
+ // Payload length has changed
+ payloadLength = posReader.next();
+ assert payloadLength >= 0;
+ if (Codec.DEBUG) {
+ System.out.println(" new payloadLen=" + payloadLength);
+ }
+ }
+ position += code >> 1;
+ } else {
+ position += code;
+ }
+
+    pendingPayloadBytes += payloadLength;
+    pendingPosCount--;
+    payloadPending = true;
+ assert pendingPosCount >= 0;
+
+ if (Codec.DEBUG) {
+ System.out.println(" return pos=" + position);
+ }
+ return position;
+ }
+
+ @Override
+ public int getPayloadLength() {
+ return payloadLength;
+ }
+
+ private BytesRef payload;
+
+ @Override
+ public BytesRef getPayload() throws IOException {
+ if (!payloadPending) {
+ throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
+ }
+
+ if (Codec.DEBUG) {
+ Codec.debug(" getPayload payloadFP=" + payloadIn.getFilePointer() + " len=" + payloadLength + " pendingPayloadBytes=" + pendingPayloadBytes, desc);
+ }
+
+ assert pendingPayloadBytes >= payloadLength;
+
+ if (pendingPayloadBytes > payloadLength) {
+ payloadIn.seek(payloadIn.getFilePointer() + (pendingPayloadBytes - payloadLength));
+ }
+
+ if (payload == null) {
+ payload = new BytesRef();
+ payload.bytes = new byte[payloadLength];
+ } else if (payload.bytes.length < payloadLength) {
+ payload.grow(payloadLength);
+ }
+
+ payloadIn.readBytes(payload.bytes, 0, payloadLength);
+ payloadPending = false;
+ payload.length = payloadLength;
+ pendingPayloadBytes = 0;
+ return payload;
+ }
+
+ @Override
+ public boolean hasPayload() {
+ return payloadPending && payloadLength > 0;
+ }
+ }
+}
Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java
------------------------------------------------------------------------------
svn:eol-style = native
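The position stream decoded above uses the same sticky payload-length convention as the standard codec: when a field stores payloads, each position delta is shifted left one bit and the low bit flags a payload-length change; the length stays in force until re-written. A self-contained, illustrative sketch of that decode loop (not part of this commit):

    public class SepPositionDecodeSketch {
      public static void main(String[] args) {
        // Simulated .pos ints for one doc with positions 3, 5, 10 and
        // payload lengths 4, 4, 0; the length is only encoded when it changes:
        final int[] stream = { (3 << 1) | 1, 4, 2 << 1, (5 << 1) | 1, 0 };
        int position = 0;
        int payloadLength = -1;
        int i = 0;
        while (i < stream.length) {
          final int code = stream[i++];
          if ((code & 1) != 0) {
            payloadLength = stream[i++]; // low bit set: payload length changed
          }
          position += code >>> 1;
          System.out.println("pos=" + position + " payloadLen=" + payloadLength);
        }
      }
    }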
Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java?rev=904750&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java Sat Jan 30 10:16:35 2010
@@ -0,0 +1,334 @@
+package org.apache.lucene.index.codecs.sep;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.codecs.Codec;
+import org.apache.lucene.index.codecs.standard.StandardPostingsWriter;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+
+/** Writes frq to .frq, docs to .doc, pos to .pos, payloads
+ * to .pyl, skip data to .skp
+ *
+ * @lucene.experimental */
+public final class SepPostingsWriterImpl extends StandardPostingsWriter {
+ final static String CODEC = "SepDocFreqSkip";
+
+ // Increment version to change it:
+ final static int VERSION_START = 0;
+ final static int VERSION_CURRENT = VERSION_START;
+
+ final IntIndexOutput freqOut;
+ final IntIndexOutput.Index freqIndex;
+
+ final IntIndexOutput posOut;
+ final IntIndexOutput.Index posIndex;
+
+ final IntIndexOutput docOut;
+ final IntIndexOutput.Index docIndex;
+
+ final IndexOutput payloadOut;
+
+ final IndexOutput skipOut;
+ IndexOutput termsOut;
+
+ final SepSkipListWriter skipListWriter;
+ final int skipInterval;
+ final int maxSkipLevels;
+ final int totalNumDocs;
+
+ boolean storePayloads;
+ boolean omitTF;
+
+ // Starts a new term
+ long lastSkipStart;
+
+ FieldInfo fieldInfo;
+
+ int lastPayloadLength;
+ int lastPosition;
+ long payloadStart;
+ long lastPayloadStart;
+ int lastDocID;
+ int df;
+ int count;
+
+ public SepPostingsWriterImpl(SegmentWriteState state, IntStreamFactory factory) throws IOException {
+ super();
+
+ final String docFileName = IndexFileNames.segmentFileName(state.segmentName, SepCodec.DOC_EXTENSION);
+ state.flushedFiles.add(docFileName);
+ docOut = factory.createOutput(state.directory, docFileName);
+ docIndex = docOut.index();
+
+ if (state.fieldInfos.hasProx()) {
+ final String frqFileName = IndexFileNames.segmentFileName(state.segmentName, SepCodec.FREQ_EXTENSION);
+ state.flushedFiles.add(frqFileName);
+ freqOut = factory.createOutput(state.directory, frqFileName);
+ freqIndex = freqOut.index();
+
+ final String posFileName = IndexFileNames.segmentFileName(state.segmentName, SepCodec.POS_EXTENSION);
+ posOut = factory.createOutput(state.directory, posFileName);
+ state.flushedFiles.add(posFileName);
+ posIndex = posOut.index();
+
+ // nocommit -- only if at least one field stores payloads?
+ final String payloadFileName = IndexFileNames.segmentFileName(state.segmentName, SepCodec.PAYLOAD_EXTENSION);
+ state.flushedFiles.add(payloadFileName);
+ payloadOut = state.directory.createOutput(payloadFileName);
+
+ } else {
+ freqOut = null;
+ freqIndex = null;
+ posOut = null;
+ posIndex = null;
+ payloadOut = null;
+ }
+
+ final String skipFileName = IndexFileNames.segmentFileName(state.segmentName, SepCodec.SKIP_EXTENSION);
+ state.flushedFiles.add(skipFileName);
+ skipOut = state.directory.createOutput(skipFileName);
+
+ totalNumDocs = state.numDocs;
+
+ // nocommit -- abstraction violation
+ skipListWriter = new SepSkipListWriter(state.skipInterval,
+ state.maxSkipLevels,
+ state.numDocs,
+ freqOut, docOut,
+ posOut, payloadOut);
+
+ skipInterval = state.skipInterval;
+ maxSkipLevels = state.maxSkipLevels;
+ }
+
+ @Override
+ public void start(IndexOutput termsOut) throws IOException {
+ this.termsOut = termsOut;
+ Codec.writeHeader(termsOut, CODEC, VERSION_CURRENT);
+ // nocommit -- just ask skipper to "start" here
+ termsOut.writeInt(skipInterval); // write skipInterval
+ termsOut.writeInt(maxSkipLevels); // write maxSkipLevels
+ }
+
+ @Override
+ public void startTerm() throws IOException {
+ if (Codec.DEBUG) {
+ Codec.debug("sep.writer.startTerm");
+ }
+ docIndex.mark();
+ if (!omitTF) {
+ freqIndex.mark();
+ posIndex.mark();
+ payloadStart = payloadOut.getFilePointer();
+ lastPayloadLength = -1;
+ }
+ skipListWriter.resetSkip(docIndex, freqIndex, posIndex);
+ }
+
+ // nocommit -- should we NOT reuse across fields? would
+ // be cleaner
+
+ // Currently, this instance is re-used across fields, so
+ // our parent calls setField whenever the field changes
+ @Override
+ public void setField(FieldInfo fieldInfo) {
+ this.fieldInfo = fieldInfo;
+ omitTF = fieldInfo.omitTermFreqAndPositions;
+ skipListWriter.setOmitTF(omitTF);
+ storePayloads = !omitTF && fieldInfo.storePayloads;
+ }
+
+
+ /** Adds a new doc in this term. */
+ @Override
+ public void addDoc(int docID, int termDocFreq) throws IOException {
+
+ final int delta = docID - lastDocID;
+
+ if (Codec.DEBUG) {
+ System.out.println(" dw.addDoc [" + desc + "] count=" + (count++) + " docID=" + docID + " lastDocID=" + lastDocID + " delta=" + delta + " omitTF=" + omitTF + " freq=" + termDocFreq);
+ }
+
+ if (docID < 0 || (df > 0 && delta <= 0)) {
+ throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )");
+ }
+
+ if ((++df % skipInterval) == 0) {
+ // nocommit -- awkward we have to make these two
+ // separate calls to skipper
+ skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
+ skipListWriter.bufferSkip(df);
+
+ if (Codec.DEBUG) {
+ System.out.println(" bufferSkip lastDocID=" + lastDocID +
+ " df=" + df +
+ " docFP=" + docOut.descFilePointer() +
+ " freqFP=" + freqOut.descFilePointer() +
+ " posFP=" + posOut.descFilePointer() +
+ " payloadFP=" + payloadOut.getFilePointer() +
+ " payloadLen=" + lastPayloadLength);
+ }
+ }
+
+ lastDocID = docID;
+ docOut.write(delta);
+ if (!omitTF) {
+ freqOut.write(termDocFreq);
+ }
+ }
+
+ /** Add a new position & payload */
+ @Override
+ public void addPosition(int position, BytesRef payload) throws IOException {
+ assert !omitTF;
+
+ if (Codec.DEBUG) {
+ if (payload != null && payload.length > 0) {
+ System.out.println("pw.addPos [" + desc + "]: pos=" + position + " posFP=" + posOut.descFilePointer() + " payloadFP=" + payloadOut.getFilePointer() + " payload=" + payload.length + " bytes");
+ } else {
+ System.out.println("pw.addPos [" + desc + "]: pos=" + position + " posFP=" + posOut.descFilePointer() + " payloadFP=" + payloadOut.getFilePointer());
+ }
+ }
+
+ final int delta = position - lastPosition;
+ lastPosition = position;
+
+ if (storePayloads) {
+ final int payloadLength = payload == null ? 0 : payload.length;
+ if (Codec.DEBUG) {
+ System.out.println(" store payload len=" + payloadLength);
+ }
+ if (payloadLength != lastPayloadLength) {
+ if (Codec.DEBUG) {
+ System.out.println(" payload len change old=" + lastPayloadLength + " new=" + payloadLength);
+ }
+ lastPayloadLength = payloadLength;
+ // TODO: explore whether we get better compression
+ // by not storing payloadLength into prox stream?
+ posOut.write((delta<<1)|1);
+ posOut.write(payloadLength);
+ } else {
+ posOut.write(delta << 1);
+ }
+
+ if (payloadLength > 0) {
+ if (Codec.DEBUG) {
+ System.out.println(" write @ payloadFP=" + payloadOut.getFilePointer());
+ }
+ payloadOut.writeBytes(payload.bytes, payload.offset, payloadLength);
+ }
+ } else {
+ posOut.write(delta);
+ }
+ }
+
+ /** Called when we are done adding positions & payloads */
+ @Override
+ public void finishDoc() {
+ lastPosition = 0;
+ }
+
+ /** Called when we are done adding docs to this term */
+ @Override
+ public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
+
+ long skipPos = skipOut.getFilePointer();
+
+ // nocommit -- wasteful we are counting this in two places?
+ assert docCount == df;
+ if (Codec.DEBUG) {
+ System.out.println("dw.finishTerm termsFP=" + termsOut.getFilePointer() + " df=" + df + " skipPos=" + skipPos);
+ }
+
+ // nocommit -- only do this once (consolidate the
+ // conditional things that are written)
+ if (!omitTF) {
+ freqIndex.write(termsOut, isIndexTerm);
+ }
+ docIndex.write(termsOut, isIndexTerm);
+
+ if (df >= skipInterval) {
+ if (Codec.DEBUG) {
+ System.out.println(" writeSkip skipPos=" + skipPos + " lastSkipPos=" + lastSkipStart);
+ }
+
+ skipListWriter.writeSkip(skipOut);
+ }
+
+ if (isIndexTerm) {
+ termsOut.writeVLong(skipPos);
+ lastSkipStart = skipPos;
+ } else if (df >= skipInterval) {
+ termsOut.writeVLong(skipPos-lastSkipStart);
+ lastSkipStart = skipPos;
+ }
+
+ if (!omitTF) {
+ posIndex.write(termsOut, isIndexTerm);
+ if (isIndexTerm) {
+ // Write absolute at seek points
+ termsOut.writeVLong(payloadStart);
+ } else {
+ termsOut.writeVLong(payloadStart-lastPayloadStart);
+ }
+ lastPayloadStart = payloadStart;
+ }
+
+ lastDocID = 0;
+ df = 0;
+
+ // nocommit
+ count = 0;
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (Codec.DEBUG) {
+ System.out.println("sep.writer.close skipFP=" + skipOut.getFilePointer());
+ }
+ try {
+ docOut.close();
+ } finally {
+ try {
+ skipOut.close();
+ } finally {
+ if (freqOut != null) {
+ try {
+ freqOut.close();
+ } finally {
+ try {
+ posOut.close();
+ } finally {
+ payloadOut.close();
+ }
+ }
+ }
+ }
+ }
+ }
+}
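Note the skip rhythm in addDoc() above: doc IDs are delta-encoded, and every
skipInterval-th document a skip entry is buffered with the state as of the
previous doc. A hedged sketch of just that logic (SkipBuffer stands in for
SepSkipListWriter; names are illustrative):

    import java.util.ArrayList;
    import java.util.List;

    class DocWriterSketch {
      interface SkipBuffer { void bufferSkip(int lastDocID, int df); }

      private final int skipInterval;
      private final SkipBuffer skips;
      private int lastDocID;
      private int df;                       // docs seen for the current term
      final List<Integer> docDeltas = new ArrayList<Integer>();

      DocWriterSketch(int skipInterval, SkipBuffer skips) {
        this.skipInterval = skipInterval;
        this.skips = skips;
      }

      void addDoc(int docID) {
        final int delta = docID - lastDocID;   // delta-encoded doc IDs
        if (docID < 0 || (df > 0 && delta <= 0)) {
          throw new IllegalStateException(
              "docs out of order: " + docID + " <= " + lastDocID);
        }
        if ((++df % skipInterval) == 0) {
          skips.bufferSkip(lastDocID, df);  // snapshot the previous doc's state
        }
        lastDocID = docID;
        docDeltas.add(delta);
      }
    }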
Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java
------------------------------------------------------------------------------
svn:eol-style = native
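addPosition() in SepPostingsWriterImpl folds a "payload length changed" flag
into the low bit of each position delta, so a run of equal-length payloads
stores the length only once. A self-contained sketch with a worked example
(illustrative names; ints collected in a list instead of an IntIndexOutput):

    import java.util.ArrayList;
    import java.util.List;

    class PositionEncoderSketch {
      private int lastPosition;
      private int lastPayloadLength = -1;   // -1 forces the first length write
      final List<Integer> out = new ArrayList<Integer>();

      void addPosition(int position, int payloadLength) {
        final int delta = position - lastPosition;
        lastPosition = position;
        if (payloadLength != lastPayloadLength) {
          lastPayloadLength = payloadLength;
          out.add((delta << 1) | 1);  // low bit set: new length follows
          out.add(payloadLength);
        } else {
          out.add(delta << 1);        // low bit clear: reuse previous length
        }
      }
    }

For positions 3, 7, 12 with payload lengths 4, 4, 2 this emits
[7, 4, 8, 11, 2]: (3<<1)|1, 4, (4<<1), (5<<1)|1, 2.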
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepSkipListReader.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepSkipListReader.java?rev=904750&r1=904749&r2=904750&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepSkipListReader.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepSkipListReader.java Sat Jan 30 10:16:35 2010
@@ -97,6 +97,12 @@
}
}
+ boolean omitTF;
+
+ void setOmitTF(boolean v) {
+ omitTF = v;
+ }
+
void init(long skipPointer,
IntIndexInput.Index docBaseIndex,
IntIndexInput.Index freqBaseIndex,
@@ -199,14 +205,11 @@
} else {
delta = skipStream.readVInt();
}
- //System.out.println(" delta=" + delta + " level=" +
- //level);
- if (freqIndex != null) {
+ if (!omitTF) {
freqIndex[level].read(skipStream, false);
}
docIndex[level].read(skipStream, false);
- // nocommit -- make this explicit w/ omitTF, matching SepSkipListWriter
- if (posIndex != null) {
+ if (!omitTF) {
posIndex[level].read(skipStream, false);
payloadPointer[level] += skipStream.readVInt();
}
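The reader change above makes the skip-entry layout depend on an explicit
omitTF flag rather than on null checks, keeping it in lockstep with
SepSkipListWriter. Roughly, each per-level entry looks like this sketch
(DataInput stands in for the skip stream; the real code reads vInts and
IntIndexInput.Index objects, here replaced by plain longs):

    import java.io.DataInput;
    import java.io.IOException;

    class SkipEntrySketch {
      long freqIndex, docIndex, posIndex, payloadPointer;

      void readEntry(DataInput skipStream, boolean omitTF) throws IOException {
        int docDelta = skipStream.readInt();       // real code: vInt
        if (!omitTF) {
          freqIndex = skipStream.readLong();       // freqIndex[level].read(...)
        }
        docIndex = skipStream.readLong();
        if (!omitTF) {
          posIndex = skipStream.readLong();
          payloadPointer += skipStream.readInt();  // delta-encoded pointer
        }
        // ... docDelta advances the skip doc for this level ...
      }
    }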
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepSkipListWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepSkipListWriter.java?rev=904750&r1=904749&r2=904750&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepSkipListWriter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepSkipListWriter.java Sat Jan 30 10:16:35 2010
@@ -77,9 +77,11 @@
posIndex = new IntIndexOutput.Index[numberOfSkipLevels];
for(int i=0;i<numberOfSkipLevels;i++) {
- freqIndex[i] = freqOutput.index();
- if (Codec.DEBUG) {
- freqIndex[i].desc = "sslw.freq.level" + i;
+ if (freqOutput != null) {
+ freqIndex[i] = freqOutput.index();
+ if (Codec.DEBUG) {
+ freqIndex[i].desc = "sslw.freq.level" + i;
+ }
}
docIndex[i] = docOutput.index();
if (Codec.DEBUG) {
@@ -142,7 +144,9 @@
Arrays.fill(lastSkipPayloadLength, -1); // we don't have to write the first length in the skip list
for(int i=0;i<numberOfSkipLevels;i++) {
docIndex[i].set(topDocIndex);
- freqIndex[i].set(topFreqIndex);
+ if (freqOutput != null) {
+ freqIndex[i].set(topFreqIndex);
+ }
if (posOutput != null) {
posIndex[i].set(topPosIndex);
}
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SingleIntIndexInput.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SingleIntIndexInput.java?rev=904750&r1=904749&r2=904750&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SingleIntIndexInput.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SingleIntIndexInput.java Sat Jan 30 10:16:35 2010
@@ -69,13 +69,9 @@
}
}
- class State extends IndexState {
- long fp;
- boolean first;
- }
-
class Index extends IntIndexInput.Index {
private long fp;
+ // nocommit: only for asserts
boolean first = true;
@Override
@@ -110,24 +106,13 @@
return Long.toString(fp);
}
- // nocommit handle with set and/or clone?
- @Override
- public IndexState captureState() {
- State state = SingleIntIndexInput.this.new State();
- state.fp = fp;
- state.first = first;
- return state;
- }
-
- // nocommit handle with set and/or clone?
@Override
- public void setState(IndexState state) {
- State iState = (State) state;
- this.fp = iState.fp;
- this.first = iState.first;
-
+ public Object clone() {
+ Index other = new Index();
+ other.first = first;
+ other.fp = fp;
+ return other;
}
-
}
@Override
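The SingleIntIndexInput change replaces the captureState()/setState() pair
with clone(): a snapshot is simply a copy of the index's two fields. A sketch
of that capture-by-clone pattern (field names mirror the diff; the class
itself is illustrative):

    class IndexSketch {
      long fp;                 // file pointer of the indexed position
      boolean first = true;    // only used for asserts in the real code

      @Override
      public IndexSketch clone() {
        IndexSketch other = new IndexSketch();
        other.fp = fp;
        other.first = first;
        return other;
      }
    }

    // Usage: IndexSketch snapshot = current.clone();
    // ... later, restore by copying snapshot's fields back.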
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SingleIntIndexOutput.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SingleIntIndexOutput.java?rev=904750&r1=904749&r2=904750&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SingleIntIndexOutput.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SingleIntIndexOutput.java Sat Jan 30 10:16:35 2010
@@ -24,7 +24,7 @@
import java.io.IOException;
/** Writes ints directly to the file (not in blocks) as
- * vInt
+ * vInt.
*
* @lucene.experimental
*/
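SingleIntIndexOutput writes each int as a plain vInt: 7 payload bits per byte
with the high bit set while more bytes follow, so small values cost a single
byte. A sketch of that encoding (writing into a ByteArrayOutputStream rather
than a Lucene IndexOutput):

    import java.io.ByteArrayOutputStream;

    final class VIntSketch {
      static void writeVInt(ByteArrayOutputStream out, int i) {
        while ((i & ~0x7F) != 0) {
          out.write((i & 0x7F) | 0x80);  // low 7 bits, continuation bit set
          i >>>= 7;
        }
        out.write(i);                    // final byte, high bit clear
      }
    }

    // E.g. 5 encodes as [0x05]; 300 encodes as [0xAC, 0x02].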
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/DefaultSkipListWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/DefaultSkipListWriter.java?rev=904750&r1=904749&r2=904750&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/DefaultSkipListWriter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/DefaultSkipListWriter.java Sat Jan 30 10:16:35 2010
@@ -60,14 +60,14 @@
}
// nocommit -- made public
- public void setFreqOutput(IndexOutput freqOutput) {
- this.freqOutput = freqOutput;
- }
+ //public void setFreqOutput(IndexOutput freqOutput) {
+ //this.freqOutput = freqOutput;
+ //}
// nocommit -- made public
- public void setProxOutput(IndexOutput proxOutput) {
- this.proxOutput = proxOutput;
- }
+ //public void setProxOutput(IndexOutput proxOutput) {
+ //this.proxOutput = proxOutput;
+ //}
/**
* Sets the values for the current skip data.
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java?rev=904750&r1=904749&r2=904750&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java Sat Jan 30 10:16:35 2010
@@ -73,16 +73,7 @@
this.termComp = termComp;
- // nocommit -- why was this needed?
- String file = IndexFileNames.segmentFileName(segment, StandardCodec.TERMS_INDEX_EXTENSION);
- if (!dir.fileExists(file)) {
- indexInterval = 0;
- totalIndexInterval = 0;
- this.indexDivisor = indexDivisor;
- in = null;
- return;
- }
- IndexInput in = dir.openInput(file);
+ IndexInput in = dir.openInput(IndexFileNames.segmentFileName(segment, StandardCodec.TERMS_INDEX_EXTENSION));
boolean success = false;
@@ -90,7 +81,7 @@
Codec.checkHeader(in, SimpleStandardTermsIndexWriter.CODEC_NAME, SimpleStandardTermsIndexWriter.VERSION_START);
if (Codec.DEBUG) {
- System.out.println(" readDirStart @ " + in.getFilePointer());
+ Codec.debug(" sstir init: header tii.fp=" + in.getFilePointer());
}
final long dirOffset = in.readLong();
@@ -444,7 +435,7 @@
public final void getIndexOffset(BytesRef term, TermsIndexResult result) throws IOException {
if (Codec.DEBUG) {
- System.out.println("getIndexOffset field=" + fieldInfo.name + " term=" + term + " indexLen = " + blockPointer.length + " numIndexTerms=" + fileOffset.length + " this=" + this + " numIndexedTerms=" + fileOffset.length);
+ System.out.println("getIndexOffset field=" + fieldInfo.name + " term=" + term + " indexLen = " + blockPointer.length + " numIndexTerms=" + fileOffset.length + " numIndexedTerms=" + fileOffset.length);
}
int lo = 0; // binary search
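getIndexOffset() binary-searches the in-memory index terms for the greatest
indexed term at or before the sought term; the terms dict is then scanned
forward from that block. A sketch over plain Strings (the real code compares
BytesRef values via the configured comparator):

    final class TermIndexSearchSketch {
      static int indexOffset(String[] indexTerms, String term) {
        int lo = 0, hi = indexTerms.length - 1, result = 0;
        while (lo <= hi) {
          int mid = (lo + hi) >>> 1;
          int cmp = indexTerms[mid].compareTo(term);
          if (cmp == 0) return mid;
          if (cmp < 0) { result = mid; lo = mid + 1; }
          else hi = mid - 1;
        }
        return result;    // greatest entry <= term (0 if term precedes all)
      }

      public static void main(String[] args) {
        String[] idx = {"apple", "dog", "moon", "tree"};
        System.out.println(indexOffset(idx, "cat"));  // 0: block at "apple"
      }
    }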
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java?rev=904750&r1=904749&r2=904750&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java Sat Jan 30 10:16:35 2010
@@ -93,10 +93,8 @@
if (0 == (numTerms++ % termIndexInterval)) {
final long termsPointer = termsOut.getFilePointer();
if (Codec.DEBUG) {
- System.out.println("sstiw.checkIndexTerm write index field=" + fieldInfo.name + " term=" + text + " termsFP=" + termsPointer + " numIndexTerms=" + numIndexTerms + " outFP=" + out.getFilePointer());
+ Codec.debug("sstiw.checkIndexTerm write index field=" + fieldInfo.name + " term=" + text + " termsFP=" + termsPointer + " numIndexTerms=" + numIndexTerms + " outFP=" + out.getFilePointer());
}
- // mxx
- //System.out.println(Thread.currentThread().getName() + ": ii seg=" + segment + " term=" + fieldInfo.name + ":" + new String(term, 0, termLength, "UTF-8") + " numTerms=" + (numTerms-1) + " termFP=" + termsPointer);
termWriter.write(text);
out.writeVLong(termsPointer - lastTermsPointer);
lastTermsPointer = termsPointer;
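checkIndexTerm() above records every termIndexInterval-th term, writing the
term plus the delta of the terms-dict file pointer since the previous index
entry. A hedged sketch of that bookkeeping (collecting into lists instead of
writing terms and vLongs to an IndexOutput):

    import java.util.ArrayList;
    import java.util.List;

    class TermsIndexWriterSketch {
      private final int termIndexInterval;
      private long numTerms;
      private long lastTermsPointer;
      final List<String> indexedTerms = new ArrayList<String>();
      final List<Long> pointerDeltas = new ArrayList<Long>();

      TermsIndexWriterSketch(int termIndexInterval) {
        this.termIndexInterval = termIndexInterval;
      }

      boolean checkIndexTerm(String term, long termsPointer) {
        if (0 == (numTerms++ % termIndexInterval)) {
          indexedTerms.add(term);
          pointerDeltas.add(termsPointer - lastTermsPointer);  // delta-encoded
          lastTermsPointer = termsPointer;
          return true;   // this term is an index term
        }
        return false;
      }
    }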
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java?rev=904750&r1=904749&r2=904750&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java Sat Jan 30 10:16:35 2010
@@ -38,7 +38,7 @@
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
- StandardDocsConsumer docs = new StandardDocsWriter(state);
+ StandardPostingsWriter docs = new StandardPostingsWriterImpl(state);
StandardTermsIndexWriter indexWriter;
boolean success = false;
@@ -67,9 +67,11 @@
}
}
+ public final static int TERMS_CACHE_SIZE = 1024;
+
@Override
public FieldsProducer fieldsProducer(Directory dir, FieldInfos fieldInfos, SegmentInfo si, int readBufferSize, int indexDivisor) throws IOException {
- StandardDocsReader docs = new StandardDocsReader(dir, si, readBufferSize);
+ StandardPostingsReader postings = new StandardPostingsReaderImpl(dir, si, readBufferSize);
StandardTermsIndexReader indexReader;
// nocommit -- not clean that every codec must deal w/
@@ -84,7 +86,7 @@
success = true;
} finally {
if (!success) {
- docs.close();
+ postings.close();
}
}
@@ -92,15 +94,16 @@
try {
FieldsProducer ret = new StandardTermsDictReader(indexReader,
dir, fieldInfos, si.name,
- docs,
+ postings,
readBufferSize,
- BytesRef.getUTF8SortedAsUTF16Comparator());
+ BytesRef.getUTF8SortedAsUTF16Comparator(),
+ TERMS_CACHE_SIZE);
success = true;
return ret;
} finally {
if (!success) {
try {
- docs.close();
+ postings.close();
} finally {
indexReader.close();
}
@@ -122,7 +125,7 @@
@Override
public void files(Directory dir, SegmentInfo segmentInfo, Collection<String> files) throws IOException {
- StandardDocsReader.files(dir, segmentInfo, files);
+ StandardPostingsReaderImpl.files(dir, segmentInfo, files);
StandardTermsDictReader.files(dir, segmentInfo, files);
SimpleStandardTermsIndexReader.files(dir, segmentInfo, files);
}
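StandardCodec now passes TERMS_CACHE_SIZE through to StandardTermsDictReader
so recently looked-up terms can be served from memory. The actual cache class
is not part of this diff; as an assumption, a bounded LRU along these lines
captures the idea (an access-ordered LinkedHashMap that evicts past the cap):

    import java.util.LinkedHashMap;
    import java.util.Map;

    // Hypothetical stand-in for the terms cache, not Lucene's actual class.
    class TermsCacheSketch<K, V> extends LinkedHashMap<K, V> {
      private final int maxSize;

      TermsCacheSketch(int maxSize) {
        super(16, 0.75f, true);        // true = access order, i.e. LRU
        this.maxSize = maxSize;
      }

      @Override
      protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
        return size() > maxSize;
      }
    }

    // Usage: new TermsCacheSketch<String, Long>(1024) holds up to 1024
    // looked-up terms (e.g. term -> terms-dict pointer), matching
    // TERMS_CACHE_SIZE above.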