You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/07/23 22:49:22 UTC
svn commit: r1364795 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/core/ lucene/core/src/java/org/apache/lucene/codecs/
lucene/core/src/java/org/apache/lucene/index/
lucene/core/src/test/org/apache/lucene/index/ lucene/test-framework/
lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/
Author: rmuir
Date: Mon Jul 23 20:49:22 2012
New Revision: 1364795
URL: http://svn.apache.org/viewvc?rev=1364795&view=rev
Log:
LUCENE-4828: add AssertingPostingsConsumer, fix minor inconsistencies in producers
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/core/ (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java
lucene/dev/branches/branch_4x/lucene/test-framework/ (props changed)
lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java
Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java?rev=1364795&r1=1364794&r2=1364795&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java Mon Jul 23 20:49:22 2012
@@ -49,14 +49,17 @@ import org.apache.lucene.util.FixedBitSe
*/
public abstract class PostingsConsumer {
- /** Adds a new doc in this term. */
+ /** Adds a new doc in this term.
+ * <code>freq</code> will be -1 when term frequencies are omitted
+ * for the field. */
public abstract void startDoc(int docID, int freq) throws IOException;
/** Add a new position & payload, and start/end offset. A
* null payload means no payload; a non-null payload with
* zero length also means no payload. Caller may reuse
* the {@link BytesRef} for the payload between calls
- * (method must fully consume the payload). */
+ * (method must fully consume the payload). <code>startOffset</code>
+ * and <code>endOffset</code> will be -1 when offsets are not indexed. */
public abstract void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException;
/** Called when we are done adding positions & payloads
@@ -78,7 +81,7 @@ public abstract class PostingsConsumer {
break;
}
visitedDocs.set(doc);
- this.startDoc(doc, 0);
+ this.startDoc(doc, -1);
this.finishDoc();
df++;
}
Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java?rev=1364795&r1=1364794&r2=1364795&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java Mon Jul 23 20:49:22 2012
@@ -57,10 +57,14 @@ public abstract class TermsConsumer {
* no docs. */
public abstract PostingsConsumer startTerm(BytesRef text) throws IOException;
- /** Finishes the current term; numDocs must be > 0. */
+ /** Finishes the current term; numDocs must be > 0.
+ * <code>stats.totalTermFreq</code> will be -1 when term
+ * frequencies are omitted for the field. */
public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException;
- /** Called when we are done adding terms to this field */
+ /** Called when we are done adding terms to this field.
+ * <code>sumTotalTermFreq</code> will be -1 when term
+ * frequencies are omitted for the field. */
public abstract void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException;
/** Return the BytesRef Comparator used to sort terms
Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java?rev=1364795&r1=1364794&r2=1364795&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java Mon Jul 23 20:49:22 2012
@@ -430,7 +430,7 @@ final class FreqProxTermsWriterPerField
if (readTermFreq) {
termDocFreq = postings.docFreqs[termID];
} else {
- termDocFreq = 0;
+ termDocFreq = -1;
}
postings.lastDocCodes[termID] = -1;
} else {
@@ -441,7 +441,7 @@ final class FreqProxTermsWriterPerField
final int code = freq.readVInt();
if (!readTermFreq) {
docID += code;
- termDocFreq = 0;
+ termDocFreq = -1;
} else {
docID += code >>> 1;
if ((code & 1) != 0) {
@@ -469,7 +469,7 @@ final class FreqProxTermsWriterPerField
// 2nd sweep does the real flush, but I suspect
// that'd add too much time to flush.
visitedDocs.set(docID);
- postingsConsumer.startDoc(docID, termDocFreq);
+ postingsConsumer.startDoc(docID, writeTermFreq ? termDocFreq : -1);
if (docID < delDocLimit) {
// Mark it deleted. TODO: we could also skip
// writing its postings; this would be
Modified: lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java?rev=1364795&r1=1364794&r2=1364795&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java Mon Jul 23 20:49:22 2012
@@ -155,7 +155,7 @@ public class TestCodecs extends LuceneTe
for(int i=0;i<docs.length;i++) {
final int termDocFreq;
if (field.omitTF) {
- termDocFreq = 0;
+ termDocFreq = -1;
} else {
termDocFreq = positions[i].length;
}
@@ -166,8 +166,8 @@ public class TestCodecs extends LuceneTe
final PositionData pos = positions[i][j];
postingsConsumer.addPosition(pos.pos, pos.payload, -1, -1);
}
- postingsConsumer.finishDoc();
}
+ postingsConsumer.finishDoc();
}
termsConsumer.finishTerm(text, new TermStats(docs.length, field.omitTF ? -1 : totTF));
return totTF;
Modified: lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java?rev=1364795&r1=1364794&r2=1364795&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java Mon Jul 23 20:49:22 2012
@@ -406,7 +406,7 @@ public class TestPostingsFormat extends
if (VERBOSE) {
System.out.println(" " + docCount + ": docID=" + posting.docID + " freq=" + posting.positions.size());
}
- postingsConsumer.startDoc(posting.docID, posting.positions.size());
+ postingsConsumer.startDoc(posting.docID, doFreq ? posting.positions.size() : -1);
seenDocs.set(posting.docID);
if (doPos) {
totalTF += posting.positions.size();
Modified: lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java?rev=1364795&r1=1364794&r2=1364795&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java (original)
+++ lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java Mon Jul 23 20:49:22 2012
@@ -35,6 +35,7 @@ import org.apache.lucene.index.SegmentRe
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.OpenBitSet;
/**
* Just like {@link Lucene40PostingsFormat} but with additional asserts.
@@ -118,22 +119,23 @@ public class AssertingPostingsFormat ext
private final FieldInfo fieldInfo;
private BytesRef lastTerm = null;
private TermsConsumerState state = TermsConsumerState.INITIAL;
+ private AssertingPostingsConsumer lastPostingsConsumer = null;
+ private long sumTotalTermFreq = 0;
+ private long sumDocFreq = 0;
+ private OpenBitSet visitedDocs = new OpenBitSet();
AssertingTermsConsumer(TermsConsumer in, FieldInfo fieldInfo) {
this.in = in;
this.fieldInfo = fieldInfo;
}
- // TODO: AssertingPostingsConsumer
@Override
public PostingsConsumer startTerm(BytesRef text) throws IOException {
- // TODO: assert that if state == START (no finishTerm called), that no actual docs were fed.
- // TODO: this makes the api really confusing! we should try to clean this up!
- assert state == TermsConsumerState.INITIAL || state == TermsConsumerState.START;
+ assert state == TermsConsumerState.INITIAL || state == TermsConsumerState.START && lastPostingsConsumer.docFreq == 0;
state = TermsConsumerState.START;
assert lastTerm == null || in.getComparator().compare(text, lastTerm) > 0;
lastTerm = BytesRef.deepCopyOf(text);
- return in.startTerm(text);
+ return lastPostingsConsumer = new AssertingPostingsConsumer(in.startTerm(text), fieldInfo, visitedDocs);
}
@Override
@@ -142,24 +144,30 @@ public class AssertingPostingsFormat ext
state = TermsConsumerState.INITIAL;
assert text.equals(lastTerm);
assert stats.docFreq > 0; // otherwise, this method should not be called.
+ assert stats.docFreq == lastPostingsConsumer.docFreq;
+ sumDocFreq += stats.docFreq;
if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
assert stats.totalTermFreq == -1;
+ } else {
+ assert stats.totalTermFreq == lastPostingsConsumer.totalTermFreq;
+ sumTotalTermFreq += stats.totalTermFreq;
}
in.finishTerm(text, stats);
}
@Override
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
- // TODO: assert that if state == START (no finishTerm called), that no actual docs were fed.
- // TODO: this makes the api really confusing! we should try to clean this up!
- assert state == TermsConsumerState.INITIAL || state == TermsConsumerState.START;
+ assert state == TermsConsumerState.INITIAL || state == TermsConsumerState.START && lastPostingsConsumer.docFreq == 0;
state = TermsConsumerState.FINISHED;
assert docCount >= 0;
+ assert docCount == visitedDocs.cardinality();
assert sumDocFreq >= docCount;
+ assert sumDocFreq == this.sumDocFreq;
if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
assert sumTotalTermFreq == -1;
} else {
- assert sumTotalTermFreq >= sumDocFreq;
+ assert sumTotalTermFreq >= sumDocFreq;
+ assert sumTotalTermFreq == this.sumTotalTermFreq;
}
in.finish(sumTotalTermFreq, sumDocFreq, docCount);
}
@@ -169,4 +177,79 @@ public class AssertingPostingsFormat ext
return in.getComparator();
}
}
+
+ static enum PostingsConsumerState { INITIAL, START };
+ static class AssertingPostingsConsumer extends PostingsConsumer {
+ private final PostingsConsumer in;
+ private final FieldInfo fieldInfo;
+ private final OpenBitSet visitedDocs;
+ private PostingsConsumerState state = PostingsConsumerState.INITIAL;
+ private int freq;
+ private int positionCount;
+ private int lastPosition = 0;
+ private int lastStartOffset = 0;
+ int docFreq = 0;
+ long totalTermFreq = 0;
+
+ AssertingPostingsConsumer(PostingsConsumer in, FieldInfo fieldInfo, OpenBitSet visitedDocs) {
+ this.in = in;
+ this.fieldInfo = fieldInfo;
+ this.visitedDocs = visitedDocs;
+ }
+
+ @Override
+ public void startDoc(int docID, int freq) throws IOException {
+ assert state == PostingsConsumerState.INITIAL;
+ state = PostingsConsumerState.START;
+ assert docID >= 0;
+ if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
+ assert freq == -1;
+ this.freq = 0; // we don't expect any positions here
+ } else {
+ assert freq > 0;
+ this.freq = freq;
+ totalTermFreq += freq;
+ }
+ this.positionCount = 0;
+ this.lastPosition = 0;
+ this.lastStartOffset = 0;
+ docFreq++;
+ visitedDocs.set(docID);
+ in.startDoc(docID, freq);
+ }
+
+ @Override
+ public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
+ assert state == PostingsConsumerState.START;
+ assert positionCount < freq;
+ positionCount++;
+ assert position >= lastPosition || position == -1; /* we still allow -1 from old 3.x indexes */
+ lastPosition = position;
+ if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) {
+ assert startOffset >= 0;
+ assert startOffset >= lastStartOffset;
+ lastStartOffset = startOffset;
+ assert endOffset >= startOffset;
+ } else {
+ assert startOffset == -1;
+ assert endOffset == -1;
+ }
+ if (payload != null) {
+ assert fieldInfo.hasPayloads();
+ }
+ in.addPosition(position, payload, startOffset, endOffset);
+ }
+
+ @Override
+ public void finishDoc() throws IOException {
+ assert state == PostingsConsumerState.START;
+ state = PostingsConsumerState.INITIAL;
+ if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
+ assert positionCount == 0; // we should not have fed any positions!
+ } else {
+ assert positionCount == freq;
+ }
+ in.finishDoc();
+ }
+ }
}