You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/07/23 22:49:22 UTC

svn commit: r1364795 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/core/ lucene/core/src/java/org/apache/lucene/codecs/ lucene/core/src/java/org/apache/lucene/index/ lucene/core/src/test/org/apache/lucene/index/ lucene/test-framework/ lucene/t...

Author: rmuir
Date: Mon Jul 23 20:49:22 2012
New Revision: 1364795

URL: http://svn.apache.org/viewvc?rev=1364795&view=rev
Log:
LUCENE-4248: add AssertingPostingsConsumer, fix minor inconsistencies in producers

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java
    lucene/dev/branches/branch_4x/lucene/test-framework/   (props changed)
    lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java?rev=1364795&r1=1364794&r2=1364795&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java Mon Jul 23 20:49:22 2012
@@ -49,14 +49,17 @@ import org.apache.lucene.util.FixedBitSe
  */
 public abstract class PostingsConsumer {
 
-  /** Adds a new doc in this term. */
+  /** Adds a new doc in this term. 
+   * <code>freq</code> will be -1 when term frequencies are omitted
+   * for the field. */
   public abstract void startDoc(int docID, int freq) throws IOException;
 
   /** Add a new position & payload, and start/end offset.  A
    *  null payload means no payload; a non-null payload with
    *  zero length also means no payload.  Caller may reuse
    *  the {@link BytesRef} for the payload between calls
-   *  (method must fully consume the payload). */
+   *  (method must fully consume the payload). <code>startOffset</code>
+   *  and <code>endOffset</code> will be -1 when offsets are not indexed. */
   public abstract void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException;
 
   /** Called when we are done adding positions & payloads
@@ -78,7 +81,7 @@ public abstract class PostingsConsumer {
           break;
         }
         visitedDocs.set(doc);
-        this.startDoc(doc, 0);
+        this.startDoc(doc, -1);
         this.finishDoc();
         df++;
       }

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java?rev=1364795&r1=1364794&r2=1364795&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java Mon Jul 23 20:49:22 2012
@@ -57,10 +57,14 @@ public abstract class TermsConsumer {
    *  no docs. */
   public abstract PostingsConsumer startTerm(BytesRef text) throws IOException;
 
-  /** Finishes the current term; numDocs must be > 0. */
+  /** Finishes the current term; numDocs must be > 0.
+   *  <code>stats.totalTermFreq</code> will be -1 when term 
+   *  frequencies are omitted for the field. */
   public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException;
 
-  /** Called when we are done adding terms to this field */
+  /** Called when we are done adding terms to this field.
+   *  <code>sumTotalTermFreq</code> will be -1 when term 
+   *  frequencies are omitted for the field. */
   public abstract void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException;
 
   /** Return the BytesRef Comparator used to sort terms

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java?rev=1364795&r1=1364794&r2=1364795&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java Mon Jul 23 20:49:22 2012
@@ -430,7 +430,7 @@ final class FreqProxTermsWriterPerField 
             if (readTermFreq) {
               termDocFreq = postings.docFreqs[termID];
             } else {
-              termDocFreq = 0;
+              termDocFreq = -1;
             }
             postings.lastDocCodes[termID] = -1;
           } else {
@@ -441,7 +441,7 @@ final class FreqProxTermsWriterPerField 
           final int code = freq.readVInt();
           if (!readTermFreq) {
             docID += code;
-            termDocFreq = 0;
+            termDocFreq = -1;
           } else {
             docID += code >>> 1;
             if ((code & 1) != 0) {
@@ -469,7 +469,7 @@ final class FreqProxTermsWriterPerField 
         // 2nd sweep does the real flush, but I suspect
         // that'd add too much time to flush.
         visitedDocs.set(docID);
-        postingsConsumer.startDoc(docID, termDocFreq);
+        postingsConsumer.startDoc(docID, writeTermFreq ? termDocFreq : -1);
         if (docID < delDocLimit) {
           // Mark it deleted.  TODO: we could also skip
           // writing its postings; this would be

Modified: lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java?rev=1364795&r1=1364794&r2=1364795&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java Mon Jul 23 20:49:22 2012
@@ -155,7 +155,7 @@ public class TestCodecs extends LuceneTe
       for(int i=0;i<docs.length;i++) {
         final int termDocFreq;
         if (field.omitTF) {
-          termDocFreq = 0;
+          termDocFreq = -1;
         } else {
           termDocFreq = positions[i].length;
         }
@@ -166,8 +166,8 @@ public class TestCodecs extends LuceneTe
             final PositionData pos = positions[i][j];
             postingsConsumer.addPosition(pos.pos, pos.payload, -1, -1);
           }
-          postingsConsumer.finishDoc();
         }
+        postingsConsumer.finishDoc();
       }
       termsConsumer.finishTerm(text, new TermStats(docs.length, field.omitTF ? -1 : totTF));
       return totTF;

Modified: lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java?rev=1364795&r1=1364794&r2=1364795&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java Mon Jul 23 20:49:22 2012
@@ -406,7 +406,7 @@ public class TestPostingsFormat extends 
           if (VERBOSE) {
             System.out.println("    " + docCount + ": docID=" + posting.docID + " freq=" + posting.positions.size());
           }
-          postingsConsumer.startDoc(posting.docID, posting.positions.size());
+          postingsConsumer.startDoc(posting.docID, doFreq ? posting.positions.size() : -1);
           seenDocs.set(posting.docID);
           if (doPos) {
             totalTF += posting.positions.size();

Modified: lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java?rev=1364795&r1=1364794&r2=1364795&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java (original)
+++ lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java Mon Jul 23 20:49:22 2012
@@ -35,6 +35,7 @@ import org.apache.lucene.index.SegmentRe
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.OpenBitSet;
 
 /**
  * Just like {@link Lucene40PostingsFormat} but with additional asserts.
@@ -118,22 +119,23 @@ public class AssertingPostingsFormat ext
     private final FieldInfo fieldInfo;
     private BytesRef lastTerm = null;
     private TermsConsumerState state = TermsConsumerState.INITIAL;
+    private AssertingPostingsConsumer lastPostingsConsumer = null;
+    private long sumTotalTermFreq = 0;
+    private long sumDocFreq = 0;
+    private OpenBitSet visitedDocs = new OpenBitSet();
     
     AssertingTermsConsumer(TermsConsumer in, FieldInfo fieldInfo) {
       this.in = in;
       this.fieldInfo = fieldInfo;
     }
     
-    // TODO: AssertingPostingsConsumer
     @Override
     public PostingsConsumer startTerm(BytesRef text) throws IOException {
-      // TODO: assert that if state == START (no finishTerm called), that no actual docs were fed.
-      // TODO: this makes the api really confusing! we should try to clean this up!
-      assert state == TermsConsumerState.INITIAL || state == TermsConsumerState.START;
+      assert state == TermsConsumerState.INITIAL || state == TermsConsumerState.START && lastPostingsConsumer.docFreq == 0;
       state = TermsConsumerState.START;
       assert lastTerm == null || in.getComparator().compare(text, lastTerm) > 0;
       lastTerm = BytesRef.deepCopyOf(text);
-      return in.startTerm(text);
+      return lastPostingsConsumer = new AssertingPostingsConsumer(in.startTerm(text), fieldInfo, visitedDocs);
     }
 
     @Override
@@ -142,24 +144,30 @@ public class AssertingPostingsFormat ext
       state = TermsConsumerState.INITIAL;
       assert text.equals(lastTerm);
       assert stats.docFreq > 0; // otherwise, this method should not be called.
+      assert stats.docFreq == lastPostingsConsumer.docFreq;
+      sumDocFreq += stats.docFreq;
       if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
         assert stats.totalTermFreq == -1;
+      } else {
+        assert stats.totalTermFreq == lastPostingsConsumer.totalTermFreq;
+        sumTotalTermFreq += stats.totalTermFreq;
       }
       in.finishTerm(text, stats);
     }
 
     @Override
     public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
-      // TODO: assert that if state == START (no finishTerm called), that no actual docs were fed.
-      // TODO: this makes the api really confusing! we should try to clean this up!
-      assert state == TermsConsumerState.INITIAL || state == TermsConsumerState.START;
+      assert state == TermsConsumerState.INITIAL || state == TermsConsumerState.START && lastPostingsConsumer.docFreq == 0;
       state = TermsConsumerState.FINISHED;
       assert docCount >= 0;
+      assert docCount == visitedDocs.cardinality();
       assert sumDocFreq >= docCount;
+      assert sumDocFreq == this.sumDocFreq;
       if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
         assert sumTotalTermFreq == -1;
       } else {
-        assert sumTotalTermFreq >= sumDocFreq;        
+        assert sumTotalTermFreq >= sumDocFreq;
+        assert sumTotalTermFreq == this.sumTotalTermFreq;
       }
       in.finish(sumTotalTermFreq, sumDocFreq, docCount);
     }
@@ -169,4 +177,79 @@ public class AssertingPostingsFormat ext
       return in.getComparator();
     }
   }
+  
+  static enum PostingsConsumerState { INITIAL, START };
+  static class AssertingPostingsConsumer extends PostingsConsumer {
+    private final PostingsConsumer in;
+    private final FieldInfo fieldInfo;
+    private final OpenBitSet visitedDocs;
+    private PostingsConsumerState state = PostingsConsumerState.INITIAL;
+    private int freq;
+    private int positionCount;
+    private int lastPosition = 0;
+    private int lastStartOffset = 0;
+    int docFreq = 0;
+    long totalTermFreq = 0;
+    
+    AssertingPostingsConsumer(PostingsConsumer in, FieldInfo fieldInfo, OpenBitSet visitedDocs) {
+      this.in = in;
+      this.fieldInfo = fieldInfo;
+      this.visitedDocs = visitedDocs;
+    }
+
+    @Override
+    public void startDoc(int docID, int freq) throws IOException {
+      assert state == PostingsConsumerState.INITIAL;
+      state = PostingsConsumerState.START;
+      assert docID >= 0;
+      if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
+        assert freq == -1;
+        this.freq = 0; // we don't expect any positions here
+      } else {
+        assert freq > 0;
+        this.freq = freq;
+        totalTermFreq += freq;
+      }
+      this.positionCount = 0;
+      this.lastPosition = 0;
+      this.lastStartOffset = 0;
+      docFreq++;
+      visitedDocs.set(docID);
+      in.startDoc(docID, freq);
+    }
+
+    @Override
+    public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
+      assert state == PostingsConsumerState.START;
+      assert positionCount < freq;
+      positionCount++;
+      assert position >= lastPosition || position == -1; /* we still allow -1 from old 3.x indexes */
+      lastPosition = position;
+      if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) {
+        assert startOffset >= 0;
+        assert startOffset >= lastStartOffset;
+        lastStartOffset = startOffset;
+        assert endOffset >= startOffset;
+      } else {
+        assert startOffset == -1;
+        assert endOffset == -1;
+      }
+      if (payload != null) {
+        assert fieldInfo.hasPayloads();
+      }
+      in.addPosition(position, payload, startOffset, endOffset);
+    }
+
+    @Override
+    public void finishDoc() throws IOException {
+      assert state == PostingsConsumerState.START;
+      state = PostingsConsumerState.INITIAL;
+      if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
+        assert positionCount == 0; // we should not have fed any positions!
+      } else {
+        assert positionCount == freq;
+      }
+      in.finishDoc();
+    }
+  }
 }