You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by si...@apache.org on 2010/12/15 14:31:14 UTC

svn commit: r1049543 - in /lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search: MatchOnlyTermScorer.java TermQuery.java TermScorer.java

Author: simonw
Date: Wed Dec 15 13:31:13 2010
New Revision: 1049543

URL: http://svn.apache.org/viewvc?rev=1049543&view=rev
Log:
LUCENE-2723: further improvements / simplification for TermScorer. TermWeight creates a specialized MatchOnlyTermScorer if frequencies are not available

Added:
    lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/MatchOnlyTermScorer.java   (with props)
Modified:
    lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/TermQuery.java
    lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/TermScorer.java

Added: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/MatchOnlyTermScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/MatchOnlyTermScorer.java?rev=1049543&view=auto
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/MatchOnlyTermScorer.java (added)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/MatchOnlyTermScorer.java Wed Dec 15 13:31:13 2010
@@ -0,0 +1,244 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.BulkPostingsEnum;
+import org.apache.lucene.index.BulkPostingsEnum.BlockReader;
+import org.apache.lucene.util.Bits;
+
+/** Expert: A <code>Scorer</code> for documents matching a <code>Term</code> whose postings
+ * expose no frequencies (the omitTF=true case); {@link #freq()} is therefore always 1.
+ */
+final class MatchOnlyTermScorer extends Scorer {
+  private final BulkPostingsEnum docsEnum; // used by advance() to jump towards a target
+  private final byte[] norms; // may be null, in which case score() returns rawScore
+  private int doc; // current doc ID, accumulated from the doc deltas
+
+  private final int[] docDeltas; // buffer obtained from docDeltasReader.getBuffer()
+  private int docPointer; // index of the current entry in docDeltas
+  private int docPointerMax; // exclusive upper bound for docPointer in the current block
+  private boolean first = true; // cleared once nextDoc()/advance() has moved onto a doc
+
+  private final float rawScore; // tf(1) * weight value, computed once in the constructor
+  private final BlockReader docDeltasReader;
+  private final Bits skipDocs; // a set bit means the doc is skipped; null skips nothing
+  private final int docFreq; // number of postings to consume
+  private int count; // number of postings consumed so far
+
+  /**
+   * Construct a <code>MatchOnlyTermScorer</code>.
+   * 
+   * @param weight The weight of the <code>Term</code> in the query.
+   * @param td The postings of the <code>Term</code>; must not expose a freqs reader.
+   * @param docDeltasReader Reader supplying the buffered doc deltas.
+   * @param docFreq Number of postings to iterate for the <code>Term</code>.
+   * @param skipDocs Documents to skip (bit set = skipped), or null to skip none.
+   * @param similarity
+   *          The <code>Similarity</code> implementation to be used for score
+   *          computations.
+   * @param norms The field norms for the <code>Term</code>'s field, or null for none.
+   */
+  MatchOnlyTermScorer(Weight weight, BulkPostingsEnum td, BlockReader docDeltasReader, int docFreq, Bits skipDocs, Similarity similarity, byte[] norms) throws IOException {
+    super(similarity, weight);
+    // this scorer never reads frequencies, so the enum must not provide them
+    assert td.getFreqsReader() == null;
+    
+    this.docsEnum = td;
+    this.docFreq = docFreq;
+    this.docDeltasReader = docDeltasReader;
+    docDeltas = docDeltasReader.getBuffer();
+    reset(); // leaves docPointer one before the reader's offset; the first increment lands on it
+
+    this.skipDocs = skipDocs;
+    this.norms = norms;
+    rawScore = getSimilarity().tf(1f) * weight.getValue();
+  }
+  // Collects without an upper bound; nextDoc() positions the scorer before collection starts.
+  @Override
+  public void score(Collector c) throws IOException {
+    score(c, Integer.MAX_VALUE, nextDoc());
+  }
+  // Collects docs below 'end'; returns false once all docFreq postings are consumed, else true.
+  // firstDocID is ignored since nextDoc() sets 'doc'
+  @Override
+  protected boolean score(Collector c, int end, int firstDocID) throws IOException {
+    c.setScorer(this);
+    // nocommit -- this can leave scorer on a deleted doc...
+    while (doc < end) {                           // for docs in window
+      if (skipDocs == null || !skipDocs.get(doc)) {
+        c.collect(doc);                      // collect
+      }
+      if (count == docFreq) { // every posting has been consumed
+        doc = NO_MORE_DOCS;
+        return false;
+      }
+      count++;
+      fillDocDeltas(); // step docPointer, refilling the buffer if the block is exhausted
+      doc += docDeltas[docPointer];
+    }
+    return true;
+  }
+
+
+  // Reports -1 while 'first' is still set, otherwise the current doc.
+  @Override
+  public int docID() {
+    return first ? -1 : doc;
+  }
+
+  @Override
+  public float freq() {
+    return 1.0f;
+  }
+
+  /**
+   * Advances to the next document matching the query. <br>
+   * Doc deltas are read from the buffer of the <code>BlockReader</code>, which is
+   * refilled as needed; documents flagged in <code>skipDocs</code> are skipped.
+   * 
+   * @return the document matching the query or NO_MORE_DOCS if there are no more documents.
+   */
+  @Override
+  public int nextDoc() throws IOException {
+    while(count < docFreq) {
+      fillDocDeltas(); 
+      count++;
+      doc += docDeltas[docPointer];
+      first = false;
+      assert doc >= 0 && (skipDocs == null || doc < skipDocs.length()) && doc != NO_MORE_DOCS: "doc=" + doc + " skipDocs=" + skipDocs + " skipDocs.length=" + (skipDocs==null? "n/a" : skipDocs.length());
+      if (skipDocs == null || !skipDocs.get(doc)) {
+        return doc;
+      }
+    }
+
+    return doc = NO_MORE_DOCS;
+  }
+  // Returns rawScore, scaled by the decoded norm of the current doc when norms are present.
+  @Override
+  public float score() {
+    assert !first;
+    assert doc != NO_MORE_DOCS;
+
+    return norms == null ? rawScore : rawScore * getSimilarity().decodeNormValue(norms[doc]); // normalize for field
+  }
+
+  /**
+   * Advances to the first match beyond the current whose document number is
+   * greater than or equal to a given target. <br>
+   * Scans the current block first, then tries <code>BulkPostingsEnum.jump</code>.
+   * 
+   * @param target
+   *          The target document number.
+   * @return the matching document or NO_MORE_DOCS if none exist.
+   */
+  @Override
+  public int advance(final int target) throws IOException {
+
+    // nocommit: should we, here, optimize .advance(target that isn't
+    // too far away) into scan?  seems like simple win?
+
+    // first scan current doc deltas block
+    for (docPointer++; docPointer < docPointerMax && count < docFreq; docPointer++) {
+      assert first || docDeltas[docPointer] > 0;
+      doc += docDeltas[docPointer];
+      first = false;
+      count++;
+
+      if (doc >= target && (skipDocs == null || !skipDocs.get(doc))) {
+        return doc;
+      }
+    }
+
+    if (count == docFreq) { // every posting was consumed without reaching the target
+      return doc = NO_MORE_DOCS;
+    }
+
+    // not found in current block, seek underlying stream
+    final BulkPostingsEnum.JumpResult jumpResult;
+    if (target - doc > docDeltas.length && // avoid useless jumps
+        (jumpResult = docsEnum.jump(target, count)) != null) {
+      count = jumpResult.count; // adopt the position reported by the jump
+      doc = jumpResult.docID;
+      first = false;
+      reset(); // re-read the buffer bounds from the reader after the jump
+    } else {
+      // seek did not jump -- just fill next buffer
+      docPointerMax = docDeltasReader.fill();
+      if (docPointerMax != 0) {
+        docPointer = 0;
+        assert first || docDeltas[0] > 0;
+        doc += docDeltas[0];
+        count++;
+        first = false;
+      } else {
+        return doc = NO_MORE_DOCS;
+      }
+    }
+
+    // now scan
+    return scan(target);
+  }
+  // Steps through postings from the current position until doc >= target and doc is not skipped.
+  private int scan(final int target) throws IOException {
+    while(true) {
+      assert doc >= 0 && doc != NO_MORE_DOCS;
+      if (doc >= target && (skipDocs == null || !skipDocs.get(doc))) {
+        return doc;
+      }
+
+      if (count >= docFreq) { // nothing left to consume
+        break;
+      }
+
+      if (++docPointer >= docPointerMax) { // current block exhausted, fetch the next one
+        docPointerMax = docDeltasReader.fill();
+        if (docPointerMax != 0) {
+          docPointer = 0;
+        } else {
+          return doc = NO_MORE_DOCS;
+        }
+      }
+
+      assert first || docDeltas[docPointer] > 0;
+      doc += docDeltas[docPointer];
+      count++;
+    }
+    return doc = NO_MORE_DOCS;
+  }
+  // Moves docPointer to the next delta, refilling the buffer when the current block is exhausted.
+  private void fillDocDeltas() throws IOException {
+    if (++docPointer >= docPointerMax) {
+      docPointerMax = docDeltasReader.fill();
+      assert docPointerMax != 0;
+      docPointer = 0;
+    }
+  }
+  // Re-reads the buffer bounds from the reader; docPointer is left one before offset().
+  private void reset() throws IOException {
+    docPointerMax = docDeltasReader.end();
+    docPointer = docDeltasReader.offset();
+    docPointer--; // callers pre-increment before reading docDeltas
+  }
+  
+  /** Returns a string representation of this <code>MatchOnlyTermScorer</code>. */
+  @Override
+  public String toString() { return "scorer(" + weight + ")"; }
+
+}

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/TermQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/TermQuery.java?rev=1049543&r1=1049542&r2=1049543&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/TermQuery.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/TermQuery.java Wed Dec 15 13:31:13 2010
@@ -24,6 +24,7 @@ import org.apache.lucene.index.DocsEnum;
 import org.apache.lucene.index.BulkPostingsEnum;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.BulkPostingsEnum.BlockReader;
 import org.apache.lucene.search.Explanation.IDFExplanation;
 import org.apache.lucene.util.ToStringUtils;
 
@@ -85,10 +86,17 @@ public class TermQuery extends Query {
       if (docs == null) {
         return null;
       }
-
       // nocommit: we need this docfreq from TermState, MTQ knows it... but tosses it away.
-      return new TermScorer(this, docs, reader.docFreq(term.field(), term.bytes()),
-                            reader.getDeletedDocs(), similarity, reader.norms(term.field()));
+      final int docFreq = reader.docFreq(term.field(), term.bytes());
+      final BlockReader docDeltas = docs.getDocDeltasReader();
+      final BlockReader frequencies = docs.getFreqsReader();
+      if (frequencies == null) {
+        return new MatchOnlyTermScorer(this, docs, docDeltas, docFreq,
+            reader.getDeletedDocs(), similarity, reader.norms(term.field()));
+      } else {
+        return new TermScorer(this, docs, docDeltas, frequencies, docFreq,
+            reader.getDeletedDocs(), similarity, reader.norms(term.field()));
+      }
     }
 
     @Override

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/TermScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/TermScorer.java?rev=1049543&r1=1049542&r2=1049543&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/TermScorer.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/TermScorer.java Wed Dec 15 13:31:13 2010
@@ -20,6 +20,7 @@ package org.apache.lucene.search;
 import java.io.IOException;
 
 import org.apache.lucene.index.BulkPostingsEnum;
+import org.apache.lucene.index.BulkPostingsEnum.BlockReader;
 import org.apache.lucene.util.Bits;
 
 // nocommit -- break out aligned & not cases?
@@ -28,9 +29,9 @@ import org.apache.lucene.util.Bits;
 /** Expert: A <code>Scorer</code> for documents matching a <code>Term</code>.
  */
 final class TermScorer extends Scorer {
-  private BulkPostingsEnum docsEnum;
-  private byte[] norms;
-  private float weightValue;
+  private final BulkPostingsEnum docsEnum;
+  private final byte[] norms;
+  private final float weightValue;
   private int doc;
 
   private final int[] docDeltas;
@@ -43,9 +44,9 @@ final class TermScorer extends Scorer {
   private int freqPointerMax;
 
   private static final int SCORE_CACHE_SIZE = 32;
-  private float[] scoreCache = new float[SCORE_CACHE_SIZE];
-  private final BulkPostingsEnum.BlockReader freqsReader;
-  private final BulkPostingsEnum.BlockReader docDeltasReader;
+  private final float[] scoreCache = new float[SCORE_CACHE_SIZE];
+  private final BlockReader freqsReader;
+  private final BlockReader docDeltasReader;
   private final Bits skipDocs;
   private final int docFreq;
   private int count;
@@ -63,27 +64,15 @@ final class TermScorer extends Scorer {
    * @param norms
    *          The field norms of the document fields for the <code>Term</code>.
    */
-  TermScorer(Weight weight, BulkPostingsEnum td, int docFreq, Bits skipDocs, Similarity similarity, byte[] norms) throws IOException {
+  TermScorer(Weight weight, BulkPostingsEnum td, BlockReader docDeltaReader, BlockReader freqReader, int docFreq, Bits skipDocs, Similarity similarity, byte[] norms) throws IOException {
     super(similarity, weight);
-    
     this.docsEnum = td;
     this.docFreq = docFreq;
-    docDeltasReader = td.getDocDeltasReader();
+    this.docDeltasReader = docDeltaReader;
     docDeltas = docDeltasReader.getBuffer();
-    docPointerMax = docDeltasReader.end();
-    docPointer = docDeltasReader.offset();
-    docPointer--;
-
-    freqsReader = td.getFreqsReader();
-    if (freqsReader != null) {
-      freqs = freqsReader.getBuffer();
-      freqPointerMax = freqsReader.end();
-      freqPointer = freqsReader.offset();
-      freqPointer--;
-    } else {
-      freqs = null;
-    }
-
+    this.freqsReader = freqReader;
+    freqs = freqsReader.getBuffer();
+    reset();
     this.skipDocs = skipDocs;
     this.norms = norms;
     this.weightValue = weight.getValue();
@@ -101,11 +90,9 @@ final class TermScorer extends Scorer {
   @Override
   protected boolean score(Collector c, int end, int firstDocID) throws IOException {
     c.setScorer(this);
-    //System.out.println("ts.collect firstdocID=" + firstDocID + " term=" + term + " end=" + end + " doc=" + doc);
     // nocommit -- this can leave scorer on a deleted doc...
     while (doc < end) {                           // for docs in window
       if (skipDocs == null || !skipDocs.get(doc)) {
-        //System.out.println("ts.collect doc=" + doc + " skipDocs=" + skipDocs + " count=" + count + " vs dF=" + docFreq);
         c.collect(doc);                      // collect
       }
       if (count == docFreq) {
@@ -113,40 +100,8 @@ final class TermScorer extends Scorer {
         return false;
       }
       count++;
-      docPointer++;
-
-      //System.out.println("dp=" + docPointer + " dpMax=" + docPointerMax + " count=" + count + " countMax=" + docFreq);
-
-      if (docPointer >= docPointerMax) {
-        docPointerMax = docDeltasReader.fill();
-        //System.out.println("    refill!  dpMax=" + docPointerMax + " reader=" + docDeltasReader);
-        assert docPointerMax != 0;
-        docPointer = 0;
-
-        if (freqsReader != null) {
-          freqPointer++;
-          // NOTE: this code is intentionally dup'd
-          // (specialized) w/ the else clause, for better CPU
-          // branch prediction (assuming compiler doesn't
-          // de-dup): for codecs that always bulk read same
-          // number of docDeltas & freqs (standard, for,
-          // pfor), this if will always be true.  Other codecs
-          // (simple9/16) will not be aligned:
-          if (freqPointer >= freqPointerMax) {
-            freqPointerMax = freqsReader.fill();
-            assert freqPointerMax != 0;
-            freqPointer = 0;
-          }
-        }
-      } else if (freqsReader != null) {
-        freqPointer++;
-        if (freqPointer >= freqPointerMax) {
-          freqPointerMax = freqsReader.fill();
-          assert freqPointerMax != 0;
-          freqPointer = 0;
-        }
-      }
-
+      fillDeltas();
+      fillFreq();
       doc += docDeltas[docPointer];
     }
     return true;
@@ -159,11 +114,7 @@ final class TermScorer extends Scorer {
 
   @Override
   public float freq() {
-    if (freqsReader != null) {
-      return freqs[freqPointer];
-    } else {
-      return 1.0f;
-    }
+    return freqs[freqPointer];
   }
 
   /**
@@ -175,64 +126,25 @@ final class TermScorer extends Scorer {
    */
   @Override
   public int nextDoc() throws IOException {
-    //System.out.println("ts.nextDoc " + this + " count=" + count + " vs docFreq=" + docFreq);
     while(count < docFreq) {
-      docPointer++;
-      if (docPointer >= docPointerMax) {
-        //System.out.println("ts.nd refill docs");
-        docPointerMax = docDeltasReader.fill();
-        assert docPointerMax != 0;
-        docPointer = 0;
-        if (freqsReader != null) {
-          // NOTE: this code is intentionally dup'd
-          // (specialized) w/ the else clause, for better CPU
-          // branch prediction (assuming compiler doesn't
-          // de-dup): for codecs that always bulk read same
-          // number of docDeltas & freqs (standard, for,
-          // pfor), this if will always be true.  Other codecs
-          // (simple9/16) will not be aligned:
-          freqPointer++;
-          if (freqPointer >= freqPointerMax) {
-            //System.out.println("ts.nd refill freqs");
-            freqPointerMax = freqsReader.fill();
-            assert freqPointerMax != 0;
-            freqPointer = 0;
-          }
-        }
-      } else {
-        if (freqsReader != null) {
-          freqPointer++;
-          if (freqPointer >= freqPointerMax) {
-            //System.out.println("ts.nd refill freqs");
-            freqPointerMax = freqsReader.fill();
-            assert freqPointerMax != 0;
-            freqPointer = 0;
-          }
-        }
-      }
+      fillDeltas();
+      fillFreq();
       count++;
       doc += docDeltas[docPointer];
       first = false;
       assert doc >= 0 && (skipDocs == null || doc < skipDocs.length()) && doc != NO_MORE_DOCS: "doc=" + doc + " skipDocs=" + skipDocs + " skipDocs.length=" + (skipDocs==null? "n/a" : skipDocs.length());
       if (skipDocs == null || !skipDocs.get(doc)) {
-        //System.out.println("  ret doc=" + doc + " freq=" + freq());
         return doc;
       }
     }
 
-    //System.out.println("  end");
     return doc = NO_MORE_DOCS;
   }
-  
+
   @Override
   public float score() {
     assert !first;
-    final int freq;
-    if (freqsReader == null) {
-      freq = 1;
-    } else {
-      freq = freqs[freqPointer];
-    }
+    final int freq = freqs[freqPointer];
     assert freq > 0;
     assert doc != NO_MORE_DOCS;
     float raw =                                   // compute tf(f)*weight
@@ -253,7 +165,7 @@ final class TermScorer extends Scorer {
    * @return the matching document or NO_MORE_DOCS if none exist.
    */
   @Override
-  public int advance(int target) throws IOException {
+  public int advance(final int target) throws IOException {
 
     // nocommit: should we, here, optimize .advance(target that isn't
     // too far away) into scan?  seems like simple win?
@@ -264,11 +176,7 @@ final class TermScorer extends Scorer {
       doc += docDeltas[docPointer];
       first = false;
       count++;
-      if (freqsReader != null && ++freqPointer >= freqPointerMax) {
-        freqPointerMax = freqsReader.fill();
-        assert freqPointerMax != 0;
-        freqPointer = 0;
-      } 
+      fillFreq();
       if (doc >= target && (skipDocs == null || !skipDocs.get(doc))) {
         return doc;
       }
@@ -279,20 +187,13 @@ final class TermScorer extends Scorer {
     }
 
     // not found in current block, seek underlying stream
-    BulkPostingsEnum.JumpResult jumpResult;
+    final BulkPostingsEnum.JumpResult jumpResult;
     if (target - doc > docDeltas.length && // avoid useless jumps
         (jumpResult = docsEnum.jump(target, count)) != null) {
       count = jumpResult.count;
       doc = jumpResult.docID;
       first = false;
-      docPointer = docDeltasReader.offset();
-      docPointerMax = docDeltasReader.end();
-      docPointer--;
-      if (freqsReader != null) {
-        freqPointer = freqsReader.offset();
-        freqPointerMax = freqsReader.end();
-        freqPointer--;
-      }
+      reset();
     } else {
       // seek did not jump -- just fill next buffer
       docPointerMax = docDeltasReader.fill();
@@ -305,14 +206,14 @@ final class TermScorer extends Scorer {
       } else {
         return doc = NO_MORE_DOCS;
       }
-      if (freqsReader != null && ++freqPointer >= freqPointerMax) {
-        freqPointerMax = freqsReader.fill();
-        assert freqPointerMax != 0;
-        freqPointer = 0;
-      } 
+     fillFreq();
     }
 
-    // now scan
+    // now scan -- let the compiler inline this
+    return scan(target);
+  }
+
+  private int scan(final int target) throws IOException {
     while(true) {
       assert doc >= 0 && doc != NO_MORE_DOCS;
       if (doc >= target && (skipDocs == null || !skipDocs.get(doc))) {
@@ -332,12 +233,7 @@ final class TermScorer extends Scorer {
         }
       }
 
-      if (freqsReader != null && ++freqPointer >= freqPointerMax) {
-        freqPointerMax = freqsReader.fill();
-        assert freqPointerMax != 0;
-        freqPointer = 0;
-      } 
-
+      fillFreq();
       assert first || docDeltas[docPointer] > 0;
       doc += docDeltas[docPointer];
       count++;
@@ -348,5 +244,29 @@ final class TermScorer extends Scorer {
   /** Returns a string representation of this <code>TermScorer</code>. */
   @Override
   public String toString() { return "scorer(" + weight + ")"; }
-
+  
+  private final void fillFreq() throws IOException { // steps freqPointer by one, refilling if needed
+    if (++freqPointer >= freqPointerMax) { // current freq block exhausted
+      freqPointerMax = freqsReader.fill();
+      assert freqPointerMax != 0;
+      freqPointer = 0; // restart at the beginning of the refilled buffer
+    }
+  }
+  
+  private void fillDeltas() throws IOException { // steps docPointer by one, refilling if needed
+    if (++docPointer >= docPointerMax) { // current doc-delta block exhausted
+      docPointerMax = docDeltasReader.fill();
+      assert docPointerMax != 0;
+      docPointer = 0; // restart at the beginning of the refilled buffer
+    }
+  }
+  
+  private final void reset() throws IOException { // re-reads both buffers' bounds from their readers
+    docPointer = docDeltasReader.offset();
+    docPointerMax = docDeltasReader.end();
+    freqPointer = freqsReader.offset();
+    freqPointerMax = freqsReader.end();
+    --docPointer; // both pointers are left one before offset();
+    --freqPointer; // fillDeltas()/fillFreq() pre-increment before the buffers are read
+  }
 }