You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by gs...@apache.org on 2007/04/24 14:49:43 UTC

svn commit: r531913 - in /lucene/java/trunk/src: java/org/apache/lucene/search/ java/org/apache/lucene/search/payloads/ java/org/apache/lucene/search/spans/ test/org/apache/lucene/search/payloads/

Author: gsingers
Date: Tue Apr 24 05:49:42 2007
New Revision: 531913

URL: http://svn.apache.org/viewvc?view=rev&rev=531913
Log:
LUCENE-834:
Added documentation to Similarity class

Changed BoostingTermQuery to score all occurrences of the term in a document by making the underlying setFreqCurrentDoc() method protected so that it can be overridden in BTQ.  Updated the tests to have multiple occurrences of a term per document with different payloads, so that the payload scores can be averaged.

Modified:
    lucene/java/trunk/src/java/org/apache/lucene/search/Similarity.java
    lucene/java/trunk/src/java/org/apache/lucene/search/payloads/BoostingTermQuery.java
    lucene/java/trunk/src/java/org/apache/lucene/search/spans/SpanScorer.java
    lucene/java/trunk/src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java

Modified: lucene/java/trunk/src/java/org/apache/lucene/search/Similarity.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/Similarity.java?view=diff&rev=531913&r1=531912&r2=531913
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/Similarity.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/Similarity.java Tue Apr 24 05:49:42 2007
@@ -513,6 +513,8 @@
    * The default implementation returns 1.
    *
    * @param payload The payload byte array to be scored
+   * @param offset The offset into the payload array
+   * @param length The length in the array
    * @return An implementation dependent float to be used as a scoring factor 
    *  <b>
    *  Warning: The status of the Payloads feature is experimental. The APIs

Modified: lucene/java/trunk/src/java/org/apache/lucene/search/payloads/BoostingTermQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/payloads/BoostingTermQuery.java?view=diff&rev=531913&r1=531912&r2=531913
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/payloads/BoostingTermQuery.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/payloads/BoostingTermQuery.java Tue Apr 24 05:49:42 2007
@@ -34,6 +34,8 @@
  * <p>
  * In order to take advantage of this, you must override {@link org.apache.lucene.search.Similarity#scorePayload(byte[],int,int)}
  * which returns 1 by default.
+ * <p>
+ * Payload scores are averaged across term occurrences in the document.  
  * 
  *
  * @see org.apache.lucene.search.Similarity#scorePayload(byte[], int, int)
@@ -50,7 +52,7 @@
     return new BoostingTermWeight(this, searcher);
   }
 
-  private class BoostingTermWeight extends SpanWeight implements Weight {
+  protected class BoostingTermWeight extends SpanWeight implements Weight {
 
 
     public BoostingTermWeight(BoostingTermQuery query, Searcher searcher) throws IOException {
@@ -70,7 +72,8 @@
       //TODO: is this the best way to allocate this?
       byte[] payload = new byte[256];
       private TermPositions positions;
-
+      protected float payloadScore;
+      private int payloadsSeen;
 
       public BoostingSpanScorer(TermSpans spans, Weight weight,
                                 Similarity similarity, byte[] norms) throws IOException {
@@ -79,12 +82,17 @@
 
       }
 
-      public boolean next() throws IOException {
+      /**
+       * Go to the next document
+       * 
+       */
+      /*public boolean next() throws IOException {
 
         boolean result = super.next();
         //set the payload.  super.next() properly increments the term positions
         if (result) {
-          loadPayload();
+          //Load the payloads for all 
+          processPayload();
         }
 
         return result;
@@ -94,15 +102,38 @@
         boolean result = super.skipTo(target);
 
         if (result) {
-          loadPayload();
+          processPayload();
         }
 
         return result;
+      }*/
+
+      protected boolean setFreqCurrentDoc() throws IOException {
+        if (!more) {
+          return false;
+        }
+        doc = spans.doc();
+        freq = 0.0f;
+        payloadScore = 0;
+        payloadsSeen = 0;
+        Similarity similarity1 = getSimilarity();
+        while (more && doc == spans.doc()) {
+          int matchLength = spans.end() - spans.start();
+
+          freq += similarity1.sloppyFreq(matchLength);
+          processPayload(similarity1);
+
+          more = spans.next();//this moves positions to the next match in this document
+        }
+        return more || (freq != 0);
       }
 
-      private void loadPayload() throws IOException {
+
+      protected void processPayload(Similarity similarity) throws IOException {
         if (positions.isPayloadAvailable()) {
           payload = positions.getPayload(payload, 0);
+          payloadScore += similarity.scorePayload(payload, 0, positions.getPayloadLength());
+          payloadsSeen++;
 
         } else {
           //zero out the payload?
@@ -112,8 +143,7 @@
 
       public float score() throws IOException {
 
-        int payLength = positions.getPayloadLength();
-        return super.score() * (payLength > 0 ? getSimilarity().scorePayload(payload, 0, payLength) : 1);
+        return super.score() * (payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1);
       }
 
 
@@ -127,14 +157,15 @@
         result.addDetail(payloadBoost);
 /*
         if (skipTo(doc) == true) {
-          loadPayload();
+          processPayload();
         }
 */
-        float payloadScore = getSimilarity().scorePayload(payload, 0, positions.getPayloadLength());
-        payloadBoost.setValue(payloadScore);
+
+        float avgPayloadScore = payloadScore / payloadsSeen;
+        payloadBoost.setValue(avgPayloadScore);
         //GSI: I suppose we could toString the payload, but I don't think that would be a good idea 
         payloadBoost.setDescription("scorePayload(...)");
-        result.setValue(nonPayloadExpl.getValue() * payloadScore);
+        result.setValue(nonPayloadExpl.getValue() * avgPayloadScore);
         result.setDescription("btq");
         return result;
       }

Modified: lucene/java/trunk/src/java/org/apache/lucene/search/spans/SpanScorer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/spans/SpanScorer.java?view=diff&rev=531913&r1=531912&r2=531913
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/spans/SpanScorer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/spans/SpanScorer.java Tue Apr 24 05:49:42 2007
@@ -71,7 +71,7 @@
     return setFreqCurrentDoc();
   }
 
-  private boolean setFreqCurrentDoc() throws IOException {
+  protected boolean setFreqCurrentDoc() throws IOException {
     if (! more) {
       return false;
     }

Modified: lucene/java/trunk/src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java?view=diff&rev=531913&r1=531912&r2=531913
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java Tue Apr 24 05:49:42 2007
@@ -35,6 +35,9 @@
 public class TestBoostingTermQuery extends TestCase {
   private IndexSearcher searcher;
   private BoostingSimilarity similarity = new BoostingSimilarity();
+  private byte[] payloadField = new byte[]{1};
+  private byte[] payloadMultiField1 = new byte[]{2};
+  private byte[] payloadMultiField2 = new byte[]{4};
 
   public TestBoostingTermQuery(String s) {
     super(s);
@@ -43,24 +46,43 @@
   private class PayloadAnalyzer extends Analyzer {
 
 
+
     public TokenStream tokenStream(String fieldName, Reader reader) {
       TokenStream result = new LowerCaseTokenizer(reader);
-      result = new PayloadFilter(result);
+      result = new PayloadFilter(result, fieldName);
       return result;
     }
   }
 
   private class PayloadFilter extends TokenFilter {
+    String fieldName;
+    int numSeen = 0;
 
-
-    public PayloadFilter(TokenStream input) {
+    public PayloadFilter(TokenStream input, String fieldName) {
       super(input);
+      this.fieldName = fieldName;
     }
 
     public Token next() throws IOException {
       Token result = input.next();
       if (result != null) {
-        result.setPayload(new Payload(encodePayload(result.termText()), 0, 4));
+        if (fieldName.equals("field"))
+        {
+          result.setPayload(new Payload(payloadField));
+        }
+        else if (fieldName.equals("multiField"))
+        {
+          if (numSeen  % 2 == 0)
+          {
+            result.setPayload(new Payload(payloadMultiField1));
+          }
+          else
+          {
+            result.setPayload(new Payload(payloadMultiField2));
+          }
+          numSeen++;
+        }
+
       }
       return result;
     }
@@ -76,6 +98,7 @@
     for (int i = 0; i < 1000; i++) {
       Document doc = new Document();
       doc.add(new Field("field", English.intToEnglish(i), Field.Store.YES, Field.Index.TOKENIZED));
+      doc.add(new Field("multiField", English.intToEnglish(i) + "  " + English.intToEnglish(i), Field.Store.YES, Field.Index.TOKENIZED));
       writer.addDocument(doc);
     }
     //writer.optimize();
@@ -85,29 +108,9 @@
     searcher.setSimilarity(similarity);
   }
 
-  private byte[] encodePayload(String englishInt)
-  {
-    int i = englishInt.hashCode();
-    byte[] bytes = new byte[4];
-    bytes[0] = (byte) (i >>> 24);
-    bytes[1] = (byte) (i >>> 16);
-    bytes[2] = (byte) (i >>> 8);
-    bytes[3] = (byte) i;
-    return bytes;
-  }
 
-  private int decodePayload(byte[] payload, int size)
-  {
-    //This should be equal to the hash code of the String representing the English int from English.intToEnglish
-    int result = (payload[0] << 24) | (payload[1] << 16) | (payload[2] << 8) | (payload[3]);
-    
-    /*assertEquals((byte) (size >>> 24), payload[0]);
-    assertEquals((byte) (size >>> 16), payload[1]);
-    assertEquals((byte) (size >>> 8), payload[2]);
-    assertEquals((byte) size, payload[3]);*/
 
-    return result;
-  }
+
 
   protected void tearDown() {
 
@@ -121,12 +124,11 @@
 
     //they should all have the exact same score, because they all contain seventy once, and we set
     //all the other similarity factors to be 1
-    //This score should be 1, since we normalize scores
-    int seventyHash = "seventy".hashCode();
-    assertTrue("score " + hits.getMaxScore() + " does not equal 'seventy' hashcode: " + seventyHash, hits.getMaxScore() == seventyHash);
+
+    assertTrue(hits.getMaxScore() + " does not equal: " + 1, hits.getMaxScore() == 1);
     for (int i = 0; i < hits.scoreDocs.length; i++) {
       ScoreDoc doc = hits.scoreDocs[i];
-      assertTrue("score " + doc.score + " does not equal 'seventy' hashcode: " + seventyHash, doc.score == seventyHash);
+      assertTrue(doc.score + " does not equal: " + 1, doc.score == 1);
     }
     CheckHits.checkExplanations(query, "field", searcher);
     Spans spans = query.getSpans(searcher.getIndexReader());
@@ -140,6 +142,48 @@
 
   }
 
+  public void testMultipleMatchesPerDoc() throws Exception {
+    BoostingTermQuery query = new BoostingTermQuery(new Term("multiField", "seventy"));
+    TopDocs hits = searcher.search(query, null, 100);
+    assertTrue("hits is null and it shouldn't be", hits != null);
+    assertTrue("hits Size: " + hits.totalHits + " is not: " + 100, hits.totalHits == 100);
+
+
+    //they should all have the exact same score, because they all contain seventy once, and we set
+    //all the other similarity factors to be 1
+
+    //System.out.println("Hash: " + seventyHash + " Twice Hash: " + 2*seventyHash);
+    assertTrue(hits.getMaxScore() + " does not equal: " + 3, hits.getMaxScore() == 3);
+    //there should be exactly 10 items that score a 3, all the rest should score a 2
+    //The 10 items are: 70 + i*100 where i in [0-9]
+    int numTens = 0;
+    for (int i = 0; i < hits.scoreDocs.length; i++) {
+      ScoreDoc doc = hits.scoreDocs[i];
+      if (doc.doc % 10 == 0)
+      {
+        numTens++;
+        assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);
+      }
+      else
+      {
+        assertTrue(doc.score + " does not equal: " + 2, doc.score == 2);
+      }
+    }
+    assertTrue(numTens + " does not equal: " + 10, numTens == 10);
+    CheckHits.checkExplanations(query, "field", searcher);
+    Spans spans = query.getSpans(searcher.getIndexReader());
+    assertTrue("spans is null and it shouldn't be", spans != null);
+    assertTrue("spans is not an instanceof " + TermSpans.class, spans instanceof TermSpans);
+    //should be two matches per document
+    int count = 0;
+    //100 hits times 2 matches per hit, we should have 200 in count
+    while (spans.next())
+    {
+      count++;
+    }
+    assertTrue(count + " does not equal: " + 200, count == 200);
+  }
+
   public void testNoMatch() throws Exception {
     BoostingTermQuery query = new BoostingTermQuery(new Term("field", "junk"));
     TopDocs hits = searcher.search(query, null, 100);
@@ -155,7 +199,7 @@
     // TODO: Remove warning after API has been finalized
     public float scorePayload(byte[] payload, int offset, int length) {
       //we know it is size 4 here, so ignore the offset/length
-      return decodePayload(payload,4);
+      return payload[0];
     }
 
     //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!