You are viewing a plain text version of this content. The canonical link for it is here.

Posted to java-user@lucene.apache.org by Peter Keegan <pe...@gmail.com> on 2007/10/03 00:44:46 UTC

BoostingTermQuery performance

I have been experimenting with payloads and BoostingTermQuery, which I think
are excellent additions to Lucene core. Currently, BoostingTermQuery extends
SpanQuery. I would suggest changing this class to extend TermQuery and
refactor the current version to something like 'BoostingSpanQuery'.

The reason is rooted in performance. In my testing, I compared query
throughput using TermQuery against 2 versions of BoostingTermQuery - the
current one that extends SpanQuery and one that extends TermQuery (which
I've included, below). Here are the results (qps = queries per second):

TermQuery:    200 qps
BoostingTermQuery (extends SpanQuery): 97 qps
BoostingTermQuery (extends TermQuery): 130 qps

Here is a version of BoostingTermQuery that extends TermQuery. I had to
modify TermQuery and TermScorer to make them public. A code review would be
in order, and I would appreciate your comments on this suggestion.

Peter

-----------------------------------------

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.search.*;


import java.io.IOException;

/**
 * Copyright 2004 The Apache Software Foundation
 * <p/>
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * The BoostingTermQuery is very similar to the {@link
org.apache.lucene.search.spans.SpanTermQuery} except
 * that it factors in the value of the payload located at each of the
positions where the
 * {@link org.apache.lucene.index.Term} occurs.
 * <p>
 * In order to take advantage of this, you must override {@link
org.apache.lucene.search.Similarity#scorePayload(byte[],int,int)}
 * which returns 1 by default.
 * <p>
 * Payload scores are averaged across term occurrences in the document.
 *
 * <p><font color="#FF0000">
 * WARNING: The status of the <b>Payloads</b> feature is experimental.
 * The APIs introduced here might change in the future and will not be
 * supported anymore in such a case.</font>
 *
 * @see org.apache.lucene.search.Similarity#scorePayload(byte[], int, int)
 */
public class BoostingTermQuery extends TermQuery{
    Term term;
    Similarity similarity;

  public BoostingTermQuery(Term term) {
    super(term);
    this.term = term;

  }


  protected Weight createWeight(Searcher searcher) throws IOException {
    this.similarity = getSimilarity(searcher);
    return new BoostingTermWeight(this, searcher);
  }

  protected class BoostingTermWeight extends TermWeight implements Weight {


    public BoostingTermWeight(BoostingTermQuery query, Searcher searcher)
throws IOException {
      super(searcher);
    }




    public Scorer scorer(IndexReader reader) throws IOException {
      return new BoostingTermScorer(reader.termDocs(term),
reader.termPositions(term), this, similarity,
             reader.norms(term.field()));
    }

    class BoostingTermScorer extends TermScorer {

      //TODO: is this the best way to allocate this?
      byte[] payload = new byte[256];
      private TermPositions positions;
      protected float payloadScore;
      private int payloadsSeen;

      public BoostingTermScorer(TermDocs termDocs, TermPositions
termPositions, Weight weight,
                                Similarity similarity, byte[] norms) throws
IOException {
          super(weight, termDocs, similarity, norms);
          positions = termPositions;

      }

      /**
       * Go to the next document
       *
       */
      public boolean next() throws IOException {

        boolean result = super.next();
        //set the payload.  super.next() properly increments the term
positions
        if (result) {
          if (positions.skipTo(super.doc())) {
              positions.nextPosition();
              processPayload(similarity);
          }
        }

        return result;
      }

      public boolean skipTo(int target) throws IOException {
        boolean result = super.skipTo(target);

        if (result) {
            if (positions.skipTo(target)) {
                positions.nextPosition();
              processPayload(similarity);
          }
        }

        return result;
      }

//      protected boolean setFreqCurrentDoc() throws IOException {
//        if (!more) {
//          return false;
//        }
//        doc = spans.doc();
//        freq = 0.0f;
//        payloadScore = 0;
//        payloadsSeen = 0;
//        Similarity similarity1 = getSimilarity();
//        while (more && doc == spans.doc()) {
//          int matchLength = spans.end() - spans.start();
//
//          freq += similarity1.sloppyFreq(matchLength);
//          processPayload(similarity1);
//
//          more = spans.next();//this moves positions to the next match in
this document
//        }
//        return more || (freq != 0);
//      }


      protected void processPayload(Similarity similarity) throws
IOException {
        if (positions.isPayloadAvailable()) {
          payload = positions.getPayload(payload, 0);
          payloadScore += similarity.scorePayload(payload, 0,
positions.getPayloadLength());
          payloadsSeen++;

        } else {
          //zero out the payload?
        }

      }

      public float score() {

        return super.score() * (payloadsSeen > 0 ? (payloadScore /
payloadsSeen) : 1);
      }


      public Explanation explain(final int doc) throws IOException {
        Explanation result = new Explanation();
        Explanation nonPayloadExpl = super.explain(doc);
        result.addDetail(nonPayloadExpl);
        //QUESTION: Is there a wau to avoid this skipTo call?  We need to
know whether to load the payload or not

        Explanation payloadBoost = new Explanation();
        result.addDetail(payloadBoost);
/*
        if (skipTo(doc) == true) {
          processPayload();
        }
*/

        float avgPayloadScore =  (payloadsSeen > 0 ? (payloadScore /
payloadsSeen) : 1);
        payloadBoost.setValue(avgPayloadScore);
        //GSI: I suppose we could toString the payload, but I don't think
that would be a good idea
        payloadBoost.setDescription("scorePayload(...)");
        result.setValue(nonPayloadExpl.getValue() * avgPayloadScore);
        result.setDescription("btq, product of:");
        return result;
      }
    }

  }


  public boolean equals(Object o) {
    if (!(o instanceof BoostingTermQuery))
      return false;
    BoostingTermQuery other = (BoostingTermQuery) o;
    return (this.getBoost() == other.getBoost())
            && this.term.equals(other.term);
  }
}


Diffs for TermQuery, TermScorer:

Index: src/java/org/apache/lucene/search/TermQuery.java
===================================================================
--- src/java/org/apache/lucene/search/TermQuery.java    (revision 581018)
+++ src/java/org/apache/lucene/search/TermQuery.java    (working copy)
@@ -31,7 +31,7 @@
 public class TermQuery extends Query {
   private Term term;

-  private class TermWeight implements Weight {
+  public class TermWeight implements Weight {
     private Similarity similarity;
     private float value;
     private float idf;
Index: src/java/org/apache/lucene/search/TermScorer.java
===================================================================
--- src/java/org/apache/lucene/search/TermScorer.java    (revision 581018)
+++ src/java/org/apache/lucene/search/TermScorer.java    (working copy)
@@ -23,7 +23,7 @@

 /** Expert: A <code>Scorer</code> for documents matching a
<code>Term</code>.
  */
-final class TermScorer extends Scorer {
+public class TermScorer extends Scorer {
   private Weight weight;
   private TermDocs termDocs;
   private byte[] norms;
@@ -44,7 +44,7 @@
    * @param similarity The </code>Similarity</code> implementation to be
used for score computations.
    * @param norms The field norms of the document fields for the
<code>Term</code>.
    */
-  TermScorer(Weight weight, TermDocs td, Similarity similarity,
+  public TermScorer(Weight weight, TermDocs td, Similarity similarity,
              byte[] norms) {
     super(similarity);
     this.weight = weight;


Peter

Re: BoostingTermQuery performance

Posted by Grant Ingersoll <gs...@apache.org>.

https://issues.apache.org/jira/browse/LUCENE-1017


On Oct 2, 2007, at 8:25 PM, Mike Klaas wrote:

> On 2-Oct-07, at 3:44 PM, Peter Keegan wrote:
>
>> I have been experimenting with payloads and BoostingTermQuery,  
>> which I think
>> are excellent additions to Lucene core. Currently,  
>> BoostingTermQuery extends
>> SpanQuery. I would suggest changing this class to extend TermQuery  
>> and
>> refactor the current version to something like 'BoostingSpanQuery'.
>>
>> The reason is rooted in performance. In my testing, I compared query
>> throughput using TermQuery against 2 versions of BoostingTermQuery  
>> - the
>> current one that extends SpanQuery and one that extends TermQuery  
>> (which
>> I've included, below). Here are the results (qps = queries per  
>> second):
>>
>> TermQuery:    200 qps
>> BoostingTermQuery (extends SpanQuery): 97 qps
>> BoostingTermQuery (extends TermQuery): 130 qps
>>
>> Here is a version of BoostingTermQuery that extends TermQuery. I  
>> had to
>> modify TermQuery and TermScorer to make them public. A code review  
>> would be
>> in order, and I would appreciate your comments on this suggestion.
>
> Awesome!  I wasn't aware that there was such a difference.  With a  
> performance gap that large, it is definitely worth having the option.
>
> Payloads have the potential to be a heavily-used feature in Lucene,  
> and performance will be key for that.
>
> -Mike
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org
>

--------------------------
Grant Ingersoll
http://lucene.grantingersoll.com

Lucene Helpful Hints:
http://wiki.apache.org/lucene-java/BasicsOfPerformance
http://wiki.apache.org/lucene-java/LuceneFAQ



---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org

Re: BoostingTermQuery performance

Posted by Mike Klaas <mi...@gmail.com>.

On 2-Oct-07, at 3:44 PM, Peter Keegan wrote:

> I have been experimenting with payloads and BoostingTermQuery,  
> which I think
> are excellent additions to Lucene core. Currently,  
> BoostingTermQuery extends
> SpanQuery. I would suggest changing this class to extend TermQuery and
> refactor the current version to something like 'BoostingSpanQuery'.
>
> The reason is rooted in performance. In my testing, I compared query
> throughput using TermQuery against 2 versions of BoostingTermQuery  
> - the
> current one that extends SpanQuery and one that extends TermQuery  
> (which
> I've included, below). Here are the results (qps = queries per  
> second):
>
> TermQuery:    200 qps
> BoostingTermQuery (extends SpanQuery): 97 qps
> BoostingTermQuery (extends TermQuery): 130 qps
>
> Here is a version of BoostingTermQuery that extends TermQuery. I  
> had to
> modify TermQuery and TermScorer to make them public. A code review  
> would be
> in order, and I would appreciate your comments on this suggestion.

Awesome!  I wasn't aware that there was such a difference.  With a  
performance gap that large, it is definitely worth having the option.

Payloads have the potential to be a heavily-used feature in Lucene,  
and performance will be key for that.

-Mike

---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org

Re: BoostingTermQuery performance

Posted by Grant Ingersoll <gs...@apache.org>.

Hi Peter,

This sounds interesting.  Can you put this in JIRA as a patch,  
please?  I am slowly but surely working on Span query stuff, so  
hopefully I can get to it soon.

Thanks,
Grant
On Oct 2, 2007, at 6:44 PM, Peter Keegan wrote:

> I have been experimenting with payloads and BoostingTermQuery,  
> which I think
> are excellent additions to Lucene core. Currently,  
> BoostingTermQuery extends
> SpanQuery. I would suggest changing this class to extend TermQuery and
> refactor the current version to something like 'BoostingSpanQuery'.
>
> The reason is rooted in performance. In my testing, I compared query
> throughput using TermQuery against 2 versions of BoostingTermQuery  
> - the
> current one that extends SpanQuery and one that extends TermQuery  
> (which
> I've included, below). Here are the results (qps = queries per  
> second):
>
> TermQuery:    200 qps
> BoostingTermQuery (extends SpanQuery): 97 qps
> BoostingTermQuery (extends TermQuery): 130 qps
>
> Here is a version of BoostingTermQuery that extends TermQuery. I  
> had to
> modify TermQuery and TermScorer to make them public. A code review  
> would be
> in order, and I would appreciate your comments on this suggestion.
>
> Peter
>
> -----------------------------------------
>
> import org.apache.lucene.index.IndexReader;
> import org.apache.lucene.index.Term;
> import org.apache.lucene.index.TermDocs;
> import org.apache.lucene.index.TermPositions;
> import org.apache.lucene.search.*;
>
>
> import java.io.IOException;
>
> /**
>  * Copyright 2004 The Apache Software Foundation
>  * <p/>
>  * Licensed under the Apache License, Version 2.0 (the "License");
>  * you may not use this file except in compliance with the License.
>  * You may obtain a copy of the License at
>  * <p/>
>  * http://www.apache.org/licenses/LICENSE-2.0
>  * <p/>
>  * Unless required by applicable law or agreed to in writing, software
>  * distributed under the License is distributed on an "AS IS" BASIS,
>  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or  
> implied.
>  * See the License for the specific language governing permissions and
>  * limitations under the License.
>  */
>
> /**
>  * The BoostingTermQuery is very similar to the {@link
> org.apache.lucene.search.spans.SpanTermQuery} except
>  * that it factors in the value of the payload located at each of the
> positions where the
>  * {@link org.apache.lucene.index.Term} occurs.
>  * <p>
>  * In order to take advantage of this, you must override {@link
> org.apache.lucene.search.Similarity#scorePayload(byte[],int,int)}
>  * which returns 1 by default.
>  * <p>
>  * Payload scores are averaged across term occurrences in the  
> document.
>  *
>  * <p><font color="#FF0000">
>  * WARNING: The status of the <b>Payloads</b> feature is experimental.
>  * The APIs introduced here might change in the future and will not be
>  * supported anymore in such a case.</font>
>  *
>  * @see org.apache.lucene.search.Similarity#scorePayload(byte[],  
> int, int)
>  */
> public class BoostingTermQuery extends TermQuery{
>     Term term;
>     Similarity similarity;
>
>   public BoostingTermQuery(Term term) {
>     super(term);
>     this.term = term;
>
>   }
>
>
>   protected Weight createWeight(Searcher searcher) throws  
> IOException {
>     this.similarity = getSimilarity(searcher);
>     return new BoostingTermWeight(this, searcher);
>   }
>
>   protected class BoostingTermWeight extends TermWeight implements  
> Weight {
>
>
>     public BoostingTermWeight(BoostingTermQuery query, Searcher  
> searcher)
> throws IOException {
>       super(searcher);
>     }
>
>
>
>
>     public Scorer scorer(IndexReader reader) throws IOException {
>       return new BoostingTermScorer(reader.termDocs(term),
> reader.termPositions(term), this, similarity,
>              reader.norms(term.field()));
>     }
>
>     class BoostingTermScorer extends TermScorer {
>
>       //TODO: is this the best way to allocate this?
>       byte[] payload = new byte[256];
>       private TermPositions positions;
>       protected float payloadScore;
>       private int payloadsSeen;
>
>       public BoostingTermScorer(TermDocs termDocs, TermPositions
> termPositions, Weight weight,
>                                 Similarity similarity, byte[]  
> norms) throws
> IOException {
>           super(weight, termDocs, similarity, norms);
>           positions = termPositions;
>
>       }
>
>       /**
>        * Go to the next document
>        *
>        */
>       public boolean next() throws IOException {
>
>         boolean result = super.next();
>         //set the payload.  super.next() properly increments the term
> positions
>         if (result) {
>           if (positions.skipTo(super.doc())) {
>               positions.nextPosition();
>               processPayload(similarity);
>           }
>         }
>
>         return result;
>       }
>
>       public boolean skipTo(int target) throws IOException {
>         boolean result = super.skipTo(target);
>
>         if (result) {
>             if (positions.skipTo(target)) {
>                 positions.nextPosition();
>               processPayload(similarity);
>           }
>         }
>
>         return result;
>       }
>
> //      protected boolean setFreqCurrentDoc() throws IOException {
> //        if (!more) {
> //          return false;
> //        }
> //        doc = spans.doc();
> //        freq = 0.0f;
> //        payloadScore = 0;
> //        payloadsSeen = 0;
> //        Similarity similarity1 = getSimilarity();
> //        while (more && doc == spans.doc()) {
> //          int matchLength = spans.end() - spans.start();
> //
> //          freq += similarity1.sloppyFreq(matchLength);
> //          processPayload(similarity1);
> //
> //          more = spans.next();//this moves positions to the next  
> match in
> this document
> //        }
> //        return more || (freq != 0);
> //      }
>
>
>       protected void processPayload(Similarity similarity) throws
> IOException {
>         if (positions.isPayloadAvailable()) {
>           payload = positions.getPayload(payload, 0);
>           payloadScore += similarity.scorePayload(payload, 0,
> positions.getPayloadLength());
>           payloadsSeen++;
>
>         } else {
>           //zero out the payload?
>         }
>
>       }
>
>       public float score() {
>
>         return super.score() * (payloadsSeen > 0 ? (payloadScore /
> payloadsSeen) : 1);
>       }
>
>
>       public Explanation explain(final int doc) throws IOException {
>         Explanation result = new Explanation();
>         Explanation nonPayloadExpl = super.explain(doc);
>         result.addDetail(nonPayloadExpl);
>         //QUESTION: Is there a wau to avoid this skipTo call?  We  
> need to
> know whether to load the payload or not
>
>         Explanation payloadBoost = new Explanation();
>         result.addDetail(payloadBoost);
> /*
>         if (skipTo(doc) == true) {
>           processPayload();
>         }
> */
>
>         float avgPayloadScore =  (payloadsSeen > 0 ? (payloadScore /
> payloadsSeen) : 1);
>         payloadBoost.setValue(avgPayloadScore);
>         //GSI: I suppose we could toString the payload, but I don't  
> think
> that would be a good idea
>         payloadBoost.setDescription("scorePayload(...)");
>         result.setValue(nonPayloadExpl.getValue() * avgPayloadScore);
>         result.setDescription("btq, product of:");
>         return result;
>       }
>     }
>
>   }
>
>
>   public boolean equals(Object o) {
>     if (!(o instanceof BoostingTermQuery))
>       return false;
>     BoostingTermQuery other = (BoostingTermQuery) o;
>     return (this.getBoost() == other.getBoost())
>             && this.term.equals(other.term);
>   }
> }
>
>
> Diffs for TermQuery, TermScorer:
>
> Index: src/java/org/apache/lucene/search/TermQuery.java
> ===================================================================
> --- src/java/org/apache/lucene/search/TermQuery.java    (revision  
> 581018)
> +++ src/java/org/apache/lucene/search/TermQuery.java    (working copy)
> @@ -31,7 +31,7 @@
>  public class TermQuery extends Query {
>    private Term term;
>
> -  private class TermWeight implements Weight {
> +  public class TermWeight implements Weight {
>      private Similarity similarity;
>      private float value;
>      private float idf;
> Index: src/java/org/apache/lucene/search/TermScorer.java
> ===================================================================
> --- src/java/org/apache/lucene/search/TermScorer.java    (revision  
> 581018)
> +++ src/java/org/apache/lucene/search/TermScorer.java    (working  
> copy)
> @@ -23,7 +23,7 @@
>
>  /** Expert: A <code>Scorer</code> for documents matching a
> <code>Term</code>.
>   */
> -final class TermScorer extends Scorer {
> +public class TermScorer extends Scorer {
>    private Weight weight;
>    private TermDocs termDocs;
>    private byte[] norms;
> @@ -44,7 +44,7 @@
>     * @param similarity The </code>Similarity</code> implementation  
> to be
> used for score computations.
>     * @param norms The field norms of the document fields for the
> <code>Term</code>.
>     */
> -  TermScorer(Weight weight, TermDocs td, Similarity similarity,
> +  public TermScorer(Weight weight, TermDocs td, Similarity  
> similarity,
>               byte[] norms) {
>      super(similarity);
>      this.weight = weight;
>
>
> Peter

--------------------------
Grant Ingersoll
http://lucene.grantingersoll.com

Lucene Helpful Hints:
http://wiki.apache.org/lucene-java/BasicsOfPerformance
http://wiki.apache.org/lucene-java/LuceneFAQ



---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org

Re: BoostingTermQuery performance

Posted by Karl Wettin <ka...@gmail.com>.

3 okt 2007 kl. 00.44 skrev Peter Keegan:

>
> TermQuery:    200 qps
> BoostingTermQuery (extends SpanQuery): 97 qps
> BoostingTermQuery (extends TermQuery): 130 qps
>
> Here is a version of BoostingTermQuery that extends TermQuery. I  
> had to
> modify TermQuery and TermScorer to make them public. A code review  
> would be
> in order, and I would appreciate your comments on this suggestion.

This is nice, you should open a ticket in the Jira and post it in there!


-- 
karl

---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org