You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-user@lucene.apache.org by Peter Keegan <pe...@gmail.com> on 2007/10/03 00:44:46 UTC
BoostingTermQuery performance
I have been experimenting with payloads and BoostingTermQuery, which I think
are excellent additions to Lucene core. Currently, BoostingTermQuery extends
SpanQuery. I would suggest changing this class to extend TermQuery and
refactor the current version to something like 'BoostingSpanQuery'.
The reason is rooted in performance. In my testing, I compared query
throughput using TermQuery against 2 versions of BoostingTermQuery - the
current one that extends SpanQuery and one that extends TermQuery (which
I've included, below). Here are the results (qps = queries per second):
TermQuery: 200 qps
BoostingTermQuery (extends SpanQuery): 97 qps
BoostingTermQuery (extends TermQuery): 130 qps
Here is a version of BoostingTermQuery that extends TermQuery. I had to
modify TermQuery and TermScorer to make them public. A code review would be
in order, and I would appreciate your comments on this suggestion.
Peter
-----------------------------------------
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.search.*;
import java.io.IOException;
/**
* Copyright 2004 The Apache Software Foundation
* <p/>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p/>
* http://www.apache.org/licenses/LICENSE-2.0
* <p/>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* The BoostingTermQuery is very similar to the {@link
org.apache.lucene.search.spans.SpanTermQuery} except
* that it factors in the value of the payload located at each of the
positions where the
* {@link org.apache.lucene.index.Term} occurs.
* <p>
* In order to take advantage of this, you must override {@link
org.apache.lucene.search.Similarity#scorePayload(byte[],int,int)}
* which returns 1 by default.
* <p>
* Payload scores are averaged across term occurrences in the document.
*
* <p><font color="#FF0000">
* WARNING: The status of the <b>Payloads</b> feature is experimental.
* The APIs introduced here might change in the future and will not be
* supported anymore in such a case.</font>
*
* @see org.apache.lucene.search.Similarity#scorePayload(byte[], int, int)
*/
public class BoostingTermQuery extends TermQuery{
Term term;
Similarity similarity;
public BoostingTermQuery(Term term) {
super(term);
this.term = term;
}
protected Weight createWeight(Searcher searcher) throws IOException {
this.similarity = getSimilarity(searcher);
return new BoostingTermWeight(this, searcher);
}
protected class BoostingTermWeight extends TermWeight implements Weight {
public BoostingTermWeight(BoostingTermQuery query, Searcher searcher)
throws IOException {
super(searcher);
}
public Scorer scorer(IndexReader reader) throws IOException {
return new BoostingTermScorer(reader.termDocs(term),
reader.termPositions(term), this, similarity,
reader.norms(term.field()));
}
class BoostingTermScorer extends TermScorer {
//TODO: is this the best way to allocate this?
byte[] payload = new byte[256];
private TermPositions positions;
protected float payloadScore;
private int payloadsSeen;
public BoostingTermScorer(TermDocs termDocs, TermPositions
termPositions, Weight weight,
Similarity similarity, byte[] norms) throws
IOException {
super(weight, termDocs, similarity, norms);
positions = termPositions;
}
/**
* Go to the next document
*
*/
public boolean next() throws IOException {
boolean result = super.next();
//set the payload. super.next() properly increments the term
positions
if (result) {
if (positions.skipTo(super.doc())) {
positions.nextPosition();
processPayload(similarity);
}
}
return result;
}
public boolean skipTo(int target) throws IOException {
boolean result = super.skipTo(target);
if (result) {
if (positions.skipTo(target)) {
positions.nextPosition();
processPayload(similarity);
}
}
return result;
}
// protected boolean setFreqCurrentDoc() throws IOException {
// if (!more) {
// return false;
// }
// doc = spans.doc();
// freq = 0.0f;
// payloadScore = 0;
// payloadsSeen = 0;
// Similarity similarity1 = getSimilarity();
// while (more && doc == spans.doc()) {
// int matchLength = spans.end() - spans.start();
//
// freq += similarity1.sloppyFreq(matchLength);
// processPayload(similarity1);
//
// more = spans.next();//this moves positions to the next match in
this document
// }
// return more || (freq != 0);
// }
protected void processPayload(Similarity similarity) throws
IOException {
if (positions.isPayloadAvailable()) {
payload = positions.getPayload(payload, 0);
payloadScore += similarity.scorePayload(payload, 0,
positions.getPayloadLength());
payloadsSeen++;
} else {
//zero out the payload?
}
}
public float score() {
return super.score() * (payloadsSeen > 0 ? (payloadScore /
payloadsSeen) : 1);
}
public Explanation explain(final int doc) throws IOException {
Explanation result = new Explanation();
Explanation nonPayloadExpl = super.explain(doc);
result.addDetail(nonPayloadExpl);
//QUESTION: Is there a wau to avoid this skipTo call? We need to
know whether to load the payload or not
Explanation payloadBoost = new Explanation();
result.addDetail(payloadBoost);
/*
if (skipTo(doc) == true) {
processPayload();
}
*/
float avgPayloadScore = (payloadsSeen > 0 ? (payloadScore /
payloadsSeen) : 1);
payloadBoost.setValue(avgPayloadScore);
//GSI: I suppose we could toString the payload, but I don't think
that would be a good idea
payloadBoost.setDescription("scorePayload(...)");
result.setValue(nonPayloadExpl.getValue() * avgPayloadScore);
result.setDescription("btq, product of:");
return result;
}
}
}
public boolean equals(Object o) {
if (!(o instanceof BoostingTermQuery))
return false;
BoostingTermQuery other = (BoostingTermQuery) o;
return (this.getBoost() == other.getBoost())
&& this.term.equals(other.term);
}
}
Diffs for TermQuery, TermScorer:
Index: src/java/org/apache/lucene/search/TermQuery.java
===================================================================
--- src/java/org/apache/lucene/search/TermQuery.java (revision 581018)
+++ src/java/org/apache/lucene/search/TermQuery.java (working copy)
@@ -31,7 +31,7 @@
public class TermQuery extends Query {
private Term term;
- private class TermWeight implements Weight {
+ public class TermWeight implements Weight {
private Similarity similarity;
private float value;
private float idf;
Index: src/java/org/apache/lucene/search/TermScorer.java
===================================================================
--- src/java/org/apache/lucene/search/TermScorer.java (revision 581018)
+++ src/java/org/apache/lucene/search/TermScorer.java (working copy)
@@ -23,7 +23,7 @@
/** Expert: A <code>Scorer</code> for documents matching a
<code>Term</code>.
*/
-final class TermScorer extends Scorer {
+public class TermScorer extends Scorer {
private Weight weight;
private TermDocs termDocs;
private byte[] norms;
@@ -44,7 +44,7 @@
* @param similarity The </code>Similarity</code> implementation to be
used for score computations.
* @param norms The field norms of the document fields for the
<code>Term</code>.
*/
- TermScorer(Weight weight, TermDocs td, Similarity similarity,
+ public TermScorer(Weight weight, TermDocs td, Similarity similarity,
byte[] norms) {
super(similarity);
this.weight = weight;
Peter
Re: BoostingTermQuery performance
Posted by Grant Ingersoll <gs...@apache.org>.
https://issues.apache.org/jira/browse/LUCENE-1017
On Oct 2, 2007, at 8:25 PM, Mike Klaas wrote:
> On 2-Oct-07, at 3:44 PM, Peter Keegan wrote:
>
>> I have been experimenting with payloads and BoostingTermQuery,
>> which I think
>> are excellent additions to Lucene core. Currently,
>> BoostingTermQuery extends
>> SpanQuery. I would suggest changing this class to extend TermQuery
>> and
>> refactor the current version to something like 'BoostingSpanQuery'.
>>
>> The reason is rooted in performance. In my testing, I compared query
>> throughput using TermQuery against 2 versions of BoostingTermQuery
>> - the
>> current one that extends SpanQuery and one that extends TermQuery
>> (which
>> I've included, below). Here are the results (qps = queries per
>> second):
>>
>> TermQuery: 200 qps
>> BoostingTermQuery (extends SpanQuery): 97 qps
>> BoostingTermQuery (extends TermQuery): 130 qps
>>
>> Here is a version of BoostingTermQuery that extends TermQuery. I
>> had to
>> modify TermQuery and TermScorer to make them public. A code review
>> would be
>> in order, and I would appreciate your comments on this suggestion.
>
> Awesome! I wasn't aware that there was such a difference. With a
> performance gap that large, it is definitely worth having the option.
>
> Payloads have the potential to be a heavily-used feature in Lucene,
> and performance will be key for that.
>
> -Mike
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org
>
--------------------------
Grant Ingersoll
http://lucene.grantingersoll.com
Lucene Helpful Hints:
http://wiki.apache.org/lucene-java/BasicsOfPerformance
http://wiki.apache.org/lucene-java/LuceneFAQ
---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org
Re: BoostingTermQuery performance
Posted by Mike Klaas <mi...@gmail.com>.
On 2-Oct-07, at 3:44 PM, Peter Keegan wrote:
> I have been experimenting with payloads and BoostingTermQuery,
> which I think
> are excellent additions to Lucene core. Currently,
> BoostingTermQuery extends
> SpanQuery. I would suggest changing this class to extend TermQuery and
> refactor the current version to something like 'BoostingSpanQuery'.
>
> The reason is rooted in performance. In my testing, I compared query
> throughput using TermQuery against 2 versions of BoostingTermQuery
> - the
> current one that extends SpanQuery and one that extends TermQuery
> (which
> I've included, below). Here are the results (qps = queries per
> second):
>
> TermQuery: 200 qps
> BoostingTermQuery (extends SpanQuery): 97 qps
> BoostingTermQuery (extends TermQuery): 130 qps
>
> Here is a version of BoostingTermQuery that extends TermQuery. I
> had to
> modify TermQuery and TermScorer to make them public. A code review
> would be
> in order, and I would appreciate your comments on this suggestion.
Awesome! I wasn't aware that there was such a difference. With a
performance gap that large, it is definitely worth having the option.
Payloads have the potential to be a heavily-used feature in Lucene,
and performance will be key for that.
-Mike
---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org
Re: BoostingTermQuery performance
Posted by Grant Ingersoll <gs...@apache.org>.
Hi Peter,
This sounds interesting. Can you put this in JIRA as a patch,
please? I am slowly but surely working on Span query stuff, so
hopefully I can get to it soon.
Thanks,
Grant
On Oct 2, 2007, at 6:44 PM, Peter Keegan wrote:
> I have been experimenting with payloads and BoostingTermQuery,
> which I think
> are excellent additions to Lucene core. Currently,
> BoostingTermQuery extends
> SpanQuery. I would suggest changing this class to extend TermQuery and
> refactor the current version to something like 'BoostingSpanQuery'.
>
> The reason is rooted in performance. In my testing, I compared query
> throughput using TermQuery against 2 versions of BoostingTermQuery
> - the
> current one that extends SpanQuery and one that extends TermQuery
> (which
> I've included, below). Here are the results (qps = queries per
> second):
>
> TermQuery: 200 qps
> BoostingTermQuery (extends SpanQuery): 97 qps
> BoostingTermQuery (extends TermQuery): 130 qps
>
> Here is a version of BoostingTermQuery that extends TermQuery. I
> had to
> modify TermQuery and TermScorer to make them public. A code review
> would be
> in order, and I would appreciate your comments on this suggestion.
>
> Peter
>
> -----------------------------------------
>
> import org.apache.lucene.index.IndexReader;
> import org.apache.lucene.index.Term;
> import org.apache.lucene.index.TermDocs;
> import org.apache.lucene.index.TermPositions;
> import org.apache.lucene.search.*;
>
>
> import java.io.IOException;
>
> /**
> * Copyright 2004 The Apache Software Foundation
> * <p/>
> * Licensed under the Apache License, Version 2.0 (the "License");
> * you may not use this file except in compliance with the License.
> * You may obtain a copy of the License at
> * <p/>
> * http://www.apache.org/licenses/LICENSE-2.0
> * <p/>
> * Unless required by applicable law or agreed to in writing, software
> * distributed under the License is distributed on an "AS IS" BASIS,
> * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> * See the License for the specific language governing permissions and
> * limitations under the License.
> */
>
> /**
> * The BoostingTermQuery is very similar to the {@link
> org.apache.lucene.search.spans.SpanTermQuery} except
> * that it factors in the value of the payload located at each of the
> positions where the
> * {@link org.apache.lucene.index.Term} occurs.
> * <p>
> * In order to take advantage of this, you must override {@link
> org.apache.lucene.search.Similarity#scorePayload(byte[],int,int)}
> * which returns 1 by default.
> * <p>
> * Payload scores are averaged across term occurrences in the
> document.
> *
> * <p><font color="#FF0000">
> * WARNING: The status of the <b>Payloads</b> feature is experimental.
> * The APIs introduced here might change in the future and will not be
> * supported anymore in such a case.</font>
> *
> * @see org.apache.lucene.search.Similarity#scorePayload(byte[],
> int, int)
> */
> public class BoostingTermQuery extends TermQuery{
> Term term;
> Similarity similarity;
>
> public BoostingTermQuery(Term term) {
> super(term);
> this.term = term;
>
> }
>
>
> protected Weight createWeight(Searcher searcher) throws
> IOException {
> this.similarity = getSimilarity(searcher);
> return new BoostingTermWeight(this, searcher);
> }
>
> protected class BoostingTermWeight extends TermWeight implements
> Weight {
>
>
> public BoostingTermWeight(BoostingTermQuery query, Searcher
> searcher)
> throws IOException {
> super(searcher);
> }
>
>
>
>
> public Scorer scorer(IndexReader reader) throws IOException {
> return new BoostingTermScorer(reader.termDocs(term),
> reader.termPositions(term), this, similarity,
> reader.norms(term.field()));
> }
>
> class BoostingTermScorer extends TermScorer {
>
> //TODO: is this the best way to allocate this?
> byte[] payload = new byte[256];
> private TermPositions positions;
> protected float payloadScore;
> private int payloadsSeen;
>
> public BoostingTermScorer(TermDocs termDocs, TermPositions
> termPositions, Weight weight,
> Similarity similarity, byte[]
> norms) throws
> IOException {
> super(weight, termDocs, similarity, norms);
> positions = termPositions;
>
> }
>
> /**
> * Go to the next document
> *
> */
> public boolean next() throws IOException {
>
> boolean result = super.next();
> //set the payload. super.next() properly increments the term
> positions
> if (result) {
> if (positions.skipTo(super.doc())) {
> positions.nextPosition();
> processPayload(similarity);
> }
> }
>
> return result;
> }
>
> public boolean skipTo(int target) throws IOException {
> boolean result = super.skipTo(target);
>
> if (result) {
> if (positions.skipTo(target)) {
> positions.nextPosition();
> processPayload(similarity);
> }
> }
>
> return result;
> }
>
> // protected boolean setFreqCurrentDoc() throws IOException {
> // if (!more) {
> // return false;
> // }
> // doc = spans.doc();
> // freq = 0.0f;
> // payloadScore = 0;
> // payloadsSeen = 0;
> // Similarity similarity1 = getSimilarity();
> // while (more && doc == spans.doc()) {
> // int matchLength = spans.end() - spans.start();
> //
> // freq += similarity1.sloppyFreq(matchLength);
> // processPayload(similarity1);
> //
> // more = spans.next();//this moves positions to the next
> match in
> this document
> // }
> // return more || (freq != 0);
> // }
>
>
> protected void processPayload(Similarity similarity) throws
> IOException {
> if (positions.isPayloadAvailable()) {
> payload = positions.getPayload(payload, 0);
> payloadScore += similarity.scorePayload(payload, 0,
> positions.getPayloadLength());
> payloadsSeen++;
>
> } else {
> //zero out the payload?
> }
>
> }
>
> public float score() {
>
> return super.score() * (payloadsSeen > 0 ? (payloadScore /
> payloadsSeen) : 1);
> }
>
>
> public Explanation explain(final int doc) throws IOException {
> Explanation result = new Explanation();
> Explanation nonPayloadExpl = super.explain(doc);
> result.addDetail(nonPayloadExpl);
> //QUESTION: Is there a wau to avoid this skipTo call? We
> need to
> know whether to load the payload or not
>
> Explanation payloadBoost = new Explanation();
> result.addDetail(payloadBoost);
> /*
> if (skipTo(doc) == true) {
> processPayload();
> }
> */
>
> float avgPayloadScore = (payloadsSeen > 0 ? (payloadScore /
> payloadsSeen) : 1);
> payloadBoost.setValue(avgPayloadScore);
> //GSI: I suppose we could toString the payload, but I don't
> think
> that would be a good idea
> payloadBoost.setDescription("scorePayload(...)");
> result.setValue(nonPayloadExpl.getValue() * avgPayloadScore);
> result.setDescription("btq, product of:");
> return result;
> }
> }
>
> }
>
>
> public boolean equals(Object o) {
> if (!(o instanceof BoostingTermQuery))
> return false;
> BoostingTermQuery other = (BoostingTermQuery) o;
> return (this.getBoost() == other.getBoost())
> && this.term.equals(other.term);
> }
> }
>
>
> Diffs for TermQuery, TermScorer:
>
> Index: src/java/org/apache/lucene/search/TermQuery.java
> ===================================================================
> --- src/java/org/apache/lucene/search/TermQuery.java (revision
> 581018)
> +++ src/java/org/apache/lucene/search/TermQuery.java (working copy)
> @@ -31,7 +31,7 @@
> public class TermQuery extends Query {
> private Term term;
>
> - private class TermWeight implements Weight {
> + public class TermWeight implements Weight {
> private Similarity similarity;
> private float value;
> private float idf;
> Index: src/java/org/apache/lucene/search/TermScorer.java
> ===================================================================
> --- src/java/org/apache/lucene/search/TermScorer.java (revision
> 581018)
> +++ src/java/org/apache/lucene/search/TermScorer.java (working
> copy)
> @@ -23,7 +23,7 @@
>
> /** Expert: A <code>Scorer</code> for documents matching a
> <code>Term</code>.
> */
> -final class TermScorer extends Scorer {
> +public class TermScorer extends Scorer {
> private Weight weight;
> private TermDocs termDocs;
> private byte[] norms;
> @@ -44,7 +44,7 @@
> * @param similarity The </code>Similarity</code> implementation
> to be
> used for score computations.
> * @param norms The field norms of the document fields for the
> <code>Term</code>.
> */
> - TermScorer(Weight weight, TermDocs td, Similarity similarity,
> + public TermScorer(Weight weight, TermDocs td, Similarity
> similarity,
> byte[] norms) {
> super(similarity);
> this.weight = weight;
>
>
> Peter
--------------------------
Grant Ingersoll
http://lucene.grantingersoll.com
Lucene Helpful Hints:
http://wiki.apache.org/lucene-java/BasicsOfPerformance
http://wiki.apache.org/lucene-java/LuceneFAQ
---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org
Re: BoostingTermQuery performance
Posted by Karl Wettin <ka...@gmail.com>.
3 okt 2007 kl. 00.44 skrev Peter Keegan:
>
> TermQuery: 200 qps
> BoostingTermQuery (extends SpanQuery): 97 qps
> BoostingTermQuery (extends TermQuery): 130 qps
>
> Here is a version of BoostingTermQuery that extends TermQuery. I
> had to
> modify TermQuery and TermScorer to make them public. A code review
> would be
> in order, and I would appreciate your comments on this suggestion.
This is nice, you should open a ticket in the Jira and post it in there!
--
karl
---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org