You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by gs...@apache.org on 2009/08/06 17:14:18 UTC
svn commit: r801667 - in /lucene/java/trunk: ./
src/java/org/apache/lucene/search/payloads/
src/java/org/apache/lucene/search/spans/
src/test/org/apache/lucene/search/payloads/
Author: gsingers
Date: Thu Aug 6 15:14:18 2009
New Revision: 801667
URL: http://svn.apache.org/viewvc?rev=801667&view=rev
Log:
LUCENE-1341: Added BoostingNearQuery
Added:
lucene/java/trunk/src/java/org/apache/lucene/search/payloads/BoostingNearQuery.java (with props)
lucene/java/trunk/src/test/org/apache/lucene/search/payloads/TestBoostingNearQuery.java (with props)
Modified:
lucene/java/trunk/CHANGES.txt
lucene/java/trunk/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java
lucene/java/trunk/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java
lucene/java/trunk/src/java/org/apache/lucene/search/spans/SpanNearQuery.java
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=801667&r1=801666&r2=801667&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Thu Aug 6 15:14:18 2009
@@ -662,6 +662,9 @@
the javadocs implied. If you have payloads and want to use an ordered
SpanNearQuery that does not need to use the payloads, you can
disable loading them with a new constructor switch. (Mark Miller)
+
+34. LUCENE-1341: Added BoostingNearQuery to enable SpanNearQuery functionality
+ with payloads (Peter Keegan, Grant Ingersoll)
Optimizations
Added: lucene/java/trunk/src/java/org/apache/lucene/search/payloads/BoostingNearQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/payloads/BoostingNearQuery.java?rev=801667&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/payloads/BoostingNearQuery.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/payloads/BoostingNearQuery.java Thu Aug 6 15:14:18 2009
@@ -0,0 +1,175 @@
+package org.apache.lucene.search.payloads;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.QueryWeight;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Searcher;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanWeight;
+import org.apache.lucene.search.spans.SpanScorer;
+import org.apache.lucene.search.spans.Spans;
+import org.apache.lucene.search.spans.NearSpansOrdered;
+import org.apache.lucene.search.spans.NearSpansUnordered;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Collection;
+
+/**
+ * The BoostingNearQuery is very similar to the {@link org.apache.lucene.search.spans.SpanNearQuery} except
+ * that it factors in the value of the payloads located at each of the positions where the
+ * {@link org.apache.lucene.search.spans.TermSpans} occur.
+ * <p>
+ * In order to take advantage of this, you must override {@link org.apache.lucene.search.Similarity#scorePayload(String, byte[],int,int)}
+ * which returns 1 by default.
+ * <p>
+ * The payload scores gathered by the scorer are averaged and the span score is multiplied by that
+ * average; when no payload was gathered the span score is left unchanged.
+ *
+ * @see org.apache.lucene.search.Similarity#scorePayload(String, byte[], int, int)
+ */
+public class BoostingNearQuery extends SpanNearQuery {
+  String fieldName; // field of the first clause; passed to Similarity.scorePayload()
+
+  /**
+   * Builds the near query via the superclass and remembers the field of the first clause.
+   * @param clauses the clauses to match; must not be empty, because the first one supplies the field name
+   * @param slop passed through to the SpanNearQuery constructor
+   * @param inOrder passed through to the SpanNearQuery constructor
+   */
+  public BoostingNearQuery(SpanQuery[] clauses, int slop, boolean inOrder) {
+    super(clauses, slop, inOrder);
+    fieldName = clauses[0].getField(); // all clauses must have same field
+  }
+
+  /** Returns a {@link BoostingSpanWeight} so that matches are scored with the payload boost. */
+  public QueryWeight createQueryWeight(Searcher searcher) throws IOException {
+    return new BoostingSpanWeight(this, searcher);
+  }
+
+  /** A {@link SpanWeight} whose scorers are {@link BoostingSpanScorer}s. */
+  public class BoostingSpanWeight extends SpanWeight {
+    public BoostingSpanWeight(SpanQuery query, Searcher searcher) throws IOException {
+      super(query, searcher);
+    }
+
+    public Scorer scorer(IndexReader reader) throws IOException {
+      return newBoostingScorer(reader);
+    }
+
+    /** The two boolean flags are ignored: the same scorer is built whatever their values. */
+    public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException {
+      return newBoostingScorer(reader);
+    }
+
+    // Single construction site shared by both scorer() overloads.
+    private Scorer newBoostingScorer(IndexReader reader) throws IOException {
+      return new BoostingSpanScorer(query.getSpans(reader), this, similarity,
+          reader.norms(query.getField()));
+    }
+  }
+
+  /** A {@link SpanScorer} that multiplies the span score by the average payload score. */
+  public class BoostingSpanScorer extends SpanScorer {
+    Spans spans;
+    Spans[] subSpans = null; // never assigned or read in this class; getPayloads() uses its own parameter
+    protected float payloadScore; // running sum of scorePayload() results; reset in setFreqCurrentDoc()
+    private int payloadsSeen; // number of payloads added into payloadScore
+    Similarity similarity = getSimilarity();
+
+    protected BoostingSpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms)
+        throws IOException {
+      super(spans, weight, similarity, norms);
+      this.spans = spans;
+    }
+
+    /**
+     * Hands the payloads of every {@link NearSpansOrdered} or {@link NearSpansUnordered} entry that has
+     * them to {@link #processPayloads(Collection)}, then recurses into that entry's sub-spans.
+     * Entries of any other type are skipped.
+     */
+    public void getPayloads(Spans[] subSpans) throws IOException {
+      for (int i = 0; i < subSpans.length; i++) {
+        if (subSpans[i] instanceof NearSpansOrdered) {
+          NearSpansOrdered ordered = (NearSpansOrdered) subSpans[i];
+          if (ordered.isPayloadAvailable()) {
+            processPayloads(ordered.getPayload());
+          }
+          getPayloads(ordered.getSubSpans());
+        } else if (subSpans[i] instanceof NearSpansUnordered) {
+          NearSpansUnordered unordered = (NearSpansUnordered) subSpans[i];
+          if (unordered.isPayloadAvailable()) {
+            processPayloads(unordered.getPayload());
+          }
+          getPayloads(unordered.getSubSpans());
+        }
+      }
+    }
+
+    /**
+     * By default, sums the payloads, but can be overridden to do other things.
+     * @param payLoads The payloads
+     */
+    protected void processPayloads(Collection payLoads) {
+      for (Iterator iterator = payLoads.iterator(); iterator.hasNext();) {
+        byte[] thePayload = (byte[]) iterator.next();
+        ++payloadsSeen;
+        payloadScore += similarity.scorePayload(fieldName, thePayload, 0, thePayload.length);
+      }
+    }
+
+    /** Resets the payload totals, gathers payloads starting from {@link #spans}, then defers to the superclass. */
+    protected boolean setFreqCurrentDoc() throws IOException {
+      payloadScore = 0;
+      payloadsSeen = 0;
+      getPayloads(new Spans[] {spans});
+      return super.setFreqCurrentDoc();
+    }
+
+    // Average of the gathered payload scores, or 1 (no boost) when none were gathered.
+    private float averagePayloadScore() {
+      return payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1;
+    }
+
+    /** Returns the superclass score multiplied by the average payload score. */
+    public float score() throws IOException {
+      return super.score() * averagePayloadScore();
+    }
+
+    /** Explains the score as the product of the superclass explanation and the average payload score. */
+    public Explanation explain(int doc) throws IOException {
+      Explanation nonPayloadExpl = super.explain(doc);
+      // Evaluated after super.explain(doc), as before; presumably that call positions the scorer on doc - TODO confirm
+      float avgPayloadScore = averagePayloadScore();
+      Explanation payloadBoost = new Explanation();
+      payloadBoost.setValue(avgPayloadScore);
+      payloadBoost.setDescription("scorePayload(...)");
+      Explanation result = new Explanation();
+      result.addDetail(nonPayloadExpl);
+      result.addDetail(payloadBoost);
+      result.setValue(nonPayloadExpl.getValue() * avgPayloadScore);
+      result.setDescription("bnq, product of:");
+      return result;
+    }
+  }
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/search/payloads/BoostingNearQuery.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java?rev=801667&r1=801666&r2=801667&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java Thu Aug 6 15:14:18 2009
@@ -46,8 +46,12 @@
* matches twice:
* <pre>t1 t2 .. t3 </pre>
* <pre> t1 .. t2 t3</pre>
+ *
+ *
+ * Expert:
+ * Only public for subclassing. Most implementations should not need this class
*/
-class NearSpansOrdered implements Spans {
+public class NearSpansOrdered implements Spans {
private final int allowedSlop;
private boolean firstTime = true;
private boolean more = false;
@@ -104,6 +108,10 @@
// inherit javadocs
public int end() { return matchEnd; }
+
+ public Spans[] getSubSpans() {
+ return subSpans; // returns the subSpans reference itself; no copy is made here
+ }
// TODO: Remove warning after API has been finalized
// TODO: Would be nice to be able to lazy load payloads
Modified: lucene/java/trunk/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java?rev=801667&r1=801666&r2=801667&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java Thu Aug 6 15:14:18 2009
@@ -27,10 +27,15 @@
import java.util.Set;
import java.util.HashSet;
-class NearSpansUnordered implements Spans {
+/**
+ * Expert:
+ * Only public for subclassing. Most implementations should not need this class
+ */
+public class NearSpansUnordered implements Spans {
private SpanNearQuery query;
private List ordered = new ArrayList(); // spans in query order
+ private Spans[] subSpans;
private int slop; // from query
private SpansCell first; // linked list of spans
@@ -122,13 +127,17 @@
SpanQuery[] clauses = query.getClauses();
queue = new CellQueue(clauses.length);
+ subSpans = new Spans[clauses.length];
for (int i = 0; i < clauses.length; i++) {
SpansCell cell =
new SpansCell(clauses[i].getSpans(reader), i);
ordered.add(cell);
+ subSpans[i] = cell.spans;
}
}
-
+ public Spans[] getSubSpans() {
+ return subSpans; // one entry per query clause, in clause order; the array itself is returned, not a copy
+ }
public boolean next() throws IOException {
if (firstTime) {
initList(true);
Modified: lucene/java/trunk/src/java/org/apache/lucene/search/spans/SpanNearQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/spans/SpanNearQuery.java?rev=801667&r1=801666&r2=801667&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/spans/SpanNearQuery.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/spans/SpanNearQuery.java Thu Aug 6 15:14:18 2009
@@ -38,7 +38,7 @@
private int slop;
private boolean inOrder;
- private String field;
+ protected String field;
private boolean collectPayloads;
/** Construct a SpanNearQuery. Matches spans matching a span from each
Added: lucene/java/trunk/src/test/org/apache/lucene/search/payloads/TestBoostingNearQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/payloads/TestBoostingNearQuery.java?rev=801667&view=auto
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/payloads/TestBoostingNearQuery.java (added)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/payloads/TestBoostingNearQuery.java Thu Aug 6 15:14:18 2009
@@ -0,0 +1,229 @@
+package org.apache.lucene.search.payloads;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Collection;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Payload;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.Searcher;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.English;
+import org.apache.lucene.util.LuceneTestCase;
+
+
+/**
+ * Exercises {@link BoostingNearQuery} against an index of {@link English#intToEnglish(int)} strings for
+ * 0 to 999 whose tokens carry payloads alternating between 2 and 4, expecting every hit to score 3.
+ */
+public class TestBoostingNearQuery extends LuceneTestCase {
+  private IndexSearcher searcher;
+  private BoostingSimilarity similarity = new BoostingSimilarity();
+  private byte[] payload2 = new byte[]{2};
+  private byte[] payload4 = new byte[]{4};
+
+  public TestBoostingNearQuery(String s) {
+    super(s);
+  }
+
+  /** Tokenizes with {@link LowerCaseTokenizer} and wraps the result in a {@link PayloadFilter}. */
+  private class PayloadAnalyzer extends Analyzer {
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      TokenStream result = new LowerCaseTokenizer(reader);
+      result = new PayloadFilter(result, fieldName);
+      return result;
+    }
+  }
+
+  /** Sets the payload of the tokens passing through it to 2, 4, 2, 4, ... in that order. */
+  private class PayloadFilter extends TokenFilter {
+    String fieldName;
+    int numSeen = 0;
+    protected PayloadAttribute payAtt;
+
+    public PayloadFilter(TokenStream input, String fieldName) {
+      super(input);
+      this.fieldName = fieldName;
+      payAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+    }
+
+    public boolean incrementToken() throws IOException {
+      if (!input.incrementToken()) {
+        return false;
+      }
+      // tokens number 0, 2, 4, ... get payload 2; the ones in between get payload 4
+      payAtt.setPayload(new Payload(numSeen % 2 == 0 ? payload2 : payload4));
+      numSeen++;
+      return true;
+    }
+  }
+
+  /** Builds a zero-slop query with one {@link BoostingTermQuery} per whitespace-separated word of the phrase. */
+  private BoostingNearQuery newPhraseQuery (String fieldName, String phrase, boolean inOrder) {
+    String[] words = phrase.split("[\\s]+");
+    SpanQuery clauses[] = new SpanQuery[words.length];
+    for (int i=0;i<clauses.length;i++) {
+      clauses[i] = new BoostingTermQuery(new Term(fieldName, words[i]));
+    }
+    return new BoostingNearQuery(clauses, 0, inOrder);
+  }
+
+  /** Indexes 1000 single-field documents into a RAMDirectory and opens the searcher on it. */
+  protected void setUp() throws Exception {
+    super.setUp();
+    RAMDirectory directory = new RAMDirectory();
+    PayloadAnalyzer analyzer = new PayloadAnalyzer();
+    IndexWriter writer
+      = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+    writer.setSimilarity(similarity);
+    //writer.infoStream = System.out;
+    for (int i = 0; i < 1000; i++) {
+      Document doc = new Document();
+      doc.add(new Field("field", English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED));
+      writer.addDocument(doc);
+    }
+    writer.optimize();
+    writer.close();
+
+    searcher = new IndexSearcher(directory, true);
+    searcher.setSimilarity(similarity);
+  }
+
+  /** Two-word ordered phrases: every hit must score exactly 3. */
+  public void test() throws IOException {
+    BoostingNearQuery query;
+    TopDocs hits;
+
+    query = newPhraseQuery("field", "twenty two", true);
+    // all 10 hits should have score = 3 because adjacent terms have payloads of 2,4
+    // and all the similarity factors are set to 1
+    hits = searcher.search(query, null, 100);
+    assertTrue("hits is null and it shouldn't be", hits != null);
+    assertTrue("should be 10 hits", hits.totalHits == 10);
+    for (int j = 0; j < hits.scoreDocs.length; j++) {
+      ScoreDoc doc = hits.scoreDocs[j];
+      assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);
+    }
+    for (int i=1;i<10;i++) {
+      query = newPhraseQuery("field", English.intToEnglish(i)+" hundred", true);
+      // all should have score = 3 because adjacent terms have payloads of 2,4
+      // and all the similarity factors are set to 1
+      hits = searcher.search(query, null, 100);
+      assertTrue("hits is null and it shouldn't be", hits != null);
+      assertTrue("should be 100 hits", hits.totalHits == 100);
+      for (int j = 0; j < hits.scoreDocs.length; j++) {
+        ScoreDoc doc = hits.scoreDocs[j];
+//        System.out.println("Doc: " + doc.toString());
+//        System.out.println("Explain: " + searcher.explain(query, doc.doc));
+        assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);
+      }
+    }
+  }
+
+  /** A four-word ordered phrase: exactly one hit, scoring 3. */
+  public void testLongerSpan() throws IOException {
+    BoostingNearQuery query = newPhraseQuery("field", "nine hundred ninety nine", true);
+    TopDocs hits = searcher.search(query, null, 100);
+    assertTrue("hits is null and it shouldn't be", hits != null);
+    assertTrue("there should only be one hit", hits.totalHits == 1);
+    // only touch the first hit after the checks above have established that it exists
+    ScoreDoc doc = hits.scoreDocs[0];
+//    System.out.println("Doc: " + doc.toString());
+//    System.out.println("Explain: " + searcher.explain(query, doc.doc));
+    // should have score = 3 because adjacent terms have payloads of 2,4
+    assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);
+  }
+
+  /** Nested ordered and unordered near queries: exactly one hit, scoring 3. */
+  public void testComplexNested() throws IOException {
+    BoostingNearQuery query;
+    TopDocs hits;
+
+    // combine ordered and unordered spans with some nesting to make sure all payloads are counted
+
+    SpanQuery q1 = newPhraseQuery("field", "nine hundred", true);
+    SpanQuery q2 = newPhraseQuery("field", "ninety nine", true);
+    SpanQuery q3 = newPhraseQuery("field", "nine ninety", false);
+    SpanQuery q4 = newPhraseQuery("field", "hundred nine", false);
+    SpanQuery[]clauses = new SpanQuery[] {
+      new BoostingNearQuery(new SpanQuery[] {q1,q2}, 0, true),
+      new BoostingNearQuery(new SpanQuery[] {q3,q4}, 0, false)
+    };
+    query = new BoostingNearQuery(clauses, 0, false);
+    hits = searcher.search(query, null, 100);
+    assertTrue("hits is null and it shouldn't be", hits != null);
+    // should be only 1 hit - doc 999
+    assertTrue("should only be one hit", hits.scoreDocs.length == 1);
+    // the score should be 3 - the average of all the underlying payloads
+    ScoreDoc doc = hits.scoreDocs[0];
+//    System.out.println("Doc: " + doc.toString());
+//    System.out.println("Explain: " + searcher.explain(query, doc.doc));
+    assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);
+  }
+
+  /**
+   * Scores a payload as its first byte and makes the other overridden similarity factors return 1.
+   * Must be static for weight serialization tests.
+   */
+  static class BoostingSimilarity extends DefaultSimilarity {
+
+    public float scorePayload(String fieldName, byte[] payload, int offset, int length) {
+      return payload[0];
+    }
+
+    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+    //Make everything else 1 so we see the effect of the payload
+    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+    public float lengthNorm(String fieldName, int numTerms) {
+      return 1;
+    }
+
+    public float queryNorm(float sumOfSquaredWeights) {
+      return 1;
+    }
+
+    public float sloppyFreq(int distance) {
+      return 1;
+    }
+
+    public float coord(int overlap, int maxOverlap) {
+      return 1;
+    }
+    public float tf(float freq) {
+      return 1;
+    }
+    // idf used for phrase queries
+    public float idf(Collection terms, Searcher searcher) {
+      return 1;
+    }
+  }
+}
Propchange: lucene/java/trunk/src/test/org/apache/lucene/search/payloads/TestBoostingNearQuery.java
------------------------------------------------------------------------------
svn:eol-style = native