You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by si...@apache.org on 2014/07/10 18:21:44 UTC
svn commit: r1609493 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/CHANGES.txt lucene/queries/
lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
Author: simonw
Date: Thu Jul 10 16:21:43 2014
New Revision: 1609493
URL: http://svn.apache.org/r1609493
Log:
LUCENE-5795: MoreLikeThisQuery now only collects the top N terms
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_4x/lucene/queries/ (props changed)
lucene/dev/branches/branch_4x/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
lucene/dev/branches/branch_4x/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1609493&r1=1609492&r2=1609493&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Thu Jul 10 16:21:43 2014
@@ -56,6 +56,10 @@ Optimizations
to another analyzer, e.g. per field name: PerFieldAnalyzerWrapper and
Solr's schema support. (Shay Banon, Uwe Schindler, Robert Muir)
+* LUCENE-5795: MoreLikeThisQuery now only collects the top N terms instead
+ of collecting all terms from the like text when building the query.
+ (Alex Ksikes, Simon Willnauer)
+
Bug Fixes
* LUCENE-5796: Fixes the Scorer.getChildren() method for two combinations
Modified: lucene/dev/branches/branch_4x/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java?rev=1609493&r1=1609492&r2=1609493&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java (original)
+++ lucene/dev/branches/branch_4x/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java Thu Jul 10 16:21:43 2014
@@ -617,22 +617,19 @@ public final class MoreLikeThis {
/**
* Create the More like query from a PriorityQueue
*/
- private Query createQuery(PriorityQueue<Object[]> q) {
+ private Query createQuery(PriorityQueue<ScoreTerm> q) {
BooleanQuery query = new BooleanQuery();
- Object cur;
- int qterms = 0;
- float bestScore = 0;
-
- while ((cur = q.pop()) != null) {
- Object[] ar = (Object[]) cur;
- TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0]));
+ ScoreTerm scoreTerm;
+ float bestScore = -1;
+
+ while ((scoreTerm = q.pop()) != null) {
+ TermQuery tq = new TermQuery(new Term(scoreTerm.topField, scoreTerm.word));
if (boost) {
- if (qterms == 0) {
- bestScore = ((Float) ar[2]);
+ if (bestScore == -1) {
+ bestScore = (scoreTerm.score);
}
- float myScore = ((Float) ar[2]);
-
+ float myScore = (scoreTerm.score);
tq.setBoost(boostFactor * myScore / bestScore);
}
@@ -642,13 +639,7 @@ public final class MoreLikeThis {
catch (BooleanQuery.TooManyClauses ignore) {
break;
}
-
- qterms++;
- if (maxQueryTerms > 0 && qterms >= maxQueryTerms) {
- break;
- }
}
-
return query;
}
@@ -657,10 +648,11 @@ public final class MoreLikeThis {
*
* @param words a map of words keyed on the word(String) with Int objects as the values.
*/
- private PriorityQueue<Object[]> createQueue(Map<String, Int> words) throws IOException {
+ private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words) throws IOException {
// have collected all words in doc and their freqs
int numDocs = ir.numDocs();
- FreqQ res = new FreqQ(words.size()); // will order words by score
+ final int limit = Math.min(maxQueryTerms, words.size());
+ FreqQ queue = new FreqQ(limit); // will order words by score
for (String word : words.keySet()) { // for every word
int tf = words.get(word).x; // term freq in the source doc
@@ -692,16 +684,18 @@ public final class MoreLikeThis {
float idf = similarity.idf(docFreq, numDocs);
float score = tf * idf;
- // only really need 1st 3 entries, other ones are for troubleshooting
- res.insertWithOverflow(new Object[]{word, // the word
- topField, // the top field
- score, // overall score
- idf, // idf
- docFreq, // freq in all docs
- tf
- });
+ if (queue.size() < limit) {
+ // there is still space in the queue
+ queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf));
+ } else {
+ ScoreTerm term = queue.top();
+ if (term.score < score) { // update the smallest in the queue in place and update the queue.
+ term.update(word, topField, score, idf, docFreq, tf);
+ queue.updateTop();
+ }
+ }
}
- return res;
+ return queue;
}
/**
@@ -730,7 +724,7 @@ public final class MoreLikeThis {
*
* @param docNum the id of the lucene document from which to find terms
*/
- public PriorityQueue<Object[]> retrieveTerms(int docNum) throws IOException {
+ private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
Map<String, Int> termFreqMap = new HashMap<>();
for (String fieldName : fieldNames) {
final Fields vectors = ir.getTermVectors(docNum);
@@ -873,7 +867,7 @@ public final class MoreLikeThis {
* @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
* @see #retrieveInterestingTerms
*/
- public PriorityQueue<Object[]> retrieveTerms(Reader r, String fieldName) throws IOException {
+ private PriorityQueue<ScoreTerm> retrieveTerms(Reader r, String fieldName) throws IOException {
Map<String, Int> words = new HashMap<>();
addTermFrequencies(r, words, fieldName);
return createQueue(words);
@@ -884,13 +878,12 @@ public final class MoreLikeThis {
*/
public String[] retrieveInterestingTerms(int docNum) throws IOException {
ArrayList<Object> al = new ArrayList<>(maxQueryTerms);
- PriorityQueue<Object[]> pq = retrieveTerms(docNum);
- Object cur;
+ PriorityQueue<ScoreTerm> pq = retrieveTerms(docNum);
+ ScoreTerm scoreTerm;
int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
// we just want to return the top words
- while (((cur = pq.pop()) != null) && lim-- > 0) {
- Object[] ar = (Object[]) cur;
- al.add(ar[0]); // the 1st entry is the interesting word
+ while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
+ al.add(scoreTerm.word); // the 1st entry is the interesting word
}
String[] res = new String[al.size()];
return al.toArray(res);
@@ -908,13 +901,12 @@ public final class MoreLikeThis {
*/
public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
ArrayList<Object> al = new ArrayList<>(maxQueryTerms);
- PriorityQueue<Object[]> pq = retrieveTerms(r, fieldName);
- Object cur;
+ PriorityQueue<ScoreTerm> pq = retrieveTerms(r, fieldName);
+ ScoreTerm scoreTerm;
int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
// we just want to return the top words
- while (((cur = pq.pop()) != null) && lim-- > 0) {
- Object[] ar = (Object[]) cur;
- al.add(ar[0]); // the 1st entry is the interesting word
+ while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
+ al.add(scoreTerm.word); // the 1st entry is the interesting word
}
String[] res = new String[al.size()];
return al.toArray(res);
@@ -923,16 +915,42 @@ public final class MoreLikeThis {
/**
* PriorityQueue that orders words by score.
*/
- private static class FreqQ extends PriorityQueue<Object[]> {
- FreqQ(int s) {
- super(s);
+ private static class FreqQ extends PriorityQueue<ScoreTerm> {
+ FreqQ(int maxSize) {
+ super(maxSize);
}
@Override
- protected boolean lessThan(Object[] aa, Object[] bb) {
- Float fa = (Float) aa[2];
- Float fb = (Float) bb[2];
- return fa > fb;
+ protected boolean lessThan(ScoreTerm a, ScoreTerm b) {
+ return a.score < b.score;
+ }
+ }
+
+ private static class ScoreTerm {
+ // only really need 1st 3 entries, other ones are for troubleshooting
+ String word;
+ String topField;
+ float score;
+ float idf;
+ int docFreq;
+ int tf;
+
+ ScoreTerm(String word, String topField, float score, float idf, int docFreq, int tf) {
+ this.word = word;
+ this.topField = topField;
+ this.score = score;
+ this.idf = idf;
+ this.docFreq = docFreq;
+ this.tf = tf;
+ }
+
+ void update(String word, String topField, float score, float idf, int docFreq, int tf) {
+ this.word = word;
+ this.topField = topField;
+ this.score = score;
+ this.idf = idf;
+ this.docFreq = docFreq;
+ this.tf = tf;
}
}
Modified: lucene/dev/branches/branch_4x/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java?rev=1609493&r1=1609492&r2=1609493&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java (original)
+++ lucene/dev/branches/branch_4x/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java Thu Jul 10 16:21:43 2014
@@ -74,7 +74,15 @@ public class TestMoreLikeThis extends Lu
doc.add(newTextField("text", text, Field.Store.YES));
writer.addDocument(doc);
}
-
+
+ private void addDoc(RandomIndexWriter writer, String[] texts) throws IOException {
+ Document doc = new Document();
+ for (String text : texts) {
+ doc.add(newTextField("text", text, Field.Store.YES));
+ }
+ writer.addDocument(doc);
+ }
+
public void testBoostFactor() throws Throwable {
Map<String,Float> originalValues = getOriginalValues();
@@ -166,5 +174,62 @@ public class TestMoreLikeThis extends Lu
Query query = new MoreLikeThisQuery("this is a test", new String[] { "text" }, new MockAnalyzer(random()), "text");
QueryUtils.check(random(), query, searcher);
}
+
+ public void testTopN() throws Exception {
+ int numDocs = 100;
+ int topN = 25;
+
+ // add series of docs with terms of decreasing df
+ Directory dir = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+ for (int i = 0; i < numDocs; i++) {
+ addDoc(writer, generateStrSeq(0, i + 1));
+ }
+ IndexReader reader = writer.getReader();
+ writer.close();
+
+ // setup MLT query
+ MoreLikeThis mlt = new MoreLikeThis(reader);
+ mlt.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
+ mlt.setMaxQueryTerms(topN);
+ mlt.setMinDocFreq(1);
+ mlt.setMinTermFreq(1);
+ mlt.setMinWordLen(1);
+ mlt.setFieldNames(new String[]{"text"});
+
+ // perform MLT query
+ String likeText = "";
+ for (String text : generateStrSeq(0, numDocs)) {
+ likeText += text + " ";
+ }
+ BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader(likeText));
+
+ // check best terms are topN of highest idf
+ List<BooleanClause> clauses = query.clauses();
+ assertEquals("Expected" + topN + "clauses only!", topN, clauses.size());
+
+ Term[] expectedTerms = new Term[topN];
+ int idx = 0;
+ for (String text : generateStrSeq(numDocs - topN, topN)) {
+ expectedTerms[idx++] = new Term("text", text);
+ }
+ for (BooleanClause clause : clauses) {
+ Term term = ((TermQuery) clause.getQuery()).getTerm();
+ assertTrue(Arrays.asList(expectedTerms).contains(term));
+ }
+
+ // clean up
+ reader.close();
+ dir.close();
+ }
+
+ private String[] generateStrSeq(int from, int size) {
+ String[] generatedStrings = new String[size];
+ for (int i = 0; i < generatedStrings.length; i++) {
+ generatedStrings[i] = String.valueOf(from + i);
+ }
+ return generatedStrings;
+ }
+
// TODO: add tests for the MoreLikeThisQuery
}