You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by yo...@apache.org on 2007/11/23 17:58:55 UTC
svn commit: r597702 - in /lucene/java/trunk: CHANGES.txt
src/java/org/apache/lucene/search/BooleanScorer2.java
src/java/org/apache/lucene/search/ConjunctionScorer.java
src/test/org/apache/lucene/search/TestScorerPerf.java
Author: yonik
Date: Fri Nov 23 08:58:54 2007
New Revision: 597702
URL: http://svn.apache.org/viewvc?rev=597702&view=rev
Log:
LUCENE-693: Speed up nested conjunctions
Modified:
lucene/java/trunk/CHANGES.txt
lucene/java/trunk/src/java/org/apache/lucene/search/BooleanScorer2.java
lucene/java/trunk/src/java/org/apache/lucene/search/ConjunctionScorer.java
lucene/java/trunk/src/test/org/apache/lucene/search/TestScorerPerf.java
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=597702&r1=597701&r2=597702&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Fri Nov 23 08:58:54 2007
@@ -259,6 +259,11 @@
raw bytes for each contiguous range of non-deleted documents.
(Robert Engels via Mike McCandless)
+13. LUCENE-693: Speed up nested conjunctions (~2x) that match many
+ documents, and a slight performance increase for top level
+ conjunctions. (yonik)
+
+
Documentation
1. LUCENE-1051: Generate separate javadocs for core, demo and contrib
Modified: lucene/java/trunk/src/java/org/apache/lucene/search/BooleanScorer2.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/BooleanScorer2.java?rev=597702&r1=597701&r2=597702&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/BooleanScorer2.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/BooleanScorer2.java Fri Nov 23 08:58:54 2007
@@ -139,7 +139,7 @@
* When "sum" is used in a name it means score value summing
* over the matching scorers
*/
- private void initCountingSumScorer() {
+ private void initCountingSumScorer() throws IOException {
coordinator.init();
countingSumScorer = makeCountingSumScorer();
}
@@ -192,10 +192,10 @@
private static Similarity defaultSimilarity = new DefaultSimilarity();
- private Scorer countingConjunctionSumScorer(List requiredScorers) {
+ private Scorer countingConjunctionSumScorer(List requiredScorers) throws IOException {
// each scorer from the list counted as a single matcher
final int requiredNrMatchers = requiredScorers.size();
- ConjunctionScorer cs = new ConjunctionScorer(defaultSimilarity) {
+ return new ConjunctionScorer(defaultSimilarity, requiredScorers) {
private int lastScoredDoc = -1;
public float score() throws IOException {
@@ -210,34 +210,26 @@
return super.score();
}
};
- Iterator rsi = requiredScorers.iterator();
- while (rsi.hasNext()) {
- cs.add((Scorer) rsi.next());
- }
- return cs;
}
- private Scorer dualConjunctionSumScorer(Scorer req1, Scorer req2) { // non counting.
- ConjunctionScorer cs = new ConjunctionScorer(defaultSimilarity);
+ private Scorer dualConjunctionSumScorer(Scorer req1, Scorer req2) throws IOException { // non counting.
+ return new ConjunctionScorer(defaultSimilarity, new Scorer[]{req1, req2});
// All scorers match, so defaultSimilarity always has 1 as
// the coordination factor.
// Therefore the sum of the scores of two scorers
// is used as score.
- cs.add(req1);
- cs.add(req2);
- return cs;
}
/** Returns the scorer to be used for match counting and score summing.
* Uses requiredScorers, optionalScorers and prohibitedScorers.
*/
- private Scorer makeCountingSumScorer() { // each scorer counted as a single matcher
+ private Scorer makeCountingSumScorer() throws IOException { // each scorer counted as a single matcher
return (requiredScorers.size() == 0)
? makeCountingSumScorerNoReq()
: makeCountingSumScorerSomeReq();
}
- private Scorer makeCountingSumScorerNoReq() { // No required scorers
+ private Scorer makeCountingSumScorerNoReq() throws IOException { // No required scorers
if (optionalScorers.size() == 0) {
return new NonMatchingScorer(); // no clauses or only prohibited clauses
} else { // No required scorers. At least one optional scorer.
@@ -258,7 +250,7 @@
}
}
- private Scorer makeCountingSumScorerSomeReq() { // At least one required scorer.
+ private Scorer makeCountingSumScorerSomeReq() throws IOException { // At least one required scorer.
if (optionalScorers.size() < minNrShouldMatch) {
return new NonMatchingScorer(); // fewer optional clauses than minimum that should match
} else if (optionalScorers.size() == minNrShouldMatch) { // all optional scorers also required.
Modified: lucene/java/trunk/src/java/org/apache/lucene/search/ConjunctionScorer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/ConjunctionScorer.java?rev=597702&r1=597701&r2=597702&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/ConjunctionScorer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/ConjunctionScorer.java Fri Nov 23 08:58:54 2007
@@ -18,118 +18,105 @@
*/
import java.io.IOException;
+import java.util.Collection;
import java.util.Arrays;
import java.util.Comparator;
/** Scorer for conjunctions, sets of queries, all of which are required. */
class ConjunctionScorer extends Scorer {
- private Scorer[] scorers = new Scorer[2];
- private int length = 0;
- private int first = 0;
- private int last = -1;
- private boolean firstTime = true;
- private boolean more = true;
- private float coord;
+ private final Scorer[] scorers;
- public ConjunctionScorer(Similarity similarity) {
- super(similarity);
+ private boolean firstTime=true;
+ private boolean more;
+ private final float coord;
+ private int lastDoc=-1;
+
+ public ConjunctionScorer(Similarity similarity, Collection scorers) throws IOException {
+ this(similarity, (Scorer[])scorers.toArray(new Scorer[scorers.size()]));
}
- final void add(Scorer scorer) {
- if (length >= scorers.length) {
- // grow the array
- Scorer[] temps = new Scorer[scorers.length * 2];
- System.arraycopy(scorers, 0, temps, 0, length);
- scorers = temps;
- }
- last += 1;
- length += 1;
- scorers[last] = scorer;
+ public ConjunctionScorer(Similarity similarity, Scorer[] scorers) throws IOException {
+ super(similarity);
+ this.scorers = scorers;
+ coord = getSimilarity().coord(this.scorers.length, this.scorers.length);
}
- public int doc() { return scorers[first].doc(); }
+ public int doc() { return lastDoc; }
public boolean next() throws IOException {
- if (firstTime) {
- init(true);
- } else if (more) {
- more = scorers[last].next(); // trigger further scanning
- }
+ if (firstTime)
+ return init(0);
+ else if (more)
+ more = scorers[(scorers.length-1)].next();
return doNext();
}
-
+
private boolean doNext() throws IOException {
- while (more && scorers[first].doc() < scorers[last].doc()) { // find doc w/ all clauses
- more = scorers[first].skipTo(scorers[last].doc()); // skip first upto last
- last = first; // move first to last
- first = (first == length-1) ? 0 : first+1;
+ int first=0;
+ Scorer lastScorer = scorers[scorers.length-1];
+ Scorer firstScorer;
+ while (more && (firstScorer=scorers[first]).doc() < (lastDoc=lastScorer.doc())) {
+ more = firstScorer.skipTo(lastDoc);
+ lastScorer = firstScorer;
+ first = (first == (scorers.length-1)) ? 0 : first+1;
}
- return more; // found a doc with all clauses
+ return more;
}
public boolean skipTo(int target) throws IOException {
- if(firstTime) {
- init(false);
- }
-
- for (int i = 0, pos = first; i < length; i++) {
- if (!more) break;
- more = scorers[pos].skipTo(target);
- pos = (pos == length-1) ? 0 : pos+1;
- }
-
- if (more)
- sortScorers(); // re-sort scorers
-
+ if (firstTime)
+ return init(target);
+ else if (more)
+ more = scorers[(scorers.length-1)].skipTo(target);
return doNext();
}
- public float score() throws IOException {
- float sum = 0.0f;
- for (int i = 0; i < length; i++) {
- sum += scorers[i].score();
- }
- return sum * coord;
- }
-
- private void init(boolean initScorers) throws IOException {
- // compute coord factor
- coord = getSimilarity().coord(length, length);
-
- more = length > 0;
-
- if(initScorers){
- // move each scorer to its first entry
- for (int i = 0, pos = first; i < length; i++) {
- if (!more) break;
- more = scorers[pos].next();
- pos = (pos == length-1) ? 0 : pos+1;
- }
- // initial sort of simulated list
- if (more)
- sortScorers();
+ // Note... most of this could be done in the constructor
+ // thus skipping a check for firstTime per call to next() and skipTo()
+ private boolean init(int target) throws IOException {
+ firstTime=false;
+ more = scorers.length>1;
+ for (int i=0; i<scorers.length; i++) {
+ more = target==0 ? scorers[i].next() : scorers[i].skipTo(target);
+ if (!more)
+ return false;
}
- firstTime = false;
- }
+ // Sort the array the first time...
+ // We don't need to sort the array in any future calls because we know
+ // it will already start off sorted (all scorers on same doc).
- private void sortScorers() {
- // squeeze the array down for the sort
- if (length != scorers.length) {
- Scorer[] temps = new Scorer[length];
- System.arraycopy(scorers, 0, temps, 0, length);
- scorers = temps;
- }
-
// note that this comparator is not consistent with equals!
Arrays.sort(scorers, new Comparator() { // sort the array
public int compare(Object o1, Object o2) {
return ((Scorer)o1).doc() - ((Scorer)o2).doc();
}
});
-
- first = 0;
- last = length - 1;
+
+ doNext();
+
+ // If first-time skip distance is any predictor of
+ // scorer sparseness, then we should always try to skip first on
+ // those scorers.
+ // Keep last scorer in it's last place (it will be the first
+ // to be skipped on), but reverse all of the others so that
+ // they will be skipped on in order of original high skip.
+ int end=(scorers.length-1)-1;
+ for (int i=0; i<(end>>1); i++) {
+ Scorer tmp = scorers[i];
+ scorers[i] = scorers[end-i];
+ scorers[end-i] = tmp;
+ }
+
+ return more;
+ }
+
+ public float score() throws IOException {
+ float sum = 0.0f;
+ for (int i = 0; i < scorers.length; i++) {
+ sum += scorers[i].score();
+ }
+ return sum * coord;
}
public Explanation explain(int doc) {
Modified: lucene/java/trunk/src/test/org/apache/lucene/search/TestScorerPerf.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/TestScorerPerf.java?rev=597702&r1=597701&r2=597702&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/TestScorerPerf.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/TestScorerPerf.java Fri Nov 23 08:58:54 2007
@@ -42,6 +42,7 @@
boolean validate = true; // set to false when doing performance testing
BitSet[] sets;
+ Term[] terms;
IndexSearcher s;
public void createDummySearcher() throws Exception {
@@ -55,22 +56,25 @@
public void createRandomTerms(int nDocs, int nTerms, double power, Directory dir) throws Exception {
int[] freq = new int[nTerms];
+ terms = new Term[nTerms];
for (int i=0; i<nTerms; i++) {
int f = (nTerms+1)-i; // make first terms less frequent
freq[i] = (int)Math.ceil(Math.pow(f,power));
+ terms[i] = new Term("f",Character.toString((char)('A'+i)));
}
IndexWriter iw = new IndexWriter(dir,new WhitespaceAnalyzer(), true);
- iw.setMaxBufferedDocs(123);
for (int i=0; i<nDocs; i++) {
Document d = new Document();
for (int j=0; j<nTerms; j++) {
if (r.nextInt(freq[j]) == 0) {
- d.add(new Field("f", Character.toString((char)j), Field.Store.NO, Field.Index.UN_TOKENIZED));
+ d.add(new Field("f", terms[j].text(), Field.Store.NO, Field.Index.UN_TOKENIZED));
+ //System.out.println(d);
}
}
iw.addDocument(d);
}
+ iw.optimize();
iw.close();
}
@@ -168,6 +172,7 @@
public int doNestedConjunctions(int iter, int maxOuterClauses, int maxClauses) throws IOException {
int ret=0;
+ long nMatches=0;
for (int i=0; i<iter; i++) {
int oClauses = r.nextInt(maxOuterClauses-1)+2;
@@ -185,15 +190,15 @@
oq.add(bq, BooleanClause.Occur.MUST);
} // outer
-
CountingHitCollector hc = validate ? new MatchingHitCollector(result)
: new CountingHitCollector();
s.search(oq, hc);
+ nMatches += hc.getCount();
ret += hc.getSum();
if (validate) assertEquals(result.cardinality(), hc.getCount());
// System.out.println(hc.getCount());
}
-
+ System.out.println("Average number of matches="+(nMatches/iter));
return ret;
}
@@ -205,22 +210,28 @@
) throws IOException {
int ret=0;
+ long nMatches=0;
for (int i=0; i<iter; i++) {
int nClauses = r.nextInt(maxClauses-1)+2; // min 2 clauses
BooleanQuery bq = new BooleanQuery();
- BitSet terms = new BitSet(termsInIndex);
+ BitSet termflag = new BitSet(termsInIndex);
for (int j=0; j<nClauses; j++) {
int tnum;
// don't pick same clause twice
- do {tnum = r.nextInt(termsInIndex);} while (terms.get(tnum));
- Query tq = new TermQuery(new Term("f",Character.toString((char)tnum)));
+ tnum = r.nextInt(termsInIndex);
+ if (termflag.get(tnum)) tnum=termflag.nextClearBit(tnum);
+ if (tnum<0 || tnum>=termsInIndex) tnum=termflag.nextClearBit(0);
+ termflag.set(tnum);
+ Query tq = new TermQuery(terms[tnum]);
bq.add(tq, BooleanClause.Occur.MUST);
}
CountingHitCollector hc = new CountingHitCollector();
s.search(bq, hc);
+ nMatches += hc.getCount();
ret += hc.getSum();
}
+ System.out.println("Average number of matches="+(nMatches/iter));
return ret;
}
@@ -233,7 +244,7 @@
int iter
) throws IOException {
int ret=0;
-
+ long nMatches=0;
for (int i=0; i<iter; i++) {
int oClauses = r.nextInt(maxOuterClauses-1)+2;
BooleanQuery oq = new BooleanQuery();
@@ -241,12 +252,15 @@
int nClauses = r.nextInt(maxClauses-1)+2; // min 2 clauses
BooleanQuery bq = new BooleanQuery();
- BitSet terms = new BitSet(termsInIndex);
+ BitSet termflag = new BitSet(termsInIndex);
for (int j=0; j<nClauses; j++) {
int tnum;
// don't pick same clause twice
- do {tnum = r.nextInt(termsInIndex);} while (terms.get(tnum));
- Query tq = new TermQuery(new Term("f",Character.toString((char)tnum)));
+ tnum = r.nextInt(termsInIndex);
+ if (termflag.get(tnum)) tnum=termflag.nextClearBit(tnum);
+ if (tnum<0 || tnum>=25) tnum=termflag.nextClearBit(0);
+ termflag.set(tnum);
+ Query tq = new TermQuery(terms[tnum]);
bq.add(tq, BooleanClause.Occur.MUST);
} // inner
@@ -256,9 +270,10 @@
CountingHitCollector hc = new CountingHitCollector();
s.search(oq, hc);
+ nMatches += hc.getCount();
ret += hc.getSum();
}
-
+ System.out.println("Average number of matches="+(nMatches/iter));
return ret;
}
@@ -275,7 +290,7 @@
PhraseQuery q = new PhraseQuery();
for (int j=0; j<nClauses; j++) {
int tnum = r.nextInt(termsInIndex);
- q.add(new Term("f",Character.toString((char)tnum)), j);
+ q.add(new Term("f",Character.toString((char)(tnum+'A'))), j);
}
q.setSlop(termsInIndex); // this could be random too
@@ -299,7 +314,8 @@
}
/***
- int bigIter=6;
+ int bigIter=10;
+
public void testConjunctionPerf() throws Exception {
createDummySearcher();
validate=false;
@@ -326,16 +342,17 @@
s.close();
}
+
public void testConjunctionTerms() throws Exception {
validate=false;
RAMDirectory dir = new RAMDirectory();
System.out.println("Creating index");
- createRandomTerms(100000,25,2, dir);
+ createRandomTerms(100000,25,.5, dir);
s = new IndexSearcher(dir);
System.out.println("Starting performance test");
for (int i=0; i<bigIter; i++) {
long start = System.currentTimeMillis();
- doTermConjunctions(s,25,5,10000);
+ doTermConjunctions(s,25,5,1000);
long end = System.currentTimeMillis();
System.out.println("milliseconds="+(end-start));
}
@@ -346,12 +363,12 @@
validate=false;
RAMDirectory dir = new RAMDirectory();
System.out.println("Creating index");
- createRandomTerms(100000,25,2, dir);
+ createRandomTerms(100000,25,.2, dir);
s = new IndexSearcher(dir);
System.out.println("Starting performance test");
for (int i=0; i<bigIter; i++) {
long start = System.currentTimeMillis();
- doNestedTermConjunctions(s,25,5,5,1000);
+ doNestedTermConjunctions(s,25,3,3,200);
long end = System.currentTimeMillis();
System.out.println("milliseconds="+(end-start));
}
@@ -373,8 +390,8 @@
System.out.println("milliseconds="+(end-start));
}
s.close();
-
}
***/
+
}