You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by yo...@apache.org on 2007/11/23 17:58:55 UTC

svn commit: r597702 - in /lucene/java/trunk: CHANGES.txt src/java/org/apache/lucene/search/BooleanScorer2.java src/java/org/apache/lucene/search/ConjunctionScorer.java src/test/org/apache/lucene/search/TestScorerPerf.java

Author: yonik
Date: Fri Nov 23 08:58:54 2007
New Revision: 597702

URL: http://svn.apache.org/viewvc?rev=597702&view=rev
Log:
LUCENE-693: Speed up nested conjunctions

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/search/BooleanScorer2.java
    lucene/java/trunk/src/java/org/apache/lucene/search/ConjunctionScorer.java
    lucene/java/trunk/src/test/org/apache/lucene/search/TestScorerPerf.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=597702&r1=597701&r2=597702&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Fri Nov 23 08:58:54 2007
@@ -259,6 +259,11 @@
     raw bytes for each contiguous range of non-deleted documents.
     (Robert Engels via Mike McCandless)
 
+13. LUCENE-693: Speed up nested conjunctions (~2x) that match many
+    documents, and a slight performance increase for top level
+    conjunctions.  (yonik)
+
+
 Documentation
 
  1. LUCENE-1051: Generate separate javadocs for core, demo and contrib

Modified: lucene/java/trunk/src/java/org/apache/lucene/search/BooleanScorer2.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/BooleanScorer2.java?rev=597702&r1=597701&r2=597702&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/BooleanScorer2.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/BooleanScorer2.java Fri Nov 23 08:58:54 2007
@@ -139,7 +139,7 @@
    * When "sum" is used in a name it means score value summing
    * over the matching scorers
    */
-  private void initCountingSumScorer() {
+  private void initCountingSumScorer() throws IOException {
     coordinator.init();
     countingSumScorer = makeCountingSumScorer();
   }
@@ -192,10 +192,10 @@
 
   private static Similarity defaultSimilarity = new DefaultSimilarity();
 
-  private Scorer countingConjunctionSumScorer(List requiredScorers) {
+  private Scorer countingConjunctionSumScorer(List requiredScorers) throws IOException {
     // each scorer from the list counted as a single matcher
     final int requiredNrMatchers = requiredScorers.size();
-    ConjunctionScorer cs = new ConjunctionScorer(defaultSimilarity) {
+    return new ConjunctionScorer(defaultSimilarity, requiredScorers) {
       private int lastScoredDoc = -1;
 
       public float score() throws IOException {
@@ -210,34 +210,26 @@
         return super.score();
       }
     };
-    Iterator rsi = requiredScorers.iterator();
-    while (rsi.hasNext()) {
-      cs.add((Scorer) rsi.next());
-    }
-    return cs;
   }
 
-  private Scorer dualConjunctionSumScorer(Scorer req1, Scorer req2) { // non counting. 
-    ConjunctionScorer cs = new ConjunctionScorer(defaultSimilarity);
+  private Scorer dualConjunctionSumScorer(Scorer req1, Scorer req2) throws IOException { // non counting.
+    return new ConjunctionScorer(defaultSimilarity, new Scorer[]{req1, req2});
     // All scorers match, so defaultSimilarity always has 1 as
     // the coordination factor.
     // Therefore the sum of the scores of two scorers
     // is used as score.
-    cs.add(req1);
-    cs.add(req2);
-    return cs;
   }
 
   /** Returns the scorer to be used for match counting and score summing.
    * Uses requiredScorers, optionalScorers and prohibitedScorers.
    */
-  private Scorer makeCountingSumScorer() { // each scorer counted as a single matcher
+  private Scorer makeCountingSumScorer() throws IOException { // each scorer counted as a single matcher
     return (requiredScorers.size() == 0)
           ? makeCountingSumScorerNoReq()
           : makeCountingSumScorerSomeReq();
   }
 
-  private Scorer makeCountingSumScorerNoReq() { // No required scorers
+  private Scorer makeCountingSumScorerNoReq() throws IOException { // No required scorers
     if (optionalScorers.size() == 0) {
       return new NonMatchingScorer(); // no clauses or only prohibited clauses
     } else { // No required scorers. At least one optional scorer.
@@ -258,7 +250,7 @@
     }
   }
 
-  private Scorer makeCountingSumScorerSomeReq() { // At least one required scorer.
+  private Scorer makeCountingSumScorerSomeReq() throws IOException { // At least one required scorer.
     if (optionalScorers.size() < minNrShouldMatch) {
       return new NonMatchingScorer(); // fewer optional clauses than minimum that should match
     } else if (optionalScorers.size() == minNrShouldMatch) { // all optional scorers also required.

Modified: lucene/java/trunk/src/java/org/apache/lucene/search/ConjunctionScorer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/ConjunctionScorer.java?rev=597702&r1=597701&r2=597702&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/ConjunctionScorer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/ConjunctionScorer.java Fri Nov 23 08:58:54 2007
@@ -18,118 +18,105 @@
  */
 
 import java.io.IOException;
+import java.util.Collection;
 import java.util.Arrays;
 import java.util.Comparator;
 
 /** Scorer for conjunctions, sets of queries, all of which are required. */
 class ConjunctionScorer extends Scorer {
-  private Scorer[] scorers = new Scorer[2];
-  private int length = 0;
-  private int first = 0;
-  private int last = -1;
-  private boolean firstTime = true;
-  private boolean more = true;
-  private float coord;
+  private final Scorer[] scorers;
 
-  public ConjunctionScorer(Similarity similarity) {
-    super(similarity);
+  private boolean firstTime=true;
+  private boolean more;
+  private final float coord;
+  private int lastDoc=-1;
+
+  public ConjunctionScorer(Similarity similarity, Collection scorers) throws IOException {
+    this(similarity, (Scorer[])scorers.toArray(new Scorer[scorers.size()]));
   }
 
-  final void add(Scorer scorer) {
-    if (length >= scorers.length) {
-      // grow the array
-      Scorer[] temps = new Scorer[scorers.length * 2];
-      System.arraycopy(scorers, 0, temps, 0, length);
-      scorers = temps;
-    }
-    last += 1;
-    length += 1;
-    scorers[last] = scorer;
+  public ConjunctionScorer(Similarity similarity, Scorer[] scorers) throws IOException {
+    super(similarity);
+    this.scorers = scorers;
+    coord = getSimilarity().coord(this.scorers.length, this.scorers.length);
   }
 
-  public int doc() { return scorers[first].doc(); }
+  public int doc() { return lastDoc; }
 
   public boolean next() throws IOException {
-    if (firstTime) {
-      init(true);
-    } else if (more) {
-      more = scorers[last].next();                   // trigger further scanning
-    }
+    if (firstTime)
+      return init(0);
+    else if (more)
+      more = scorers[(scorers.length-1)].next();
     return doNext();
   }
-  
+
   private boolean doNext() throws IOException {
-    while (more && scorers[first].doc() < scorers[last].doc()) { // find doc w/ all clauses
-      more = scorers[first].skipTo(scorers[last].doc());      // skip first upto last
-      last = first; // move first to last
-      first = (first == length-1) ? 0 : first+1;
+    int first=0;
+    Scorer lastScorer = scorers[scorers.length-1];
+    Scorer firstScorer;
+    while (more && (firstScorer=scorers[first]).doc() < (lastDoc=lastScorer.doc())) {
+      more = firstScorer.skipTo(lastDoc);
+      lastScorer = firstScorer;
+      first = (first == (scorers.length-1)) ? 0 : first+1;
     }
-    return more;                                // found a doc with all clauses
+    return more;
   }
 
   public boolean skipTo(int target) throws IOException {
-    if(firstTime) {
-      init(false);
-    }
-    
-    for (int i = 0, pos = first; i < length; i++) {
-      if (!more) break; 
-      more = scorers[pos].skipTo(target);
-      pos = (pos == length-1) ? 0 : pos+1;
-    }
-    
-    if (more)
-      sortScorers();                              // re-sort scorers
-    
+    if (firstTime)
+      return init(target);
+    else if (more)
+      more = scorers[(scorers.length-1)].skipTo(target);
     return doNext();
   }
 
-  public float score() throws IOException {
-    float sum = 0.0f;
-    for (int i = 0; i < length; i++) {
-      sum += scorers[i].score();
-    }
-    return sum * coord;
-  }
-  
-  private void init(boolean initScorers) throws IOException {
-    //  compute coord factor
-    coord = getSimilarity().coord(length, length);
-   
-    more = length > 0;
-
-    if(initScorers){
-      // move each scorer to its first entry
-      for (int i = 0, pos = first; i < length; i++) {
-        if (!more) break; 
-        more = scorers[pos].next();
-        pos = (pos == length-1) ? 0 : pos+1;
-      }
-      // initial sort of simulated list
-      if (more) 
-        sortScorers();
+  // Note... most of this could be done in the constructor
+  // thus skipping a check for firstTime per call to next() and skipTo()
+  private boolean init(int target) throws IOException {
+    firstTime=false;
+    more = scorers.length>1;
+    for (int i=0; i<scorers.length; i++) {
+      more = target==0 ? scorers[i].next() : scorers[i].skipTo(target);
+      if (!more)
+        return false;
     }
 
-    firstTime = false;
-  }
+    // Sort the array the first time...
+    // We don't need to sort the array in any future calls because we know
+    // it will already start off sorted (all scorers on same doc).
 
-  private void sortScorers() {
-    // squeeze the array down for the sort
-    if (length != scorers.length) {
-      Scorer[] temps = new Scorer[length];
-      System.arraycopy(scorers, 0, temps, 0, length);
-      scorers = temps;
-    }
-    
     // note that this comparator is not consistent with equals!
     Arrays.sort(scorers, new Comparator() {         // sort the array
         public int compare(Object o1, Object o2) {
           return ((Scorer)o1).doc() - ((Scorer)o2).doc();
         }
       });
-   
-    first = 0;
-    last = length - 1;
+
+    doNext();
+
+    // If first-time skip distance is any predictor of
+    // scorer sparseness, then we should always try to skip first on
+    // those scorers.
+    // Keep last scorer in it's last place (it will be the first
+    // to be skipped on), but reverse all of the others so that
+    // they will be skipped on in order of original high skip.
+    int end=(scorers.length-1)-1;
+    for (int i=0; i<(end>>1); i++) {
+      Scorer tmp = scorers[i];
+      scorers[i] = scorers[end-i];
+      scorers[end-i] = tmp;
+    }
+
+    return more;
+  }
+
+  public float score() throws IOException {
+    float sum = 0.0f;
+    for (int i = 0; i < scorers.length; i++) {
+      sum += scorers[i].score();
+    }
+    return sum * coord;
   }
 
   public Explanation explain(int doc) {

Modified: lucene/java/trunk/src/test/org/apache/lucene/search/TestScorerPerf.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/TestScorerPerf.java?rev=597702&r1=597701&r2=597702&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/TestScorerPerf.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/TestScorerPerf.java Fri Nov 23 08:58:54 2007
@@ -42,6 +42,7 @@
   boolean validate = true;  // set to false when doing performance testing
 
   BitSet[] sets;
+  Term[] terms;
   IndexSearcher s;
 
   public void createDummySearcher() throws Exception {
@@ -55,22 +56,25 @@
 
   public void createRandomTerms(int nDocs, int nTerms, double power, Directory dir) throws Exception {
     int[] freq = new int[nTerms];
+    terms = new Term[nTerms];
     for (int i=0; i<nTerms; i++) {
       int f = (nTerms+1)-i;  // make first terms less frequent
       freq[i] = (int)Math.ceil(Math.pow(f,power));
+      terms[i] = new Term("f",Character.toString((char)('A'+i)));
     }
 
     IndexWriter iw = new IndexWriter(dir,new WhitespaceAnalyzer(), true);
-    iw.setMaxBufferedDocs(123);
     for (int i=0; i<nDocs; i++) {
       Document d = new Document();
       for (int j=0; j<nTerms; j++) {
         if (r.nextInt(freq[j]) == 0) {
-          d.add(new Field("f", Character.toString((char)j), Field.Store.NO, Field.Index.UN_TOKENIZED));
+          d.add(new Field("f", terms[j].text(), Field.Store.NO, Field.Index.UN_TOKENIZED));
+          //System.out.println(d);
         }
       }
       iw.addDocument(d);
     }
+    iw.optimize();
     iw.close();
   }
 
@@ -168,6 +172,7 @@
 
   public int doNestedConjunctions(int iter, int maxOuterClauses, int maxClauses) throws IOException {
     int ret=0;
+    long nMatches=0;
 
     for (int i=0; i<iter; i++) {
       int oClauses = r.nextInt(maxOuterClauses-1)+2;
@@ -185,15 +190,15 @@
       oq.add(bq, BooleanClause.Occur.MUST);
       } // outer
 
-
       CountingHitCollector hc = validate ? new MatchingHitCollector(result)
                                          : new CountingHitCollector();
       s.search(oq, hc);
+      nMatches += hc.getCount();
       ret += hc.getSum();
       if (validate) assertEquals(result.cardinality(), hc.getCount());
       // System.out.println(hc.getCount());
     }
-
+    System.out.println("Average number of matches="+(nMatches/iter));
     return ret;
   }
 
@@ -205,22 +210,28 @@
   ) throws IOException {
     int ret=0;
 
+    long nMatches=0;
     for (int i=0; i<iter; i++) {
       int nClauses = r.nextInt(maxClauses-1)+2; // min 2 clauses
       BooleanQuery bq = new BooleanQuery();
-      BitSet terms = new BitSet(termsInIndex);
+      BitSet termflag = new BitSet(termsInIndex);
       for (int j=0; j<nClauses; j++) {
         int tnum;
         // don't pick same clause twice
-        do {tnum = r.nextInt(termsInIndex);} while (terms.get(tnum));
-        Query tq = new TermQuery(new Term("f",Character.toString((char)tnum)));
+        tnum = r.nextInt(termsInIndex);
+        if (termflag.get(tnum)) tnum=termflag.nextClearBit(tnum);
+        if (tnum<0 || tnum>=termsInIndex) tnum=termflag.nextClearBit(0);
+        termflag.set(tnum);
+        Query tq = new TermQuery(terms[tnum]);
         bq.add(tq, BooleanClause.Occur.MUST);
       }
 
       CountingHitCollector hc = new CountingHitCollector();
       s.search(bq, hc);
+      nMatches += hc.getCount();
       ret += hc.getSum();
     }
+    System.out.println("Average number of matches="+(nMatches/iter));
 
     return ret;
   }
@@ -233,7 +244,7 @@
                                 int iter
   ) throws IOException {
     int ret=0;
-
+    long nMatches=0;
     for (int i=0; i<iter; i++) {
       int oClauses = r.nextInt(maxOuterClauses-1)+2;
       BooleanQuery oq = new BooleanQuery();
@@ -241,12 +252,15 @@
 
       int nClauses = r.nextInt(maxClauses-1)+2; // min 2 clauses
       BooleanQuery bq = new BooleanQuery();
-      BitSet terms = new BitSet(termsInIndex);
+      BitSet termflag = new BitSet(termsInIndex);
       for (int j=0; j<nClauses; j++) {
         int tnum;
         // don't pick same clause twice
-        do {tnum = r.nextInt(termsInIndex);} while (terms.get(tnum));
-        Query tq = new TermQuery(new Term("f",Character.toString((char)tnum)));
+        tnum = r.nextInt(termsInIndex);
+        if (termflag.get(tnum)) tnum=termflag.nextClearBit(tnum);
+        if (tnum<0 || tnum>=25) tnum=termflag.nextClearBit(0);
+        termflag.set(tnum);
+        Query tq = new TermQuery(terms[tnum]);
         bq.add(tq, BooleanClause.Occur.MUST);
       } // inner
 
@@ -256,9 +270,10 @@
 
       CountingHitCollector hc = new CountingHitCollector();
       s.search(oq, hc);
+      nMatches += hc.getCount();     
       ret += hc.getSum();
     }
-
+    System.out.println("Average number of matches="+(nMatches/iter));
     return ret;
   }
 
@@ -275,7 +290,7 @@
       PhraseQuery q = new PhraseQuery();
       for (int j=0; j<nClauses; j++) {
         int tnum = r.nextInt(termsInIndex);
-        q.add(new Term("f",Character.toString((char)tnum)), j);
+        q.add(new Term("f",Character.toString((char)(tnum+'A'))), j);
       }
       q.setSlop(termsInIndex);  // this could be random too
 
@@ -299,7 +314,8 @@
   }
 
   /***
-  int bigIter=6;
+  int bigIter=10;
+
   public void testConjunctionPerf() throws Exception {
     createDummySearcher();
     validate=false;
@@ -326,16 +342,17 @@
     s.close();
   }
 
+
   public void testConjunctionTerms() throws Exception {
     validate=false;
     RAMDirectory dir = new RAMDirectory();
     System.out.println("Creating index");
-    createRandomTerms(100000,25,2, dir);
+    createRandomTerms(100000,25,.5, dir);
     s = new IndexSearcher(dir);
     System.out.println("Starting performance test");
     for (int i=0; i<bigIter; i++) {
       long start = System.currentTimeMillis();
-      doTermConjunctions(s,25,5,10000);
+      doTermConjunctions(s,25,5,1000);
       long end = System.currentTimeMillis();
       System.out.println("milliseconds="+(end-start));
     }
@@ -346,12 +363,12 @@
     validate=false;    
     RAMDirectory dir = new RAMDirectory();
     System.out.println("Creating index");
-    createRandomTerms(100000,25,2, dir);
+    createRandomTerms(100000,25,.2, dir);
     s = new IndexSearcher(dir);
     System.out.println("Starting performance test");
     for (int i=0; i<bigIter; i++) {
       long start = System.currentTimeMillis();
-      doNestedTermConjunctions(s,25,5,5,1000);
+      doNestedTermConjunctions(s,25,3,3,200);
       long end = System.currentTimeMillis();
       System.out.println("milliseconds="+(end-start));
     }
@@ -373,8 +390,8 @@
       System.out.println("milliseconds="+(end-start));
     }
     s.close();
-
   }
    ***/
+
 
 }