You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2016/06/22 07:08:59 UTC

[2/2] lucene-solr:branch_6x: LUCENE-7339: Specialize conjunction with bit sets.

LUCENE-7339: Specialize conjunction with bit sets.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/60dcef6c
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/60dcef6c
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/60dcef6c

Branch: refs/heads/branch_6x
Commit: 60dcef6c61b0a72f4a7bb6e9b27a0073464f53c5
Parents: a3fc7ef
Author: Adrien Grand <jp...@gmail.com>
Authored: Wed Jun 22 08:02:17 2016 +0200
Committer: Adrien Grand <jp...@gmail.com>
Committed: Wed Jun 22 08:32:01 2016 +0200

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   2 +-
 .../apache/lucene/search/ConjunctionDISI.java   | 124 ++++++++++++++++---
 .../org/apache/lucene/search/LRUQueryCache.java |  34 ++++-
 .../org/apache/lucene/util/BitSetIterator.java  |  10 ++
 .../lucene/search/TestConjunctionDISI.java      |  98 +++++++++++----
 .../apache/lucene/search/TestScorerPerf.java    |   1 +
 6 files changed, 227 insertions(+), 42 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/60dcef6c/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index ae0a157..eb3d115 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -56,7 +56,7 @@ Improvements
 
 Optimizations
 
-* LUCENE-7330: Speed up conjunction queries. (Adrien Grand)
+* LUCENE-7330, LUCENE-7339: Speed up conjunction queries. (Adrien Grand)
 
 Other
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/60dcef6c/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java b/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java
index 205a349..43d03b2 100644
--- a/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java
+++ b/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java
@@ -19,11 +19,15 @@ package org.apache.lucene.search;
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
 
 import org.apache.lucene.search.spans.Spans;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BitSet;
+import org.apache.lucene.util.BitSetIterator;
 import org.apache.lucene.util.CollectionUtil;
 
 /** A conjunction of DocIdSetIterators.
@@ -47,11 +51,7 @@ public final class ConjunctionDISI extends DocIdSetIterator {
       addScorer(scorer, allIterators, twoPhaseIterators);
     }
 
-    DocIdSetIterator iterator = new ConjunctionDISI(allIterators);
-    if (twoPhaseIterators.isEmpty() == false) {
-      iterator = TwoPhaseIterator.asDocIdSetIterator(new ConjunctionTwoPhaseIterator(iterator, twoPhaseIterators));
-    }
-    return iterator;
+    return createConjunction(allIterators, twoPhaseIterators);
   }
 
   /** Create a conjunction over the provided DocIdSetIterators. Note that the
@@ -68,11 +68,7 @@ public final class ConjunctionDISI extends DocIdSetIterator {
       addIterator(iterator, allIterators, twoPhaseIterators);
     }
 
-    DocIdSetIterator iterator = new ConjunctionDISI(allIterators);
-    if (twoPhaseIterators.isEmpty() == false) {
-      iterator = TwoPhaseIterator.asDocIdSetIterator(new ConjunctionTwoPhaseIterator(iterator, twoPhaseIterators));
-    }
-    return iterator;
+    return createConjunction(allIterators, twoPhaseIterators);
   }
 
   /** Create a conjunction over the provided {@link Spans}. Note that the
@@ -89,11 +85,7 @@ public final class ConjunctionDISI extends DocIdSetIterator {
       addSpans(spans, allIterators, twoPhaseIterators);
     }
 
-    DocIdSetIterator iterator = new ConjunctionDISI(allIterators);
-    if (twoPhaseIterators.isEmpty() == false) {
-      iterator = TwoPhaseIterator.asDocIdSetIterator(new ConjunctionTwoPhaseIterator(iterator, twoPhaseIterators));
-    }
-    return iterator;
+    return createConjunction(allIterators, twoPhaseIterators);
   }
 
   /** Adds the scorer, possibly splitting up into two phases or collapsing if it is another conjunction */
@@ -127,6 +119,10 @@ public final class ConjunctionDISI extends DocIdSetIterator {
       allIterators.add(conjunction.lead1);
       allIterators.add(conjunction.lead2);
       Collections.addAll(allIterators, conjunction.others);
+    } else if (disi.getClass() == BitSetConjunctionDISI.class) {
+      BitSetConjunctionDISI conjunction = (BitSetConjunctionDISI) disi;
+      allIterators.add(conjunction.lead);
+      Collections.addAll(allIterators, conjunction.bitSetIterators);
     } else {
       allIterators.add(disi);
     }
@@ -141,6 +137,41 @@ public final class ConjunctionDISI extends DocIdSetIterator {
     }
   }
 
+  private static DocIdSetIterator createConjunction(
+      List<DocIdSetIterator> allIterators,
+      List<TwoPhaseIterator> twoPhaseIterators) {
+    long minCost = allIterators.stream().mapToLong(DocIdSetIterator::cost).min().getAsLong();
+    List<BitSetIterator> bitSetIterators = new ArrayList<>();
+    List<DocIdSetIterator> iterators = new ArrayList<>();
+    for (DocIdSetIterator iterator : allIterators) {
+      if (iterator.cost() > minCost && iterator instanceof BitSetIterator) {
+        // we put all bitset iterators into bitSetIterators
+        // except if they have the minimum cost, since we need
+        // them to lead the iteration in that case
+        bitSetIterators.add((BitSetIterator) iterator);
+      } else {
+        iterators.add(iterator);
+      }
+    }
+
+    DocIdSetIterator disi;
+    if (iterators.size() == 1) {
+      disi = iterators.get(0);
+    } else {
+      disi = new ConjunctionDISI(iterators);
+    }
+
+    if (bitSetIterators.size() > 0) {
+      disi = new BitSetConjunctionDISI(disi, bitSetIterators);
+    }
+
+    if (twoPhaseIterators.isEmpty() == false) {
+      disi = TwoPhaseIterator.asDocIdSetIterator(new ConjunctionTwoPhaseIterator(disi, twoPhaseIterators));
+    }
+
+    return disi;
+  }
+
   final DocIdSetIterator lead1, lead2;
   final DocIdSetIterator[] others;
 
@@ -214,6 +245,69 @@ public final class ConjunctionDISI extends DocIdSetIterator {
     return lead1.cost(); // overestimate
   }
 
+  /** Conjunction between a {@link DocIdSetIterator} and one or more {@link BitSetIterator}s. */
+  private static class BitSetConjunctionDISI extends DocIdSetIterator {
+
+    private final DocIdSetIterator lead;
+    private final BitSetIterator[] bitSetIterators;
+    private final BitSet[] bitSets;
+    private final int minLength;
+
+    BitSetConjunctionDISI(DocIdSetIterator lead, Collection<BitSetIterator> bitSetIterators) {
+      this.lead = lead;
+      assert bitSetIterators.size() > 0;
+      this.bitSetIterators = bitSetIterators.toArray(new BitSetIterator[0]);
+      // Put the least costly iterators first so that we exit as soon as possible
+      ArrayUtil.timSort(this.bitSetIterators, (a, b) -> Long.compare(a.cost(), b.cost()));
+      this.bitSets = new BitSet[this.bitSetIterators.length];
+      int minLen = Integer.MAX_VALUE;
+      for (int i = 0; i < this.bitSetIterators.length; ++i) {
+        BitSet bitSet = this.bitSetIterators[i].getBitSet();
+        this.bitSets[i] = bitSet;
+        minLen = Math.min(minLen, bitSet.length());
+      }
+      this.minLength = minLen;
+    }
+
+    @Override
+    public int docID() {
+      return lead.docID();
+    }
+
+    @Override
+    public int nextDoc() throws IOException {
+      return doNext(lead.nextDoc());
+    }
+
+    @Override
+    public int advance(int target) throws IOException {
+      return doNext(lead.advance(target));
+    }
+
+    private int doNext(int doc) throws IOException {
+      advanceLead: for (;; doc = lead.nextDoc()) {
+        if (doc >= minLength) {
+          return NO_MORE_DOCS;
+        }
+        for (BitSet bitSet : bitSets) {
+          if (bitSet.get(doc) == false) {
+            continue advanceLead;
+          }
+        }
+        for (BitSetIterator iterator : bitSetIterators) {
+          iterator.setDocId(doc);
+        }
+        return doc;
+      }
+    }
+
+    @Override
+    public long cost() {
+      return lead.cost();
+    }
+
+  }
+
   /**
    * {@link TwoPhaseIterator} implementing a conjunction.
    */

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/60dcef6c/lucene/core/src/java/org/apache/lucene/search/LRUQueryCache.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/LRUQueryCache.java b/lucene/core/src/java/org/apache/lucene/search/LRUQueryCache.java
index 8598645..f555365 100644
--- a/lucene/core/src/java/org/apache/lucene/search/LRUQueryCache.java
+++ b/lucene/core/src/java/org/apache/lucene/search/LRUQueryCache.java
@@ -40,6 +40,8 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TieredMergePolicy;
 import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.Accountables;
+import org.apache.lucene.util.BitDocIdSet;
+import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.RoaringDocIdSet;
 
@@ -502,9 +504,39 @@ public class LRUQueryCache implements QueryCache, Accountable {
   }
 
   /**
-   * Default cache implementation: uses {@link RoaringDocIdSet}.
+   * Default cache implementation: uses {@link RoaringDocIdSet} for sets that
+   * have a density &lt; 1% and a {@link BitDocIdSet} over a {@link FixedBitSet}
+   * otherwise.
    */
   protected DocIdSet cacheImpl(BulkScorer scorer, int maxDoc) throws IOException {
+    if (scorer.cost() * 100 >= maxDoc) {
+      // FixedBitSet is faster for dense sets and will enable the random-access
+      // optimization in ConjunctionDISI
+      return cacheIntoBitSet(scorer, maxDoc);
+    } else {
+      return cacheIntoRoaringDocIdSet(scorer, maxDoc);
+    }
+  }
+
+  private static DocIdSet cacheIntoBitSet(BulkScorer scorer, int maxDoc) throws IOException {
+    final FixedBitSet bitSet = new FixedBitSet(maxDoc);
+    long cost[] = new long[1];
+    scorer.score(new LeafCollector() {
+
+      @Override
+      public void setScorer(Scorer scorer) throws IOException {}
+
+      @Override
+      public void collect(int doc) throws IOException {
+        cost[0]++;
+        bitSet.set(doc);
+      }
+
+    }, null);
+    return new BitDocIdSet(bitSet, cost[0]);
+  }
+
+  private static DocIdSet cacheIntoRoaringDocIdSet(BulkScorer scorer, int maxDoc) throws IOException {
     RoaringDocIdSet.Builder builder = new RoaringDocIdSet.Builder(maxDoc);
     scorer.score(new LeafCollector() {
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/60dcef6c/lucene/core/src/java/org/apache/lucene/util/BitSetIterator.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/BitSetIterator.java b/lucene/core/src/java/org/apache/lucene/util/BitSetIterator.java
index 1a51d80..9e88c6d 100644
--- a/lucene/core/src/java/org/apache/lucene/util/BitSetIterator.java
+++ b/lucene/core/src/java/org/apache/lucene/util/BitSetIterator.java
@@ -62,11 +62,21 @@ public class BitSetIterator extends DocIdSetIterator {
     this.cost = cost;
   }
 
+  /** Return the wrapped {@link BitSet}. */
+  public BitSet getBitSet() {
+    return bits;
+  }
+
   @Override
   public int docID() {
     return doc;
   }
 
+  /** Set the current doc id that this iterator is on. */
+  public void setDocId(int docId) {
+    this.doc = docId;
+  }
+
   @Override
   public int nextDoc() {
     return advance(doc + 1);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/60dcef6c/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java b/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java
index dcb6664..9835f35 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java
@@ -30,12 +30,18 @@ import org.apache.lucene.util.TestUtil;
 
 public class TestConjunctionDISI extends LuceneTestCase {
 
-  private static TwoPhaseIterator approximation(final DocIdSetIterator iterator, final FixedBitSet confirmed) {
-    return new TwoPhaseIterator(iterator) {
+  private static TwoPhaseIterator approximation(DocIdSetIterator iterator, final FixedBitSet confirmed) {
+    DocIdSetIterator approximation;
+    if (random().nextBoolean()) {
+      approximation = anonymizeIterator(iterator);
+    } else {
+      approximation = iterator;
+    }
+    return new TwoPhaseIterator(approximation) {
 
       @Override
       public boolean matches() throws IOException {
-        return confirmed.get(iterator.docID());
+        return confirmed.get(approximation.docID());
       }
 
       @Override
@@ -45,6 +51,33 @@ public class TestConjunctionDISI extends LuceneTestCase {
     };
   }
 
+  /** Return an anonym class so that ConjunctionDISI cannot optimize it
+   *  like it does eg. for BitSetIterators. */
+  private static DocIdSetIterator anonymizeIterator(DocIdSetIterator it) {
+    return new DocIdSetIterator() {
+      
+      @Override
+      public int nextDoc() throws IOException {
+        return it.nextDoc();
+      }
+      
+      @Override
+      public int docID() {
+        return it.docID();
+      }
+      
+      @Override
+      public long cost() {
+        return it.docID();
+      }
+      
+      @Override
+      public int advance(int target) throws IOException {
+        return it.advance(target);
+      }
+    };
+  }
+
   private static Scorer scorer(TwoPhaseIterator twoPhaseIterator) {
     return scorer(TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator), twoPhaseIterator);
   }
@@ -168,16 +201,24 @@ public class TestConjunctionDISI extends LuceneTestCase {
       final Scorer[] iterators = new Scorer[numIterators];
       for (int i = 0; i < iterators.length; ++i) {
         final FixedBitSet set = randomSet(maxDoc);
-        if (random().nextBoolean()) {
-          // simple iterator
-          sets[i] = set;
-          iterators[i] = new ConstantScoreScorer(null, 0f, new BitDocIdSet(set).iterator());
-        } else {
-          // scorer with approximation
-          final FixedBitSet confirmed = clearRandomBits(set);
-          sets[i] = confirmed;
-          final TwoPhaseIterator approximation = approximation(new BitDocIdSet(set).iterator(), confirmed);
-          iterators[i] = scorer(approximation);
+        switch (random().nextInt(3)) {
+          case 0:
+            // simple iterator
+            sets[i] = set;
+            iterators[i] = new ConstantScoreScorer(null, 0f, anonymizeIterator(new BitDocIdSet(set).iterator()));
+            break;
+          case 1:
+            // bitSet iterator
+            sets[i] = set;
+            iterators[i] = new ConstantScoreScorer(null, 0f, new BitDocIdSet(set).iterator());
+            break;
+          default:
+            // scorer with approximation
+            final FixedBitSet confirmed = clearRandomBits(set);
+            sets[i] = confirmed;
+            final TwoPhaseIterator approximation = approximation(new BitDocIdSet(set).iterator(), confirmed);
+            iterators[i] = scorer(approximation);
+            break;
         }
       }
 
@@ -232,19 +273,26 @@ public class TestConjunctionDISI extends LuceneTestCase {
       for (int i = 0; i < numIterators; ++i) {
         final FixedBitSet set = randomSet(maxDoc);
         final Scorer newIterator;
-        if (random().nextBoolean()) {
-          // simple iterator
-          sets[i] = set;
-          newIterator = new ConstantScoreScorer(null, 0f, new BitDocIdSet(set).iterator());
-        } else {
-          // scorer with approximation
-          final FixedBitSet confirmed = clearRandomBits(set);
-          sets[i] = confirmed;
-          final TwoPhaseIterator approximation = approximation(new BitDocIdSet(set).iterator(), confirmed);
-          newIterator = scorer(approximation);
-          hasApproximation = true;
+        switch (random().nextInt(3)) {
+          case 0:
+            // simple iterator
+            sets[i] = set;
+            newIterator = new ConstantScoreScorer(null, 0f, anonymizeIterator(new BitDocIdSet(set).iterator()));
+            break;
+          case 1:
+            // bitSet iterator
+            sets[i] = set;
+            newIterator = new ConstantScoreScorer(null, 0f, new BitDocIdSet(set).iterator());
+            break;
+          default:
+            // scorer with approximation
+            final FixedBitSet confirmed = clearRandomBits(set);
+            sets[i] = confirmed;
+            final TwoPhaseIterator approximation = approximation(new BitDocIdSet(set).iterator(), confirmed);
+            newIterator = scorer(approximation);
+            hasApproximation = true;
+            break;
         }
-
         if (conjunction == null) {
           conjunction = newIterator;
         } else {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/60dcef6c/lucene/core/src/test/org/apache/lucene/search/TestScorerPerf.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestScorerPerf.java b/lucene/core/src/test/org/apache/lucene/search/TestScorerPerf.java
index 741ebe3..a7c1ba8 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestScorerPerf.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestScorerPerf.java
@@ -53,6 +53,7 @@ public class TestScorerPerf extends LuceneTestCase {
     iw.close();
     r = DirectoryReader.open(d);
     s = newSearcher(r);
+    s.setQueryCache(null);
   }
 
   public void createRandomTerms(int nDocs, int nTerms, double power, Directory dir) throws Exception {