Posted to commits@lucene.apache.org by jp...@apache.org on 2021/10/27 08:20:19 UTC

[lucene] branch main updated: LUCENE-10206 Implement O(1) count on query cache (#415)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new 941df98  LUCENE-10206 Implement O(1) count on query cache (#415)
941df98 is described below

commit 941df98c3f718371af4702c92bf6537739120064
Author: Nik Everett <ni...@gmail.com>
AuthorDate: Wed Oct 27 04:20:10 2021 -0400

    LUCENE-10206 Implement O(1) count on query cache (#415)
    
    When we load a query into the query cache, we always calculate the
    count of matching documents. This change uses that count to power the
    new `O(1)` `Weight#count` method.
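    
    For illustration only, a minimal sketch (not part of this patch; the
    cache sizing, the helper method, and the assumption that the caching
    policy admits the query are ours) of how a searcher can benefit once a
    query has been cached:
    
        import org.apache.lucene.index.DirectoryReader;
        import org.apache.lucene.search.IndexSearcher;
        import org.apache.lucene.search.LRUQueryCache;
        import org.apache.lucene.search.Query;
    
        static int cachedCount(DirectoryReader reader, Query query) throws java.io.IOException {
          IndexSearcher searcher = new IndexSearcher(reader);
          // A large, permissive cache for the sake of the example.
          searcher.setQueryCache(new LRUQueryCache(1_000, 64 * 1024 * 1024));
          // The first execution may populate the cache, subject to the
          // QueryCachingPolicy in effect.
          searcher.search(query, 10);
          // Once the query is cached on a segment without deletions, the
          // cached Weight can answer this from the precomputed count in
          // O(1) instead of iterating every match.
          return searcher.count(query);
        }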
---
 .../org/apache/lucene/search/LRUQueryCache.java    | 158 +++++++++++++++------
 .../apache/lucene/search/TestLRUQueryCache.java    |  68 +++++++++
 2 files changed, 186 insertions(+), 40 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/search/LRUQueryCache.java b/lucene/core/src/java/org/apache/lucene/search/LRUQueryCache.java
index 7c98ddf..127f00f 100644
--- a/lucene/core/src/java/org/apache/lucene/search/LRUQueryCache.java
+++ b/lucene/core/src/java/org/apache/lucene/search/LRUQueryCache.java
@@ -43,6 +43,7 @@ import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.Accountables;
 import org.apache.lucene.util.BitDocIdSet;
 import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.RoaringDocIdSet;
 
 /**
@@ -266,7 +267,7 @@ public class LRUQueryCache implements QueryCache, Accountable {
     }
   }
 
-  DocIdSet get(Query key, IndexReader.CacheHelper cacheHelper) {
+  CacheAndCount get(Query key, IndexReader.CacheHelper cacheHelper) {
     assert lock.isHeldByCurrentThread();
     assert key instanceof BoostQuery == false;
     assert key instanceof ConstantScoreQuery == false;
@@ -282,7 +283,7 @@ public class LRUQueryCache implements QueryCache, Accountable {
       onMiss(readerKey, key);
       return null;
     }
-    final DocIdSet cached = leafCache.get(singleton);
+    final CacheAndCount cached = leafCache.get(singleton);
     if (cached == null) {
       onMiss(readerKey, singleton);
     } else {
@@ -291,7 +292,7 @@ public class LRUQueryCache implements QueryCache, Accountable {
     return cached;
   }
 
-  private void putIfAbsent(Query query, DocIdSet set, IndexReader.CacheHelper cacheHelper) {
+  private void putIfAbsent(Query query, CacheAndCount cached, IndexReader.CacheHelper cacheHelper) {
     assert query instanceof BoostQuery == false;
     assert query instanceof ConstantScoreQuery == false;
     // under a lock to make sure that mostRecentlyUsedQueries and cache remain sync'ed
@@ -318,7 +319,7 @@ public class LRUQueryCache implements QueryCache, Accountable {
         // we just created a new leaf cache, need to register a close listener
         cacheHelper.addClosedListener(this::clearCoreCacheKey);
       }
-      leafCache.putIfAbsent(query, set);
+      leafCache.putIfAbsent(query, cached);
       evictIfNecessary();
     } finally {
       lock.unlock();
@@ -437,8 +438,8 @@ public class LRUQueryCache implements QueryCache, Accountable {
       recomputedRamBytesUsed += mostRecentlyUsedQueries.size() * QUERY_DEFAULT_RAM_BYTES_USED;
       for (LeafCache leafCache : cache.values()) {
         recomputedRamBytesUsed += HASHTABLE_RAM_BYTES_PER_ENTRY * leafCache.cache.size();
-        for (DocIdSet set : leafCache.cache.values()) {
-          recomputedRamBytesUsed += set.ramBytesUsed();
+        for (CacheAndCount cached : leafCache.cache.values()) {
+          recomputedRamBytesUsed += cached.ramBytesUsed();
         }
       }
       if (recomputedRamBytesUsed != ramBytesUsed) {
@@ -498,7 +499,7 @@ public class LRUQueryCache implements QueryCache, Accountable {
    * Default cache implementation: uses {@link RoaringDocIdSet} for sets that have a density &lt; 1%
    * and a {@link BitDocIdSet} over a {@link FixedBitSet} otherwise.
    */
-  protected DocIdSet cacheImpl(BulkScorer scorer, int maxDoc) throws IOException {
+  protected CacheAndCount cacheImpl(BulkScorer scorer, int maxDoc) throws IOException {
     if (scorer.cost() * 100 >= maxDoc) {
       // FixedBitSet is faster for dense sets and will enable the random-access
       // optimization in ConjunctionDISI
@@ -508,9 +509,9 @@ public class LRUQueryCache implements QueryCache, Accountable {
     }
   }
 
-  private static DocIdSet cacheIntoBitSet(BulkScorer scorer, int maxDoc) throws IOException {
+  private static CacheAndCount cacheIntoBitSet(BulkScorer scorer, int maxDoc) throws IOException {
     final FixedBitSet bitSet = new FixedBitSet(maxDoc);
-    long[] cost = new long[1];
+    int[] count = new int[1];
     scorer.score(
         new LeafCollector() {
 
@@ -519,15 +520,15 @@ public class LRUQueryCache implements QueryCache, Accountable {
 
           @Override
           public void collect(int doc) throws IOException {
-            cost[0]++;
+            count[0]++;
             bitSet.set(doc);
           }
         },
         null);
-    return new BitDocIdSet(bitSet, cost[0]);
+    return new CacheAndCount(new BitDocIdSet(bitSet, count[0]), count[0]);
   }
 
-  private static DocIdSet cacheIntoRoaringDocIdSet(BulkScorer scorer, int maxDoc)
+  private static CacheAndCount cacheIntoRoaringDocIdSet(BulkScorer scorer, int maxDoc)
       throws IOException {
     RoaringDocIdSet.Builder builder = new RoaringDocIdSet.Builder(maxDoc);
     scorer.score(
@@ -542,7 +543,8 @@ public class LRUQueryCache implements QueryCache, Accountable {
           }
         },
         null);
-    return builder.build();
+    RoaringDocIdSet cache = builder.build();
+    return new CacheAndCount(cache, cache.cardinality());
   }
 
   /**
@@ -622,7 +624,7 @@ public class LRUQueryCache implements QueryCache, Accountable {
   private class LeafCache implements Accountable {
 
     private final Object key;
-    private final Map<Query, DocIdSet> cache;
+    private final Map<Query, CacheAndCount> cache;
     private volatile long ramBytesUsed;
 
     LeafCache(Object key) {
@@ -641,25 +643,25 @@ public class LRUQueryCache implements QueryCache, Accountable {
       LRUQueryCache.this.onDocIdSetEviction(key, 1, ramBytesUsed);
     }
 
-    DocIdSet get(Query query) {
+    CacheAndCount get(Query query) {
       assert query instanceof BoostQuery == false;
       assert query instanceof ConstantScoreQuery == false;
       return cache.get(query);
     }
 
-    void putIfAbsent(Query query, DocIdSet set) {
+    void putIfAbsent(Query query, CacheAndCount cached) {
       assert query instanceof BoostQuery == false;
       assert query instanceof ConstantScoreQuery == false;
-      if (cache.putIfAbsent(query, set) == null) {
+      if (cache.putIfAbsent(query, cached) == null) {
         // the set was actually put
-        onDocIdSetCache(HASHTABLE_RAM_BYTES_PER_ENTRY + set.ramBytesUsed());
+        onDocIdSetCache(HASHTABLE_RAM_BYTES_PER_ENTRY + cached.ramBytesUsed());
       }
     }
 
     void remove(Query query) {
       assert query instanceof BoostQuery == false;
       assert query instanceof ConstantScoreQuery == false;
-      DocIdSet removed = cache.remove(query);
+      CacheAndCount removed = cache.remove(query);
       if (removed != null) {
         onDocIdSetEviction(HASHTABLE_RAM_BYTES_PER_ENTRY + removed.ramBytesUsed());
       }
@@ -703,10 +705,10 @@ public class LRUQueryCache implements QueryCache, Accountable {
       return worstCaseRamUsage * 5 < totalRamAvailable;
     }
 
-    private DocIdSet cache(LeafReaderContext context) throws IOException {
+    private CacheAndCount cache(LeafReaderContext context) throws IOException {
       final BulkScorer scorer = in.bulkScorer(context);
       if (scorer == null) {
-        return DocIdSet.EMPTY;
+        return CacheAndCount.EMPTY;
       } else {
         return cacheImpl(scorer, context.reader().maxDoc());
       }
@@ -747,18 +749,18 @@ public class LRUQueryCache implements QueryCache, Accountable {
         return in.scorerSupplier(context);
       }
 
-      DocIdSet docIdSet;
+      CacheAndCount cached;
       try {
-        docIdSet = get(in.getQuery(), cacheHelper);
+        cached = get(in.getQuery(), cacheHelper);
       } finally {
         lock.unlock();
       }
 
-      if (docIdSet == null) {
+      if (cached == null) {
         if (policy.shouldCache(in.getQuery())) {
           final ScorerSupplier supplier = in.scorerSupplier(context);
           if (supplier == null) {
-            putIfAbsent(in.getQuery(), DocIdSet.EMPTY, cacheHelper);
+            putIfAbsent(in.getQuery(), CacheAndCount.EMPTY, cacheHelper);
             return null;
           }
 
@@ -772,10 +774,10 @@ public class LRUQueryCache implements QueryCache, Accountable {
               }
 
               Scorer scorer = supplier.get(Long.MAX_VALUE);
-              DocIdSet docIdSet =
+              CacheAndCount cached =
                   cacheImpl(new DefaultBulkScorer(scorer), context.reader().maxDoc());
-              putIfAbsent(in.getQuery(), docIdSet, cacheHelper);
-              DocIdSetIterator disi = docIdSet.iterator();
+              putIfAbsent(in.getQuery(), cached, cacheHelper);
+              DocIdSetIterator disi = cached.iterator();
               if (disi == null) {
                 // docIdSet.iterator() is allowed to return null when empty but we want a non-null
                 // iterator here
@@ -796,11 +798,11 @@ public class LRUQueryCache implements QueryCache, Accountable {
         }
       }
 
-      assert docIdSet != null;
-      if (docIdSet == DocIdSet.EMPTY) {
+      assert cached != null;
+      if (cached == CacheAndCount.EMPTY) {
         return null;
       }
-      final DocIdSetIterator disi = docIdSet.iterator();
+      final DocIdSetIterator disi = cached.iterator();
       if (disi == null) {
         return null;
       }
@@ -830,7 +832,55 @@ public class LRUQueryCache implements QueryCache, Accountable {
 
     @Override
     public int count(LeafReaderContext context) throws IOException {
-      return in.count(context);
+      // If the wrapped weight can count quickly then use that
+      int innerCount = in.count(context);
+      if (innerCount != -1) {
+        return innerCount;
+      }
+
+      // Our cache won't have an accurate count if there are deletions
+      if (context.reader().hasDeletions()) {
+        return -1;
+      }
+
+      // Otherwise check if the count is in the cache
+      if (used.compareAndSet(false, true)) {
+        policy.onUse(getQuery());
+      }
+
+      if (in.isCacheable(context) == false) {
+        // this segment is not suitable for caching
+        return -1;
+      }
+
+      // Short-circuit: Check whether this segment is eligible for caching
+      // before we take a lock because of #get
+      if (shouldCache(context) == false) {
+        return -1;
+      }
+
+      final IndexReader.CacheHelper cacheHelper = context.reader().getCoreCacheHelper();
+      if (cacheHelper == null) {
+        // this reader has no cacheHelper
+        return -1;
+      }
+
+      // If the lock is already busy, prefer using the uncached version than waiting
+      if (lock.tryLock() == false) {
+        return -1;
+      }
+
+      CacheAndCount cached;
+      try {
+        cached = get(in.getQuery(), cacheHelper);
+      } finally {
+        lock.unlock();
+      }
+      if (cached == null) {
+        // Not cached
+        return -1;
+      }
+      return cached.count();
     }
 
     @Override
@@ -866,27 +916,27 @@ public class LRUQueryCache implements QueryCache, Accountable {
         return in.bulkScorer(context);
       }
 
-      DocIdSet docIdSet;
+      CacheAndCount cached;
       try {
-        docIdSet = get(in.getQuery(), cacheHelper);
+        cached = get(in.getQuery(), cacheHelper);
       } finally {
         lock.unlock();
       }
 
-      if (docIdSet == null) {
+      if (cached == null) {
         if (policy.shouldCache(in.getQuery())) {
-          docIdSet = cache(context);
-          putIfAbsent(in.getQuery(), docIdSet, cacheHelper);
+          cached = cache(context);
+          putIfAbsent(in.getQuery(), cached, cacheHelper);
         } else {
           return in.bulkScorer(context);
         }
       }
 
-      assert docIdSet != null;
-      if (docIdSet == DocIdSet.EMPTY) {
+      assert cached != null;
+      if (cached == CacheAndCount.EMPTY) {
         return null;
       }
-      final DocIdSetIterator disi = docIdSet.iterator();
+      final DocIdSetIterator disi = cached.iterator();
       if (disi == null) {
         return null;
       }
@@ -895,4 +945,32 @@ public class LRUQueryCache implements QueryCache, Accountable {
           new ConstantScoreScorer(this, 0f, ScoreMode.COMPLETE_NO_SCORES, disi));
     }
   }
+
+  /** Cache of doc ids with a count. */
+  protected static class CacheAndCount implements Accountable {
+    protected static final CacheAndCount EMPTY = new CacheAndCount(DocIdSet.EMPTY, 0);
+
+    private static final long BASE_RAM_BYTES_USED =
+        RamUsageEstimator.shallowSizeOfInstance(CacheAndCount.class);
+    private final DocIdSet cache;
+    private final int count;
+
+    public CacheAndCount(DocIdSet cache, int count) {
+      this.cache = cache;
+      this.count = count;
+    }
+
+    public DocIdSetIterator iterator() throws IOException {
+      return cache.iterator();
+    }
+
+    public int count() {
+      return count;
+    }
+
+    @Override
+    public long ramBytesUsed() {
+      return BASE_RAM_BYTES_USED + cache.ramBytesUsed();
+    }
+  }
 }
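
A note on the cacheImpl heuristic above: the patch keeps the existing rule
that a set whose estimated density is at least 1% of maxDoc is cached in a
FixedBitSet-backed BitDocIdSet (which enables the random-access
optimization), while sparser sets use the more compact RoaringDocIdSet. A
minimal sketch of that check, where the standalone method is ours for
illustration and `cost` is the scorer's upper bound on the number of
matching documents:

    static boolean useFixedBitSet(long cost, int maxDoc) {
      // cost * 100 >= maxDoc  <=>  cost / maxDoc >= 1%; the check is written
      // multiplicatively so that integer division cannot truncate to zero.
      return cost * 100 >= maxDoc;
    }

Either way, the count now rides along for free: the bitset path counts
documents as it collects them, and the roaring path reuses
RoaringDocIdSet#cardinality(), so no extra pass over the matches is needed.
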
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestLRUQueryCache.java b/lucene/core/src/test/org/apache/lucene/search/TestLRUQueryCache.java
index 8c7cc0d..2179b34 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestLRUQueryCache.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestLRUQueryCache.java
@@ -62,6 +62,7 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Constants;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
@@ -1088,7 +1089,18 @@ public class TestLRUQueryCache extends LuceneTestCase {
         cachedSearcher.setQueryCachingPolicy(ALWAYS_CACHE);
       }
       final Query q = buildRandomQuery(0);
+      /*
+       * Counts are the same. If the query has already been cached
+       * this'll use the O(1) Weight#count method.
+       */
       assertEquals(uncachedSearcher.count(q), cachedSearcher.count(q));
+      /*
+       * Just to make sure we can iterate every time also check that the
+       * same docs are returned in the same order.
+       */
+      int size = 1 + random().nextInt(1000);
+      CheckHits.checkEqual(
+          q, uncachedSearcher.search(q, size).scoreDocs, cachedSearcher.search(q, size).scoreDocs);
       if (rarely()) {
         queryCache.assertConsistent();
       }
@@ -1976,4 +1988,60 @@ public class TestLRUQueryCache extends LuceneTestCase {
     reader.close();
     dir.close();
   }
+
+  public void testCacheHasFastCount() throws IOException {
+    Query query = new PhraseQuery("words", new BytesRef("alice"), new BytesRef("ran"));
+
+    Directory dir = newDirectory();
+    RandomIndexWriter w =
+        new RandomIndexWriter(
+            random(), dir, newIndexWriterConfig().setMergePolicy(NoMergePolicy.INSTANCE));
+    Document doc1 = new Document();
+    doc1.add(new TextField("words", "tom ran", Store.NO));
+    Document doc2 = new Document();
+    doc2.add(new TextField("words", "alice ran", Store.NO));
+    doc2.add(new StringField("f", "a", Store.NO));
+    Document doc3 = new Document();
+    doc3.add(new TextField("words", "alice ran", Store.NO));
+    doc3.add(new StringField("f", "b", Store.NO));
+    w.addDocuments(Arrays.asList(doc1, doc2, doc3));
+
+    try (IndexReader reader = w.getReader()) {
+      IndexSearcher searcher = newSearcher(reader);
+      searcher.setQueryCachingPolicy(ALWAYS_CACHE);
+      LRUQueryCache allCache =
+          new LRUQueryCache(1000000, 10000000, context -> true, Float.POSITIVE_INFINITY);
+      searcher.setQueryCache(allCache);
+      Weight weight = searcher.createWeight(query, ScoreMode.COMPLETE_NO_SCORES, 1);
+      LeafReaderContext context = getOnlyLeafReader(reader).getContext();
+      // We don't have a fast count before the cache is filled
+      assertEquals(-1, weight.count(context));
+      // Fetch the scorer to populate the cache
+      weight.scorer(context);
+      assertEquals(List.of(query), allCache.cachedQueries());
+      // Now we *do* have a fast count
+      assertEquals(2, weight.count(context));
+    }
+
+    w.deleteDocuments(new TermQuery(new Term("f", new BytesRef("b"))));
+    try (IndexReader reader = w.getReader()) {
+      IndexSearcher searcher = newSearcher(reader);
+      searcher.setQueryCachingPolicy(ALWAYS_CACHE);
+      LRUQueryCache allCache =
+          new LRUQueryCache(1000000, 10000000, context -> true, Float.POSITIVE_INFINITY);
+      searcher.setQueryCache(allCache);
+      Weight weight = searcher.createWeight(query, ScoreMode.COMPLETE_NO_SCORES, 1);
+      LeafReaderContext context = getOnlyLeafReader(reader).getContext();
+      // We don't have a fast count before the cache is filled
+      assertEquals(-1, weight.count(context));
+      // Fetch the scorer to populate the cache
+      weight.scorer(context);
+      assertEquals(List.of(query), allCache.cachedQueries());
+      // We still don't have a fast count because we have deleted documents
+      assertEquals(-1, weight.count(context));
+    }
+
+    w.close();
+    dir.close();
+  }
 }