You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2022/04/05 06:55:51 UTC

[lucene] 02/02: LUCENE-10484: Add support for concurrent facets random sampling (#765)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git

commit b0ecba4fb243d24d083325c7482f9725bccc57db
Author: Luca Cavanna <ja...@users.noreply.github.com>
AuthorDate: Tue Apr 5 08:51:57 2022 +0200

    LUCENE-10484: Add support for concurrent facets random sampling (#765)
    
    This commit adds a new static createManager method to RandomSamplingFacetsCollector that allows users to perform random sampling concurrently. The returned collector manager is very similar to the existing FacetsCollectorManager, but its reduce step returns a specialized RandomSamplingFacetsCollector that combines the matching docs of the individual collectors.
    
    This relates to [LUCENE-10002](https://issues.apache.org/jira/browse/LUCENE-10002). It allows users to use a collector manager instead of a collector when doing random sampling, in an effort to reduce usages of IndexSearcher#search(Query, Collector).
---
 lucene/CHANGES.txt                                 |  3 ++
 .../facet/RandomSamplingFacetsCollector.java       | 38 ++++++++++++++++++++++
 .../facet/TestRandomSamplingFacetsCollector.java   | 20 +++++-------
 3 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index a36cab8d8ef..c03c7d8937a 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -26,6 +26,9 @@ Improvements
 * LUCENE-10494: Implement method to bulk add all collection elements to a PriorityQueue.
   (Bauyrzhan Sakhariyev)
 
+* LUCENE-10484: Add support for concurrent random sampling by calling
+  RandomSamplingFacetsCollector#createManager. (Luca Cavanna)
+
 Optimizations
 ---------------------
 
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/RandomSamplingFacetsCollector.java b/lucene/facet/src/java/org/apache/lucene/facet/RandomSamplingFacetsCollector.java
index 92a0c3133b1..de7c0e27323 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/RandomSamplingFacetsCollector.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/RandomSamplingFacetsCollector.java
@@ -18,10 +18,12 @@ package org.apache.lucene.facet;
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.List;
 import org.apache.lucene.facet.FacetsConfig.DimConfig;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.search.CollectorManager;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.util.BitDocIdSet;
@@ -255,4 +257,40 @@ public class RandomSamplingFacetsCollector extends FacetsCollector {
   public double getSamplingRate() {
     return samplingRate;
   }
+
+  /**
+   * Creates a {@link CollectorManager} whose collectors are {@link RandomSamplingFacetsCollector}s
+   * built from {@code sampleSize} and {@code seed}; reduce combines their original matching docs.
+   */
+  public static CollectorManager<RandomSamplingFacetsCollector, RandomSamplingFacetsCollector>
+      createManager(int sampleSize, long seed) {
+    return new CollectorManager<>() {
+      @Override
+      public RandomSamplingFacetsCollector newCollector() {
+        return new RandomSamplingFacetsCollector(sampleSize, seed);
+      }
+
+      @Override
+      public RandomSamplingFacetsCollector reduce(
+          Collection<RandomSamplingFacetsCollector> collectors) {
+        if (collectors == null || collectors.size() == 0) { // no collectors: return a fresh one
+          return new RandomSamplingFacetsCollector(sampleSize, seed);
+        }
+        if (collectors.size() == 1) { // single collector: return it as-is, nothing to merge
+          return collectors.iterator().next();
+        }
+        return new ReducedRandomSamplingFacetsCollector(sampleSize, seed, collectors);
+      }
+    };
+  }
+
+  private static class ReducedRandomSamplingFacetsCollector extends RandomSamplingFacetsCollector {
+    ReducedRandomSamplingFacetsCollector(
+        int sampleSize, long seed, Collection<RandomSamplingFacetsCollector> facetsCollectors) {
+      super(sampleSize, seed);
+      facetsCollectors.forEach( // append every input collector's original matching docs to ours
+          facetsCollector ->
+              getOriginalMatchingDocs().addAll(facetsCollector.getOriginalMatchingDocs()));
+    }
+  }
 }
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestRandomSamplingFacetsCollector.java b/lucene/facet/src/test/org/apache/lucene/facet/TestRandomSamplingFacetsCollector.java
index 26d80fc784f..df51e0afc38 100644
--- a/lucene/facet/src/test/org/apache/lucene/facet/TestRandomSamplingFacetsCollector.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/TestRandomSamplingFacetsCollector.java
@@ -27,9 +27,9 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
 import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
 import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.search.CollectorManager;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.MultiCollector;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.tests.index.RandomIndexWriter;
@@ -74,11 +74,11 @@ public class TestRandomSamplingFacetsCollector extends FacetTestCase {
     IOUtils.close(writer, taxoWriter);
 
     // Test empty results
-    RandomSamplingFacetsCollector collectRandomZeroResults =
-        new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong());
-
+    CollectorManager<RandomSamplingFacetsCollector, RandomSamplingFacetsCollector> fcm =
+        RandomSamplingFacetsCollector.createManager(numDocs / 10, random.nextLong());
     // There should be no divisions by zero
-    searcher.search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults);
+    RandomSamplingFacetsCollector collectRandomZeroResults =
+        searcher.search(new TermQuery(new Term("EvenOdd", "NeverMatches")), fcm);
 
     // There should be no divisions by zero and no null result
     assertNotNull(collectRandomZeroResults.getMatchingDocs());
@@ -93,13 +93,9 @@ public class TestRandomSamplingFacetsCollector extends FacetTestCase {
     // Use a query to select half of the documents.
     TermQuery query = new TermQuery(new Term("EvenOdd", "even"));
 
-    RandomSamplingFacetsCollector random10Percent =
-        new RandomSamplingFacetsCollector(
-            numDocs / 10, random.nextLong()); // 10% of total docs, 20% of the hits
-
-    FacetsCollector fc = new FacetsCollector();
-
-    searcher.search(query, MultiCollector.wrap(fc, random10Percent));
+    // 10% of total docs, 20% of the hits
+    fcm = RandomSamplingFacetsCollector.createManager(numDocs / 10, random.nextLong());
+    RandomSamplingFacetsCollector random10Percent = searcher.search(query, fcm);
 
     final List<MatchingDocs> matchingDocs = random10Percent.getMatchingDocs();