You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2022/04/05 06:55:51 UTC
[lucene] 02/02: LUCENE-10484: Add support for concurrent facets random sampling (#765)
This is an automated email from the ASF dual-hosted git repository.
jpountz pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
commit b0ecba4fb243d24d083325c7482f9725bccc57db
Author: Luca Cavanna <ja...@users.noreply.github.com>
AuthorDate: Tue Apr 5 08:51:57 2022 +0200
LUCENE-10484: Add support for concurrent facets random sampling (#765)
This commit adds a new static createManager method to RandomSamplingFacetsCollector that allows users to perform random sampling concurrently. The returned collector manager is very similar to the existing FacetsCollectorManager, but its reduce step returns a specialized RandomSamplingFacetsCollector.
This relates to [LUCENE-10002](https://issues.apache.org/jira/browse/LUCENE-10002). It allows users to use a collector manager instead of a collector when doing random sampling, in an effort to reduce usages of IndexSearcher#search(Query, Collector).
---
lucene/CHANGES.txt | 3 ++
.../facet/RandomSamplingFacetsCollector.java | 38 ++++++++++++++++++++++
.../facet/TestRandomSamplingFacetsCollector.java | 20 +++++-------
3 files changed, 49 insertions(+), 12 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index a36cab8d8ef..c03c7d8937a 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -26,6 +26,9 @@ Improvements
* LUCENE-10494: Implement method to bulk add all collection elements to a PriorityQueue.
(Bauyrzhan Sakhariyev)
+* LUCENE-10484: Add support for concurrent random sampling by calling
+ RandomSamplingFacetsCollector#createManager. (Luca Cavanna)
+
Optimizations
---------------------
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/RandomSamplingFacetsCollector.java b/lucene/facet/src/java/org/apache/lucene/facet/RandomSamplingFacetsCollector.java
index 92a0c3133b1..de7c0e27323 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/RandomSamplingFacetsCollector.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/RandomSamplingFacetsCollector.java
@@ -18,10 +18,12 @@ package org.apache.lucene.facet;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Collection;
import java.util.List;
import org.apache.lucene.facet.FacetsConfig.DimConfig;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
+import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.BitDocIdSet;
@@ -255,4 +257,40 @@ public class RandomSamplingFacetsCollector extends FacetsCollector {
public double getSamplingRate() {
return samplingRate;
}
+
+ /**
+ * Creates a {@link CollectorManager} for concurrent random sampling; every collector it creates
+ * is a {@link RandomSamplingFacetsCollector} built from the given sampleSize and seed.
+ */
+ public static CollectorManager<RandomSamplingFacetsCollector, RandomSamplingFacetsCollector>
+ createManager(int sampleSize, long seed) {
+ return new CollectorManager<>() {
+ @Override
+ public RandomSamplingFacetsCollector newCollector() {
+ return new RandomSamplingFacetsCollector(sampleSize, seed);
+ }
+
+ @Override
+ public RandomSamplingFacetsCollector reduce(
+ Collection<RandomSamplingFacetsCollector> collectors) {
+ if (collectors == null || collectors.size() == 0) { // nothing to merge: hand back a new collector
+ return new RandomSamplingFacetsCollector(sampleSize, seed);
+ }
+ if (collectors.size() == 1) { // a single collector is returned as-is, no merging needed
+ return collectors.iterator().next();
+ }
+ return new ReducedRandomSamplingFacetsCollector(sampleSize, seed, collectors); // 2+ collectors: merge
+ }
+ };
+ }
+
+ private static class ReducedRandomSamplingFacetsCollector extends RandomSamplingFacetsCollector { // merged view of several collectors
+ ReducedRandomSamplingFacetsCollector(
+ int sampleSize, long seed, Collection<RandomSamplingFacetsCollector> facetsCollectors) {
+ super(sampleSize, seed);
+ facetsCollectors.forEach( // add every collector's original matching docs to this instance's
+ facetsCollector ->
+ getOriginalMatchingDocs().addAll(facetsCollector.getOriginalMatchingDocs()));
+ }
+ }
}
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestRandomSamplingFacetsCollector.java b/lucene/facet/src/test/org/apache/lucene/facet/TestRandomSamplingFacetsCollector.java
index 26d80fc784f..df51e0afc38 100644
--- a/lucene/facet/src/test/org/apache/lucene/facet/TestRandomSamplingFacetsCollector.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/TestRandomSamplingFacetsCollector.java
@@ -27,9 +27,9 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.index.Term;
+import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
@@ -74,11 +74,11 @@ public class TestRandomSamplingFacetsCollector extends FacetTestCase {
IOUtils.close(writer, taxoWriter);
// Test empty results
- RandomSamplingFacetsCollector collectRandomZeroResults =
- new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong());
-
+ CollectorManager<RandomSamplingFacetsCollector, RandomSamplingFacetsCollector> fcm =
+ RandomSamplingFacetsCollector.createManager(numDocs / 10, random.nextLong());
// There should be no divisions by zero
- searcher.search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults);
+ RandomSamplingFacetsCollector collectRandomZeroResults =
+ searcher.search(new TermQuery(new Term("EvenOdd", "NeverMatches")), fcm);
// There should be no divisions by zero and no null result
assertNotNull(collectRandomZeroResults.getMatchingDocs());
@@ -93,13 +93,9 @@ public class TestRandomSamplingFacetsCollector extends FacetTestCase {
// Use a query to select half of the documents.
TermQuery query = new TermQuery(new Term("EvenOdd", "even"));
- RandomSamplingFacetsCollector random10Percent =
- new RandomSamplingFacetsCollector(
- numDocs / 10, random.nextLong()); // 10% of total docs, 20% of the hits
-
- FacetsCollector fc = new FacetsCollector();
-
- searcher.search(query, MultiCollector.wrap(fc, random10Percent));
+ // 10% of total docs, 20% of the hits
+ fcm = RandomSamplingFacetsCollector.createManager(numDocs / 10, random.nextLong());
+ RandomSamplingFacetsCollector random10Percent = searcher.search(query, fcm);
final List<MatchingDocs> matchingDocs = random10Percent.getMatchingDocs();