You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ho...@apache.org on 2020/07/10 01:47:13 UTC
[lucene-solr] branch master updated: SOLR-13132: JSON Facet perf
improvements to support "sweeping" collection of "relatedness()"
This is an automated email from the ASF dual-hosted git repository.
hossman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 40e2122 SOLR-13132: JSON Facet perf improvements to support "sweeping" collection of "relatedness()"
40e2122 is described below
commit 40e2122b5a5b89f446e51692ef0d72e48c7b71e5
Author: Michael Gibney <mi...@michaelgibney.net>
AuthorDate: Thu Jul 9 18:42:37 2020 -0700
SOLR-13132: JSON Facet perf improvements to support "sweeping" collection of "relatedness()"
This adds a lot of "under the covers" improvements to how JSON Faceting FacetField processors work, to enable
"sweeping" support when the SlotAcc used for sorting support it (currently just "relatedness()")
This is a squash commit of all changes on https://github.com/magibney/lucene-solr/tree/SOLR-13132
Up to and including ca7a8e0b39840d00af9022c048346a7d84bf280d.
Co-authored-by: Chris Hostetter <ho...@apache.org>
Co-authored-by: Michael Gibney <mi...@michaelgibney.net>
---
solr/CHANGES.txt | 3 +
.../solr/search/facet/FacetFieldProcessor.java | 79 +++++-
.../search/facet/FacetFieldProcessorByArray.java | 34 +++
.../search/facet/FacetFieldProcessorByArrayDV.java | 177 +++++++++-----
.../apache/solr/search/facet/FacetProcessor.java | 1 -
.../solr/search/facet/ReadOnlyCountSlotAcc.java | 31 +++
.../apache/solr/search/facet/RelatednessAgg.java | 174 ++++++++++++-
.../apache/solr/search/facet/SingletonDISI.java | 48 ++++
.../solr/search/facet/SingletonDocIterator.java | 52 ++++
.../java/org/apache/solr/search/facet/SlotAcc.java | 218 ++++++++++++++++-
.../apache/solr/search/facet/SweepCountAware.java | 187 ++++++++++++++
.../org/apache/solr/search/facet/SweepDISI.java | 85 +++++++
.../apache/solr/search/facet/SweepDocIterator.java | 87 +++++++
.../apache/solr/search/facet/UnInvertedField.java | 71 ++++--
.../org/apache/solr/search/facet/UnionDISI.java | 100 ++++++++
.../apache/solr/search/facet/UnionDocIterator.java | 107 ++++++++
.../search/facet/TestCloudJSONFacetSKGEquiv.java | 268 +++++++++++++++++++--
.../apache/solr/search/facet/TestJsonFacets.java | 48 ++++
solr/solr-ref-guide/src/json-facet-api.adoc | 4 +
19 files changed, 1662 insertions(+), 112 deletions(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 68880e9..6a0d48d 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -129,6 +129,9 @@ Optimizations
* SOLR-14610: ReflectMapWriter to use MethodHandle instead of old reflection (noble)
+* SOLR-13132: JSON Facet perf improvements to support "sweeping" collection of "relatedness()"
+ (hossman, Michael Gibney)
+
Bug Fixes
---------------------
(No changes)
diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java
index e3af5b3..c7b31e1 100644
--- a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java
+++ b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java
@@ -33,6 +33,7 @@ import java.util.function.IntFunction;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.Query;
+import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.PriorityQueue;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.SimpleOrderedMap;
@@ -40,6 +41,8 @@ import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.facet.SlotAcc.SlotContext;
+import org.apache.solr.search.facet.SlotAcc.SweepableSlotAcc;
+import org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc;
import static org.apache.solr.search.facet.FacetContext.SKIP_FACET;
@@ -116,7 +119,6 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
// allow a custom count acc to be used
if (countAcc == null) {
countAcc = new SlotAcc.CountSlotArrAcc(fcontext, slotCount);
- countAcc.key = "count";
}
if (accs != null) {
@@ -509,12 +511,12 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
/** Helper method used solely when looping over buckets to be returned in findTopSlots */
private void fillBucketFromSlot(SimpleOrderedMap<Object> target, Slot slot,
SlotAcc resortAcc) throws IOException {
- final long count = countAcc.getCount(slot.slot);
- target.add("count", count);
- if (count <= 0 && !freq.processEmpty) return;
+ final int slotOrd = slot.slot;
+ countAcc.setValues(target, slotOrd);
+ if (countAcc.getCount(slotOrd) <= 0 && !freq.processEmpty) return;
- if (collectAcc != null && slot.slot >= 0) {
- collectAcc.setValues(target, slot.slot);
+ if (slotOrd >= 0 && collectAcc != null) {
+ collectAcc.setValues(target, slotOrd);
}
if (otherAccs == null && freq.subFacets.isEmpty()) return;
@@ -689,7 +691,7 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
}
}
- static class MultiAcc extends SlotAcc {
+ static class MultiAcc extends SlotAcc implements SweepableSlotAcc<SlotAcc> {
final SlotAcc[] subAccs;
MultiAcc(FacetContext fcontext, SlotAcc[] subAccs) {
@@ -741,6 +743,65 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
acc.setValues(bucket, slotNum);
}
}
+
+ @Override
+ public SlotAcc registerSweepingAccs(SweepingCountSlotAcc baseSweepingAcc) {
+ final FacetFieldProcessor p = (FacetFieldProcessor) fcontext.processor;
+ int j = 0;
+ for (int i = 0; i < subAccs.length; i++) {
+ final SlotAcc acc = subAccs[i];
+ if (acc instanceof SweepableSlotAcc) {
+ SlotAcc replacement = ((SweepableSlotAcc<?>)acc).registerSweepingAccs(baseSweepingAcc);
+ if (replacement == null) {
+ // drop acc, do not increment j
+ continue;
+ } else if (replacement != acc || j < i) {
+ subAccs[j] = replacement;
+ }
+ } else if (j < i) {
+ subAccs[j] = acc;
+ }
+ j++;
+ }
+ switch (j) {
+ case 0:
+ return null;
+ case 1:
+ return subAccs[0];
+ default:
+ if (j == subAccs.length) {
+ return this;
+ } else {
+ // must resize final field subAccs
+ return new MultiAcc(fcontext, ArrayUtil.copyOfSubArray(subAccs, 0, j));
+ }
+ }
+ }
+ }
+
+ /**
+ * Helper method that subclasses can use to indicate they wish to use sweeping.
+ * If {@link #countAcc} and {@link #collectAcc} support sweeping, then this method will:
+ * <ul>
+ * <li>replace {@link #collectAcc} with its sweeping equivalent</li>
+ * <li>update {@link #allBucketsAcc}'s reference to {@link #collectAcc} (if it exists)</li>
+ * </ul>
+ *
+ * @return true if the above actions were taken
+ * @see SweepableSlotAcc
+ * @see SweepingCountSlotAcc
+ */
+ protected boolean registerSweepingAccIfSupportedByCollectAcc() {
+ if (countAcc instanceof SweepingCountSlotAcc && collectAcc instanceof SweepableSlotAcc) {
+ final SweepingCountSlotAcc sweepingCountAcc = (SweepingCountSlotAcc)countAcc;
+ collectAcc = ((SweepableSlotAcc<?>)collectAcc).registerSweepingAccs(sweepingCountAcc);
+ if (allBucketsAcc != null) {
+ allBucketsAcc.collectAcc = collectAcc;
+ allBucketsAcc.sweepingCountAcc = sweepingCountAcc;
+ }
+ return true;
+ }
+ return false;
}
private static final SlotContext ALL_BUCKETS_SLOT_CONTEXT = new SlotContext(null) {
@@ -766,6 +827,7 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
int collectAccSlot;
int otherAccsSlot;
long count;
+ SweepingCountSlotAcc sweepingCountAcc; // null unless/until sweeping is initialized
SpecialSlotAcc(FacetContext fcontext, SlotAcc collectAcc, int collectAccSlot, SlotAcc[] otherAccs, int otherAccsSlot) {
super(fcontext);
@@ -822,6 +884,9 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
@Override
public void setValues(SimpleOrderedMap<Object> bucket, int slotNum) throws IOException {
+ if (sweepingCountAcc != null) {
+ sweepingCountAcc.setSweepValues(bucket, collectAccSlot);
+ }
if (collectAcc != null) {
collectAcc.setValues(bucket, collectAccSlot);
}
diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArray.java b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArray.java
index dff72b4..18cf46d 100644
--- a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArray.java
+++ b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArray.java
@@ -27,6 +27,7 @@ import org.apache.lucene.search.Query;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.facet.SlotAcc.SlotContext;
+import org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc;
import static org.apache.solr.search.facet.FacetContext.SKIP_FACET;
@@ -34,6 +35,9 @@ import static org.apache.solr.search.facet.FacetContext.SKIP_FACET;
* Base class for DV/UIF accumulating counts into an array by ordinal. It's
* for {@link org.apache.lucene.index.SortedDocValues} and {@link org.apache.lucene.index.SortedSetDocValues} only.
* It can handle terms (strings), not numbers directly but those encoded as terms, and is multi-valued capable.
+ * By default, this class assumes subclasses can support sweeping collection unless subclasses initialize <code>countAcc</code> directly in their constructors.
+ *
+ * @see SweepingCountSlotAcc
*/
abstract class FacetFieldProcessorByArray extends FacetFieldProcessor {
BytesRefBuilder prefixRef;
@@ -56,6 +60,34 @@ abstract class FacetFieldProcessorByArray extends FacetFieldProcessor {
/** this BytesRef may be shared across calls and should be deep-cloned if necessary */
abstract protected BytesRef lookupOrd(int ord) throws IOException;
+
+ /**
+ * {@inheritDoc}
+ *
+ * This impl first initializes <code>countAcc</code> as a {@link SweepingCountSlotAcc} if null.
+ */
+ @Override
+ protected void createAccs(long docCount, int slotCount) throws IOException {
+ if (countAcc == null) {
+ countAcc = new SweepingCountSlotAcc(slotCount, this);
+ }
+ super.createAccs(docCount, slotCount);
+ }
+
+ /**
+ * {@inheritDoc}
+ *
+ * This impl first initializes <code>countAcc</code> as a {@link SweepingCountSlotAcc} if null.
+ */
+ @Override
+ void createCollectAcc(int numDocs, int numSlots) throws IOException {
+ if (countAcc == null) {
+ countAcc = new SweepingCountSlotAcc(numSlots, this);
+ }
+ super.createCollectAcc(numDocs, numSlots);
+ registerSweepingAccIfSupportedByCollectAcc();
+ }
+
@Override
public void process() throws IOException {
super.process();
@@ -87,8 +119,10 @@ abstract class FacetFieldProcessorByArray extends FacetFieldProcessor {
if (freq.allBuckets) {
// count is irrelevant, but hardcoded in collect(...), so intercept/mask normal counts.
// Set here to prevent createAccs(...) from creating a 1-slot countAcc that will fail with AIOOBE
+ // NOTE: because collectAcc will be null, it is fine/irrelevant to set a countAcc that doesn't support sweeping
countAcc = SlotAcc.DEV_NULL_SLOT_ACC;
createAccs(nDocs, 1);
+ assert collectAcc == null;
otherAccs = accs; // accs is created above and set on allBucketsAcc; but during collection, setNextReader is called on otherAccs.
allBucketsAcc = new SpecialSlotAcc(fcontext, null, -1, accs, 0);
collectDocs();
diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArrayDV.java b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArrayDV.java
index dfd1bc1..a98ffd0 100644
--- a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArrayDV.java
+++ b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArrayDV.java
@@ -26,14 +26,17 @@ import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
-import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LongValues;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.solr.common.SolrException;
import org.apache.solr.schema.SchemaField;
-import org.apache.solr.search.Filter;
+import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
+import org.apache.solr.search.facet.SlotAcc.SweepCountAccStruct;
+import org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc;
+import org.apache.solr.search.facet.SweepCountAware.SegCountGlobal;
+import org.apache.solr.search.facet.SweepCountAware.SegCountPerSeg;
import org.apache.solr.uninverting.FieldCacheImpl;
/**
@@ -94,6 +97,10 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
return;
}
+ final SweepCountAccStruct base = SweepingCountSlotAcc.baseStructOf(this);
+ final List<SweepCountAccStruct> others = SweepingCountSlotAcc.otherStructsOf(this);
+ assert null != base;
+
// TODO: refactor some of this logic into a base class
boolean countOnly = collectAcc==null && allBucketsAcc==null;
boolean fullRange = startTermIndex == 0 && endTermIndex == si.getValueCount();
@@ -118,16 +125,21 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
if (freq.perSeg != null) accumSeg = canDoPerSeg && freq.perSeg; // internal - override perSeg heuristic
+ final int maxSize = others.size() + 1; // others + base
final List<LeafReaderContext> leaves = fcontext.searcher.getIndexReader().leaves();
- Filter filter = fcontext.base.getTopFilter();
+ final DocIdSetIterator[] subIterators = new DocIdSetIterator[maxSize];
+ final CountSlotAcc[] activeCountAccs = new CountSlotAcc[maxSize];
for (int subIdx = 0; subIdx < leaves.size(); subIdx++) {
LeafReaderContext subCtx = leaves.get(subIdx);
setNextReaderFirstPhase(subCtx);
- DocIdSet dis = filter.getDocIdSet(subCtx, null); // solr docsets already exclude any deleted docs
- DocIdSetIterator disi = dis.iterator();
+ final SweepDISI disi = SweepDISI.newInstance(base, others, subIterators, activeCountAccs, subCtx);
+ if (disi == null) {
+ continue;
+ }
+ LongValues toGlobal = ordinalMap == null ? null : ordinalMap.getGlobalOrds(subIdx);
SortedDocValues singleDv = null;
SortedSetDocValues multiDv = null;
@@ -135,7 +147,13 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
// TODO: get sub from multi?
multiDv = subCtx.reader().getSortedSetDocValues(sf.getName());
if (multiDv == null) {
- multiDv = DocValues.emptySortedSet();
+ if (countOnly) {
+ continue;
+ } else {
+ multiDv = DocValues.emptySortedSet();
+ }
+ } else if (countOnly && multiDv.getValueCount() < 1){
+ continue;
}
// some codecs may optimize SortedSet storage for single-valued fields
// this will be null if this is not a wrapped single valued docvalues.
@@ -145,12 +163,16 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
} else {
singleDv = subCtx.reader().getSortedDocValues(sf.getName());
if (singleDv == null) {
- singleDv = DocValues.emptySorted();
+ if (countOnly) {
+ continue;
+ } else {
+ singleDv = DocValues.emptySorted();
+ }
+ } else if (countOnly && singleDv.getValueCount() < 1) {
+ continue;
}
}
- LongValues toGlobal = ordinalMap == null ? null : ordinalMap.getGlobalOrds(subIdx);
-
if (singleDv != null) {
if (accumSeg) {
collectPerSeg(singleDv, disi, toGlobal);
@@ -174,7 +196,7 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
}
}
- reuse = null; // better GC
+ Arrays.fill(reuse, null); // better GC
}
@Override
@@ -182,9 +204,9 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
return si.lookupOrd(ord);
}
- private void collectPerSeg(SortedDocValues singleDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
- int segMax = singleDv.getValueCount() + 1;
- final int[] counts = getCountArr( segMax );
+ private void collectPerSeg(SortedDocValues singleDv, SweepDISI disi, LongValues toGlobal) throws IOException {
+ int segMax = singleDv.getValueCount();
+ final SegCountPerSeg segCounter = getSegCountPerSeg(disi, segMax);
/** alternate trial implementations
// ord
@@ -202,73 +224,110 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
if (singleDv instanceof FieldCacheImpl.SortedDocValuesImpl.Iter) {
FieldCacheImpl.SortedDocValuesImpl.Iter fc = (FieldCacheImpl.SortedDocValuesImpl.Iter) singleDv;
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
- counts[fc.getOrd(doc) + 1]++;
+ final int segOrd = fc.getOrd(doc);
+ if (segOrd >= 0) {
+ final int maxIdx = disi.registerCounts(segCounter);
+ segCounter.incrementCount(segOrd, 1, maxIdx);
+ }
}
} else {
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (singleDv.advanceExact(doc)) {
- counts[singleDv.ordValue() + 1]++;
+ final int segOrd = singleDv.ordValue();
+ if (segOrd >= 0) {
+ final int maxIdx = disi.registerCounts(segCounter);
+ segCounter.incrementCount(segOrd, 1, maxIdx);
+ }
}
}
}
// convert segment-local counts to global counts
- for (int i=1; i<segMax; i++) {
- int segCount = counts[i];
- if (segCount > 0) {
- int slot = toGlobal == null ? (i - 1) : (int) toGlobal.get(i - 1);
- countAcc.incrementCount(slot, segCount);
- }
- }
+ segCounter.register(disi.countAccs, toGlobal, segMax - 1);
+ }
+
+ private SegCountPerSeg getSegCountPerSeg(SweepDISI disi, int segMax) {
+ final int size = disi.size;
+ return new SegCountPerSeg(getSegmentCountArrays(segMax, size), getBoolArr(segMax), segMax, size);
+ }
+
+ private SegCountGlobal getSegCountGlobal(SweepDISI disi, SortedDocValues dv) {
+ return new SegCountGlobal(disi.countAccs);
}
- private void collectPerSeg(SortedSetDocValues multiDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
+ private SegCountGlobal getSegCountGlobal(SweepDISI disi, SortedSetDocValues dv) {
+ return new SegCountGlobal(disi.countAccs);
+ }
+
+ private void collectPerSeg(SortedSetDocValues multiDv, SweepDISI disi, LongValues toGlobal) throws IOException {
int segMax = (int)multiDv.getValueCount();
- final int[] counts = getCountArr( segMax );
+ final SegCountPerSeg segCounter = getSegCountPerSeg(disi, segMax);
int doc;
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (multiDv.advanceExact(doc)) {
- for(;;) {
+ final int maxIdx = disi.registerCounts(segCounter);
+ for (;;) {
int segOrd = (int)multiDv.nextOrd();
if (segOrd < 0) break;
- counts[segOrd]++;
+ segCounter.incrementCount(segOrd, 1, maxIdx);
}
}
}
- for (int i=0; i<segMax; i++) {
- int segCount = counts[i];
- if (segCount > 0) {
- int slot = toGlobal == null ? (i) : (int) toGlobal.get(i);
- countAcc.incrementCount(slot, segCount);
- }
+ segCounter.register(disi.countAccs, toGlobal, segMax - 1);
+ }
+
+ private boolean[] reuseBool;
+ private boolean[] getBoolArr(int maxNeeded) {
+ if (reuseBool == null) {
+ // make the count array large enough for any segment
+ // FUTURE: (optionally) directly use the array of the CountAcc for an optimized index..
+ reuseBool = new boolean[(int) si.getValueCount() + 1];
+ } else {
+ Arrays.fill(reuseBool, 0, maxNeeded, false);
}
+ return reuseBool;
}
- private int[] reuse;
- private int[] getCountArr(int maxNeeded) {
- if (reuse == null) {
+ private int[][] reuse = new int[12][];
+ private int[] getCountArr(int maxNeeded, int idx) {
+ if (idx >= reuse.length) {
+ reuse = Arrays.copyOf(reuse, idx + 1);
+ }
+ if (reuse[idx] == null) {
// make the count array large enough for any segment
// FUTURE: (optionally) directly use the array of the CountAcc for an optimized index..
- reuse = new int[(int) si.getValueCount() + 1];
+ reuse[idx] = new int[(int) si.getValueCount() + 1];
} else {
- Arrays.fill(reuse, 0, maxNeeded, 0);
+ Arrays.fill(reuse[idx], 0, maxNeeded, 0);
}
- return reuse;
+ return reuse[idx];
+ }
+
+ private int[][] getSegmentCountArrays(int segMax, int size) {
+ int[][] ret = new int[size][];
+ int i = size - 1;
+ do {
+ ret[i] = getCountArr(segMax, i);
+ } while (i-- > 0);
+ return ret;
}
- private void collectDocs(SortedDocValues singleDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
+ private void collectDocs(SortedDocValues singleDv, SweepDISI disi, LongValues toGlobal) throws IOException {
int doc;
+ final SegCountGlobal segCounter = getSegCountGlobal(disi, singleDv);
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (singleDv.advanceExact(doc)) {
+ final int maxIdx = disi.registerCounts(segCounter);
int segOrd = singleDv.ordValue();
- collect(doc, segOrd, toGlobal);
+ collect(doc, segOrd, toGlobal, segCounter, maxIdx, disi.collectBase());
}
}
}
- private void collectCounts(SortedDocValues singleDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
+ private void collectCounts(SortedDocValues singleDv, SweepDISI disi, LongValues toGlobal) throws IOException {
+ final SegCountGlobal segCounter = getSegCountGlobal(disi, singleDv);
int doc;
if (singleDv instanceof FieldCacheImpl.SortedDocValuesImpl.Iter) {
@@ -277,7 +336,8 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
int segOrd = fc.getOrd(doc);
if (segOrd < 0) continue;
int ord = (int)toGlobal.get(segOrd);
- countAcc.incrementCount(ord, 1);
+ int maxIdx = disi.registerCounts(segCounter);
+ segCounter.incrementCount(ord, 1, maxIdx);
}
} else {
@@ -286,53 +346,60 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
if (singleDv.advanceExact(doc)) {
int segOrd = singleDv.ordValue();
int ord = (int) toGlobal.get(segOrd);
- countAcc.incrementCount(ord, 1);
+ int maxIdx = disi.registerCounts(segCounter);
+ segCounter.incrementCount(ord, 1, maxIdx);
}
}
-
}
}
- private void collectDocs(SortedSetDocValues multiDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
+ private void collectDocs(SortedSetDocValues multiDv, SweepDISI disi, LongValues toGlobal) throws IOException {
+ final SegCountGlobal segCounter = getSegCountGlobal(disi, multiDv);
int doc;
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (multiDv.advanceExact(doc)) {
+ final int maxIdx = disi.registerCounts(segCounter);
+ final boolean collectBase = disi.collectBase();
for(;;) {
int segOrd = (int)multiDv.nextOrd();
if (segOrd < 0) break;
- collect(doc, segOrd, toGlobal);
+ collect(doc, segOrd, toGlobal, segCounter, maxIdx, collectBase);
}
}
}
}
- private void collectCounts(SortedSetDocValues multiDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
+ private void collectCounts(SortedSetDocValues multiDv, SweepDISI disi, LongValues toGlobal) throws IOException {
+ final SegCountGlobal segCounter = getSegCountGlobal(disi, multiDv);
int doc;
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (multiDv.advanceExact(doc)) {
+ final int maxIdx = disi.registerCounts(segCounter);
for(;;) {
int segOrd = (int)multiDv.nextOrd();
if (segOrd < 0) break;
int ord = (int)toGlobal.get(segOrd);
- countAcc.incrementCount(ord, 1);
+ segCounter.incrementCount(ord, 1, maxIdx);
}
}
}
}
- private void collect(int doc, int segOrd, LongValues toGlobal) throws IOException {
+ private void collect(int doc, int segOrd, LongValues toGlobal, SegCountGlobal segCounter, int maxIdx, boolean collectBase) throws IOException {
int ord = (toGlobal != null && segOrd >= 0) ? (int)toGlobal.get(segOrd) : segOrd;
int arrIdx = ord - startTermIndex;
// This code handles faceting prefixes, which narrows the range of ords we want to collect.
// It’s not an error for an ord to fall outside this range… we simply want to skip it.
if (arrIdx >= 0 && arrIdx < nTerms) {
- countAcc.incrementCount(arrIdx, 1);
- if (collectAcc != null) {
- collectAcc.collect(doc, arrIdx, slotContext);
- }
- if (allBucketsAcc != null) {
- allBucketsAcc.collect(doc, arrIdx, slotContext);
+ segCounter.incrementCount(arrIdx, 1, maxIdx);
+ if (collectBase) {
+ if (collectAcc != null) {
+ collectAcc.collect(doc, arrIdx, slotContext);
+ }
+ if (allBucketsAcc != null) {
+ allBucketsAcc.collect(doc, arrIdx, slotContext);
+ }
}
}
}
diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetProcessor.java b/solr/core/src/java/org/apache/solr/search/facet/FacetProcessor.java
index c3d84eb..9170e24 100644
--- a/solr/core/src/java/org/apache/solr/search/facet/FacetProcessor.java
+++ b/solr/core/src/java/org/apache/solr/search/facet/FacetProcessor.java
@@ -310,7 +310,6 @@ public abstract class FacetProcessor<FacetRequestT extends FacetRequest> {
// allow a custom count acc to be used
if (countAcc == null) {
countAcc = new SlotAcc.CountSlotArrAcc(fcontext, slotCount);
- countAcc.key = "count";
}
for (Map.Entry<String,AggValueSource> entry : freq.getFacetStats().entrySet()) {
diff --git a/solr/core/src/java/org/apache/solr/search/facet/ReadOnlyCountSlotAcc.java b/solr/core/src/java/org/apache/solr/search/facet/ReadOnlyCountSlotAcc.java
new file mode 100644
index 0000000..8569324
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/search/facet/ReadOnlyCountSlotAcc.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.search.facet;
+
+import java.io.IOException;
+
+/**
+ * To be implemented by CountSlotAccs that wish to expose a read-only interface
+ */
+interface ReadOnlyCountSlotAcc {
+
+ public long getCount(int slot);
+
+ public int compare(int slotA, int slotB);
+
+ public Object getValue(int slotNum) throws IOException;
+}
diff --git a/solr/core/src/java/org/apache/solr/search/facet/RelatednessAgg.java b/solr/core/src/java/org/apache/solr/search/facet/RelatednessAgg.java
index 10146db..df4d11f 100644
--- a/solr/core/src/java/org/apache/solr/search/facet/RelatednessAgg.java
+++ b/solr/core/src/java/org/apache/solr/search/facet/RelatednessAgg.java
@@ -37,6 +37,7 @@ import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.QParser;
+import org.apache.solr.search.facet.SlotAcc.SweepableSlotAcc;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -56,6 +57,7 @@ public class RelatednessAgg extends AggValueSource {
private static final String RELATEDNESS = "relatedness";
private static final String FG_POP = "foreground_popularity";
private static final String BG_POP = "background_popularity";
+ public static final String SWEEP_COLLECTION = "sweep_collection";
// needed for distrib calculation
private static final String FG_SIZE = "foreground_size";
@@ -66,8 +68,11 @@ public class RelatednessAgg extends AggValueSource {
final protected Query fgQ;
final protected Query bgQ;
protected double min_pop = 0.0D;
+ private Boolean useSweep;
public static final String NAME = RELATEDNESS;
+ private static final boolean DEFAULT_SWEEP_COLLECTION = true;
+
public RelatednessAgg(Query fgQ, Query bgQ) {
super(NAME);
// NOTE: ideally we don't want to assume any defaults *yet* if fgQ/bgQ are null
@@ -87,7 +92,10 @@ public class RelatednessAgg extends AggValueSource {
public void setOpts(QParser parser) {
final boolean isShard = parser.getReq().getParams().getBool(ShardParams.IS_SHARD, false);
SolrParams opts = parser.getLocalParams();
- if (null != opts) {
+ if (null == opts) {
+ this.useSweep = DEFAULT_SWEEP_COLLECTION;
+ } else {
+ this.useSweep = opts.getBool(SWEEP_COLLECTION, DEFAULT_SWEEP_COLLECTION);
if (!isShard) { // ignore min_pop if this is a shard request
this.min_pop = opts.getDouble("min_popularity", 0.0D);
}
@@ -97,7 +105,7 @@ public class RelatednessAgg extends AggValueSource {
@Override
public String description() {
// TODO: need better output processing when we start supporting null fgQ/bgQ in constructor
- return name +"(fgQ=" + fgQ + ",bgQ=" + bgQ + ",min_pop="+min_pop+")";
+ return name +"(fgQ=" + fgQ + ",bgQ=" + bgQ + ",min_pop="+min_pop + ",useSweep="+useSweep+")";
}
@Override
@@ -163,9 +171,127 @@ public class RelatednessAgg extends AggValueSource {
return new Merger(this);
}
+ private static final class SweepSKGSlotAcc extends SlotAcc {
+
+ private final int minCount; // pre-calculate for a given min_popularity
+ private final long fgSize;
+ private final long bgSize;
+ private final ReadOnlyCountSlotAcc fgCount;
+ private final ReadOnlyCountSlotAcc bgCount;
+ private double[] relatedness;
+
+ private static final int NO_ALL_BUCKETS = -2;
+ private static final int ALL_BUCKETS_UNINITIALIZED = -1;
+
+ // we can't get the allBuckets info from the slotContext in collect(), b/c the whole point of
+ // sweep collection is that the "collect" methods aren't called.
+ // So this is the compromise: note in construction either that we're using a processor w/NO_ALL_BUCKETS
+ // or that we don't know the bucket yet (ALL_BUCKETS_UNINITIALIZED) and fill it in in getValues
+ // where we can check against the processor
+ private int allBucketsSlot;
+
+ public SweepSKGSlotAcc(double minPopularity, FacetContext fcontext, int numSlots, long fgSize, long bgSize, ReadOnlyCountSlotAcc fgCount, ReadOnlyCountSlotAcc bgCount) {
+ super(fcontext);
+ this.minCount = (int) Math.ceil(minPopularity * bgSize);
+ this.fgSize = fgSize;
+ this.bgSize = bgSize;
+ this.fgCount = fgCount;
+ this.bgCount = bgCount;
+ relatedness = new double[numSlots];
+ Arrays.fill(relatedness, 0, numSlots, Double.NaN);
+
+ // any processor that can (currently) result in the use of SweepSKGSlotAcc *should* be a
+ // FacetFieldProcessor -- but don't assume that will always be true...
+ this.allBucketsSlot = NO_ALL_BUCKETS;
+ if (fcontext.processor instanceof FacetFieldProcessor
+ // NOTE: if this instanceof/cast changes, getValues needs updated as well
+ && ((FacetFieldProcessor)fcontext.processor).freq.allBuckets) {
+ this.allBucketsSlot = ALL_BUCKETS_UNINITIALIZED;
+ }
+ }
+
+ @Override
+ public void collect(int perSegDocId, int slot, IntFunction<SlotContext> slotContext) throws IOException {
+ throw new UnsupportedOperationException("collect() not supported, this SlotAcc impl only usable for sweeping");
+ }
+
+ @Override
+ public int collect(DocSet docs, int slot, IntFunction<SlotContext> slotContext) throws IOException {
+ throw new UnsupportedOperationException("collect() not supported, this SlotAcc impl only usable for sweeping");
+ }
+
+ private double getRelatedness(int slot) {
+ final double cachedRelatedness = relatedness[slot];
+ if (Double.isNaN(cachedRelatedness)) {
+ final long fg_count = fgCount.getCount(slot);
+ final long bg_count = bgCount.getCount(slot);
+ if (minCount > 0) {
+ // if min_pop is configured, and either (fg|bg) popularity is lower than that value
+ // then "this.relatedness=-Infinity" so it sorts below any "valid" relatedness scores
+ if (fg_count < minCount || bg_count < minCount) {
+ return relatedness[slot] = Double.NEGATIVE_INFINITY;
+ }
+ }
+ return relatedness[slot] = computeRelatedness(fg_count, fgSize, bg_count, bgSize);
+ } else {
+ return cachedRelatedness;
+ }
+ }
+
+ public int compare(int slotA, int slotB) {
+ int r = Double.compare(getRelatedness(slotA), getRelatedness(slotB));
+ if (0 == r) {
+ r = Long.compare(fgCount.getCount(slotA), fgCount.getCount(slotB));
+ }
+ if (0 == r) {
+ r = Long.compare(bgCount.getCount(slotA), bgCount.getCount(slotB));
+ }
+ return r;
+ }
+
+ @Override
+ public Object getValue(int slotNum) {
+ final BucketData slotVal;
+ if (NO_ALL_BUCKETS != allBucketsSlot) {
+ // there's no reason why a processor should be resizing SlotAccs in the middle of getValue,
+ // but we're going to be vigilant against that possibility just in case...
+ if (ALL_BUCKETS_UNINITIALIZED == allBucketsSlot
+ || allBucketsSlot == slotNum) {
+ assert fcontext.processor instanceof FacetFieldProcessor
+ : "code changed, non FacetFieldProcessor sweeping w/allBuckets?!?";
+ allBucketsSlot = ((FacetFieldProcessor)fcontext.processor).allBucketsAcc.collectAccSlot;
+ }
+ }
+ if (slotNum == allBucketsSlot) {
+ slotVal = new BucketData(null);
+ } else {
+ slotVal = new BucketData(fgCount.getCount(slotNum), fgSize, bgCount.getCount(slotNum), bgSize, getRelatedness(slotNum));
+ }
+ return slotVal.externalize(fcontext.isShard());
+ }
+
+ @Override
+ public void reset() throws IOException {
+ Arrays.fill(relatedness, Double.NaN);
+ if (allBucketsSlot != NO_ALL_BUCKETS) {
+ allBucketsSlot = ALL_BUCKETS_UNINITIALIZED;
+ }
+ }
+
+ @Override
+ public void resize(Resizer resizer) {
+ relatedness = resizer.resize(relatedness, Double.NaN);
+ }
+
+ @Override
+ public void close() throws IOException {
+ relatedness = null;
+ }
+ }
+
private static final String IMPLIED_KEY = "implied";
- private static final class SKGSlotAcc extends SlotAcc {
+ private static final class SKGSlotAcc extends SlotAcc implements SweepableSlotAcc<SlotAcc> {
private final RelatednessAgg agg;
private BucketData[] slotvalues;
private final DocSet fgSet;
@@ -181,9 +307,30 @@ public class RelatednessAgg extends AggValueSource {
// cache the set sizes for frequent re-use on every slot
this.fgSize = fgSet.size();
this.bgSize = bgSet.size();
- this.slotvalues = new BucketData[numSlots];
+ this.slotvalues = new BucketData[numSlots]; //TODO: avoid initializing array until we know we're not doing sweep collection?
reset();
}
+
+ /**
+ * If called, may register SweepingAccs for fg and bg set based on whether
+ * user indicated sweeping should be used (default)
+ *
+ * @return null if any SweepingAccs were registered since no other collection is needed for relatedness
+ */
+ @Override
+ public SKGSlotAcc registerSweepingAccs(SweepingCountSlotAcc baseSweepingAcc) {
+ if (!this.agg.useSweep) {
+ return this;
+ } else {
+ final ReadOnlyCountSlotAcc fgCount = baseSweepingAcc.add(key + "!fg", fgSet, slotvalues.length);
+ final ReadOnlyCountSlotAcc bgCount = baseSweepingAcc.add(key + "!bg", bgSet, slotvalues.length);
+ SweepSKGSlotAcc readOnlyReplacement = new SweepSKGSlotAcc(agg.min_pop, fcontext, slotvalues.length, fgSize, bgSize, fgCount, bgCount);
+ readOnlyReplacement.key = key;
+ baseSweepingAcc.registerMapping(this, readOnlyReplacement);
+ return null;
+ }
+ }
+
private void processSlot(int slot, IntFunction<SlotContext> slotContext) throws IOException {
assert null != slotContext;
@@ -213,13 +360,18 @@ public class RelatednessAgg extends AggValueSource {
assert null == fcontext.filter;
}
// ...and in which case we should just use the current base
- final DocSet slotSet = null == slotQ ? fcontext.base : fcontext.searcher.getDocSet(slotQ);
+ final DocSet slotSet;
+ if (null == slotQ) {
+ slotSet = fcontext.base;
+ } else {
+ slotSet = fcontext.searcher.getDocSet(slotQ);
+ }
slotVal.incSizes(fgSize, bgSize);
slotVal.incCounts(fgSet.intersectionSize(slotSet),
bgSet.intersectionSize(slotSet));
}
-
+
@Override
public void collect(int perSegDocId, int slot, IntFunction<SlotContext> slotContext) throws IOException {
+ // NOTE: we don't actually care about the individual docs being collected
@@ -334,6 +486,16 @@ public class RelatednessAgg extends AggValueSource {
this.implied = true;
}
+ public BucketData(long fg_count, long fg_size, long bg_count, long bg_size, double relatedness) {
+ this.fg_count = fg_count;
+ this.fg_size = fg_size;
+ this.fg_pop = roundTo5Digits((double) fg_count / bg_size); // yes, BACKGROUND size is intentional
+ this.bg_count = bg_count;
+ this.bg_size = bg_size;
+ this.bg_pop = roundTo5Digits((double) bg_count / bg_size);
+ this.relatedness = relatedness;
+ }
+
/**
+ * Increment both the foreground & background <em>counts</em> for the current bucket, resetting any
* derived values that may be cached
diff --git a/solr/core/src/java/org/apache/solr/search/facet/SingletonDISI.java b/solr/core/src/java/org/apache/solr/search/facet/SingletonDISI.java
new file mode 100644
index 0000000..08be64b
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/search/facet/SingletonDISI.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.search.facet;
+
+import java.io.IOException;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
+
+final class SingletonDISI extends SweepDISI {
+
+ private final DocIdSetIterator backing;
+ private final boolean isBase;
+
+ SingletonDISI(DocIdSetIterator backing, CountSlotAcc[] countAccs, boolean isBase) {
+ super(1, countAccs);
+ this.backing = backing;
+ this.isBase = isBase;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return backing.nextDoc();
+ }
+
+ @Override
+ public boolean collectBase() {
+ return isBase;
+ }
+
+ @Override
+ public int registerCounts(SegCounter segCounter) {
+ return 0;
+ }
+}
diff --git a/solr/core/src/java/org/apache/solr/search/facet/SingletonDocIterator.java b/solr/core/src/java/org/apache/solr/search/facet/SingletonDocIterator.java
new file mode 100644
index 0000000..17311a6
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/search/facet/SingletonDocIterator.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.search.facet;
+
+import org.apache.solr.search.DocIterator;
+
+final class SingletonDocIterator extends SweepDocIterator {
+
+ private final DocIterator backing;
+ private final boolean isBase;
+
+ SingletonDocIterator(DocIterator backing, boolean isBase) {
+ super(1);
+ this.backing = backing;
+ this.isBase = isBase;
+ }
+
+ @Override
+ public boolean hasNext() {
+ return backing.hasNext();
+ }
+
+ @Override
+ public int nextDoc() {
+ return backing.nextDoc();
+ }
+
+ @Override
+ public boolean collectBase() {
+ return isBase;
+ }
+
+ @Override
+ public int registerCounts(SegCounter segCounts) {
+ return 0;
+ }
+
+}
diff --git a/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java b/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java
index e855552..d7d6e35 100644
--- a/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java
+++ b/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java
@@ -21,6 +21,7 @@ import java.io.IOException;
import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.function.IntFunction;
@@ -52,6 +53,8 @@ public abstract class SlotAcc implements Closeable {
this.fcontext = fcontext;
}
+ @Override public String toString() { return key; }
+
/**
* NOTE: this currently detects when it is being reused and calls resetIterators by comparing reader ords
* with previous calls to setNextReader. For this reason, current users must call setNextReader
@@ -597,9 +600,222 @@ public abstract class SlotAcc implements Closeable {
}
}
- abstract static class CountSlotAcc extends SlotAcc {
+ /**
+ * Implemented by some SlotAccs if they are capable of being used for
+ * sweep collecting in compatible facet processors
+ * @see FacetFieldProcessor#registerSweepingAccIfSupportedByCollectAcc()
+ */
+ static interface SweepableSlotAcc<T extends SlotAcc> {
+ /**
+ * Called by processors if they support sweeping. Implementations will often
+ * return self or null (the latter indicating that all necessary collection will
+ * be covered by the "sweeping" data structures registered with the specified
+ * baseSweepingAcc as a result of the call to this method).
+ *
+ * If an implementing instance chooses to replace itself with another {@link SlotAcc}, it must
+ * call {@link SweepingCountSlotAcc#registerMapping(SlotAcc, SlotAcc)} on the specified
+ * baseSweepingAcc to notify it of the mapping from original SlotAcc to the SlotAcc that should
+ * be used for purposes of read access. It is the responsibility of the specified {@link SweepingCountSlotAcc}
+ * to ensure proper placement/accessibility of the SlotAcc to be used for read access.
+ *
+ * The replacement SlotAcc registered via {@link SweepingCountSlotAcc#registerMapping(SlotAcc, SlotAcc)}
+ * will be responsible for output via its {@link SlotAcc#setValues(SimpleOrderedMap, int)} method.
+ * An implementer of this method may register such a replacement, and also return a non-null
+ * SlotAcc to be used for normal collection (via {@link FacetFieldProcessor#collectAcc}). In this case,
+ * the implementer should take care that the returned {@link SlotAcc} is different from the {@link SlotAcc}
+ * registered for the purpose of output -- with the former overriding {@link SlotAcc#setValues(SimpleOrderedMap, int)}
+ * as a no-op, to prevent setting duplicate values.
+ *
+ * @param baseSweepingAcc - never null, where the SlotAcc may register domains for sweep collection,
+ * and must register mappings of new read-access SlotAccs that result from this call.
+ * @return SlotAcc to be used for purpose of collection. If null then collect methods will
+ * never be called on this SlotAcc.
+ */
+ public T registerSweepingAccs(SweepingCountSlotAcc baseSweepingAcc);
+ }
+
+ /**
+ * A simple data structure associating {@link DocSet} domains with a {@link CountSlotAcc}. This may be used
+ * to support sweep count accumulation over different {@link DocSet} domains, but the concept is perfectly applicable
+ * to encapsulating the relevant state for simple "non-sweep" collection as well (in which case {@link SweepCountAccStruct#docSet}
+ * would be {@link FacetContext#base}, {@link SweepCountAccStruct#countAcc} would be {@link FacetProcessor#countAcc}, and
+ * {@link SweepCountAccStruct#isBase} would trivially be "true").
+ */
+ static final class SweepCountAccStruct {
+ final DocSet docSet;
+ final boolean isBase;
+ final CountSlotAcc countAcc;
+ public SweepCountAccStruct(DocSet docSet, boolean isBase, CountSlotAcc countAcc) {
+ this.docSet = docSet;
+ this.isBase = isBase;
+ this.countAcc = countAcc;
+ }
+ public SweepCountAccStruct(SweepCountAccStruct t, DocSet replaceDocSet) {
+ this.docSet = replaceDocSet;
+ this.isBase = t.isBase;
+ this.countAcc = t.countAcc;
+ }
+ /**
+ * Because sweep collection offloads "collect" methods to count accumulation code,
+ * it is helpful to provide a read-only view over the backing {@link CountSlotAcc}
+ *
+ * @return - a read-only view over {@link #countAcc}
+ */
+ public ReadOnlyCountSlotAcc roCountAcc() {
+ return countAcc;
+ }
+ @Override public String toString() {
+ return this.countAcc.toString();
+ }
+ }
+
+ /**
+ * Special CountSlotAcc used by processors that support sweeping to decide what to sweep over and how to "collect"
+ * when doing the sweep.
+ *
+ * This class may be used by instances of {@link SweepableSlotAcc} to register DocSet domains (via {@link SweepingCountSlotAcc#add})
+ * over which to sweep-collect facet counts.
+ *
+ * @see SweepableSlotAcc#registerSweepingAccs
+ */
+ static class SweepingCountSlotAcc extends CountSlotArrAcc {
+
+ static final String SWEEP_COLLECTION_DEBUG_KEY = "sweep_collection";
+ private final SimpleOrderedMap<Object> debug;
+ private final FacetFieldProcessor p;
+ final SweepCountAccStruct base;
+ final List<SweepCountAccStruct> others = new ArrayList<>();
+ private final List<SlotAcc> output = new ArrayList<>();
+
+ SweepingCountSlotAcc(int numSlots, FacetFieldProcessor p) {
+ super(p.fcontext, numSlots);
+ this.p = p;
+ this.base = new SweepCountAccStruct(fcontext.base, true, this);
+ final FacetDebugInfo fdebug = fcontext.getDebugInfo();
+ this.debug = null != fdebug ? new SimpleOrderedMap<>() : null;
+ if (null != this.debug) {
+ fdebug.putInfoItem(SWEEP_COLLECTION_DEBUG_KEY, debug);
+ debug.add("base", key);
+ debug.add("accs", new ArrayList<String>());
+ debug.add("mapped", new ArrayList<String>());
+ }
+ }
+
+ /**
+ * Called by SweepableSlotAccs to register new DocSet domains for sweep collection
+ *
+ * @param key
+ * assigned to the returned SlotAcc, and used for debugging
+ * @param docs
+ * the domain over which to sweep
+ * @param numSlots
+ * the number of slots
+ * @return a read-only representation of the count acc which is guaranteed to be populated after sweep count
+ * collection
+ */
+ public ReadOnlyCountSlotAcc add(String key, DocSet docs, int numSlots) {
+ final CountSlotAcc count = new CountSlotArrAcc(fcontext, numSlots);
+ count.key = key;
+ final SweepCountAccStruct ret = new SweepCountAccStruct(docs, false, count);
+ if (null != debug) {
+ @SuppressWarnings("unchecked")
+ List<String> accsDebug = (List<String>) debug.get("accs");
+ accsDebug.add(ret.toString());
+ }
+ others.add(ret);
+ return ret.roCountAcc();
+ }
+
+ /**
+ * When a {@link SweepableSlotAcc} replaces itself (for the purpose of collection) with a different {@link SlotAcc}
+ * instance, it must register that replacement by calling this method with itself as the fromAcc param, and with the
+ * new replacement {@link SlotAcc} as the toAcc param. The two SlotAccs must have the same {@link SlotAcc#key}.
+ *
+ * It is the responsibility of this method to ensure that {@link FacetFieldProcessor} references to fromAcc (other than
+ * those within {@link FacetFieldProcessor#collectAcc}, which are set directly by the return value of
+ * {@link SweepableSlotAcc#registerSweepingAccs(SweepingCountSlotAcc)}) are replaced
+ * by references to toAcc. Such references would include, e.g., {@link FacetFieldProcessor#sortAcc}.
+ *
+ * It is also this method's responsibility to ensure that read access to toAcc (via toAcc's {@link SlotAcc#setValues(SimpleOrderedMap, int)}
+ * method) is provided via this instance's {@link #setValues(SimpleOrderedMap, int)} method.
+ *
+ * @param fromAcc - the {@link SlotAcc} to be replaced (this will normally be the caller of this method).
+ * @param toAcc - the replacement {@link SlotAcc}
+ *
+ * @see SweepableSlotAcc#registerSweepingAccs(SweepingCountSlotAcc)
+ */
+ public void registerMapping(SlotAcc fromAcc, SlotAcc toAcc) {
+ assert fromAcc.key.equals(toAcc.key);
+ output.add(toAcc);
+ if (p.sortAcc == fromAcc) {
+ p.sortAcc = toAcc;
+ }
+ if (null != debug) {
+ @SuppressWarnings("unchecked")
+ List<String> mappedDebug = (List<String>) debug.get("mapped");
+ mappedDebug.add(fromAcc.toString());
+ }
+ }
+
+ /**
+ * Always populates the bucket with the current count for that slot. If the count is positive, or if
+ * <code>processEmpty==true</code>, then this method also populates the values from mapped "output" accumulators.
+ *
+ * @see #setSweepValues
+ */
+ @Override
+ public void setValues(SimpleOrderedMap<Object> bucket, int slotNum) throws IOException {
+ super.setValues(bucket, slotNum);
+ if (0 < getCount(slotNum) || fcontext.processor.freq.processEmpty) {
+ setSweepValues(bucket, slotNum);
+ }
+ }
+
+ /**
+ * Populates the bucket with the values from all mapped "output" accumulators for the specified slot.
+ *
+ * This method exists because there are some contexts (namely SpecialSlotAcc, for allBuckets, etc.) in which "base"
+ * count is tracked differently, via getSpecialCount(). For such cases, we need a method that allows the caller to
+ * directly coordinate calling {@link SlotAcc#setValues} on the sweeping output accs, while avoiding the inclusion
+ * of {@link CountSlotAcc#setValues CountSlotAcc.setValues}
+ */
+ public void setSweepValues(SimpleOrderedMap<Object> bucket, int slotNum) throws IOException {
+ for (SlotAcc acc : output) {
+ acc.setValues(bucket, slotNum);
+ }
+ }
+
+ /**
+ * Helper method for code that wants to operate in a sweeping manner even if the current processor
+ * is not using sweeping.
+ *
+ * @return struct that wraps the {@link FacetContext#base} unless the {@link FacetProcessor#countAcc} is a {@link SweepingCountSlotAcc}
+ */
+ public static SweepCountAccStruct baseStructOf(FacetProcessor<?> processor) {
+ if (processor.countAcc instanceof SweepingCountSlotAcc) {
+ return ((SweepingCountSlotAcc) processor.countAcc).base;
+ }
+ return new SweepCountAccStruct(processor.fcontext.base, true, processor.countAcc);
+ }
+ /**
+ * Helper method for code that wants to operate in a sweeping manner even if the current processor
+ * is not using sweeping
+ *
+ * @return empty list unless the {@link FacetProcessor#countAcc} is a {@link SweepingCountSlotAcc}
+ */
+ public static List<SweepCountAccStruct> otherStructsOf(FacetProcessor<?> processor) {
+ if (processor.countAcc instanceof SweepingCountSlotAcc) {
+ return ((SweepingCountSlotAcc) processor.countAcc).others;
+ }
+ return Collections.emptyList();
+ }
+ }
+
+ abstract static class CountSlotAcc extends SlotAcc implements ReadOnlyCountSlotAcc {
public CountSlotAcc(FacetContext fcontext) {
super(fcontext);
+ // assume we are the 'count' by default unless/until our creator overrides this
+ this.key = "count";
}
public abstract void incrementCount(int slot, long count);
diff --git a/solr/core/src/java/org/apache/solr/search/facet/SweepCountAware.java b/solr/core/src/java/org/apache/solr/search/facet/SweepCountAware.java
new file mode 100644
index 0000000..3dd8376
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/search/facet/SweepCountAware.java
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.search.facet;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.LongValues;
+import org.apache.solr.search.DocIterator;
+import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
+
+/**
+ * Implemented by extensions of doc iterators (i.e., {@link DocIdSetIterator}, {@link DocIterator} over one or
+ * more domains), to support facet count accumulation corresponding to each domain (and via {@link #collectBase()}
+ * to inform the necessity of "collection" for a single optional backing "base" set).
+ */
+interface SweepCountAware {
+
+ /**
+ * Returns true if one of the domains underlying this iterator is the "base" domain, and if that base domain
+ * contains the doc on which this iterator is currently positioned. If "true", then "collection" may be necessary
+ * for the current doc.
+ *
+ * For each iterator position (each doc), {@link #registerCounts(SegCounter)} must be called before this method.
+ */
+ boolean collectBase();
+
+ /**
+ * Called on a positioned doc iterator to register array index mappings for domains that contain the current
+ * doc. Implementations register these mappings by calling {@link SegCounter#map(int, int)} on the specified
+ * segCounts param.
+ *
+ * For each iterator position, this method must be called before {@link #collectBase()}
+ *
+ * @param segCounts - to register mappings of array indices for domains that contain this doc
+ * @return - the max index of an array representing the domains that contain the current doc. If "n" domains
+ * contain the current doc, the return value would be "n - 1"
+ * @throws IOException - if thrown by advancing an underlying doc iterator
+ */
+ int registerCounts(SegCounter segCounts) throws IOException;
+
+ /**
+ * Used to coordinate multiple count accumulations over multiple domains. Implementers will have "n" backing term-ord-indexed
+ * counts -- one for each domain over which count accumulation is to be performed. For each doc, count accumulation
+ * takes place in two phases, invoked by a "driver" (e.g., {@link FacetFieldProcessor}) that manages iteration over the
+ * union of doc domains:
+ *
+ * First, the driver passes this object as the param to {@link SweepCountAware#registerCounts(SegCounter)}, which
+ * calls {@link #map(int, int)} on "this" to map the static "allIdx" (allIdx < n) for each active backing domain to
+ * a transient "activeIdx" for counts corresponding to active domains (activeIdx < count(allIdx) <= n). The return value
+ * of {@link SweepCountAware#registerCounts(SegCounter)} indicates to the "driver" the max "active counts" index (for
+ * domains that contain the current doc).
+ *
+ * The driver then calls {@link #incrementCount(int, int, int)}, passing the term ord, increment amount (usually "1"),
+ * and the max "active counts" index returned from {@link SweepCountAware#registerCounts(SegCounter)} in the first
+ * phase. The "max active counts index" param is used as the limit (inclusive) to iterate count accumulation over each
+ * of the "active" domains for the current doc.
+ *
+ * @see SweepCountAware#registerCounts(SegCounter)
+ */
+ static interface SegCounter {
+ /**
+ * Mark/map a given domain/CountSlotAcc as active (eligible for count accumulation) for the current doc.
+ *
+ * @param allIdx - the static index of the domain/CountSlotAcc to be "activated" for the current doc
+ * @param activeIdx - the transient "active index" (for the purpose of actual count accumulation) to which to map
+ * the domain/CountSlotAcc indicated by "allIdx".
+ */
+ void map(int allIdx, int activeIdx);
+
+ /**
+ * Increments counts for active domains/CountSlotAccs.
+ *
+ * @param ord - the term ord (either global or per-seg) for which to increment counts
+ * @param inc - the amount by which to increment the count for the specified term ord
+ * @param maxIdx - the max index (inclusive) of active domains/CountSlotAccs to be incremented for the current doc
+ */
+ void incrementCount(int ord, int inc, int maxIdx);
+ }
+
+ /**
+ * This class is designed to count over global term ords ({@link SegCountPerSeg} provides equivalent functionality for
+ * per-segment term ords).
+ *
+ * @see SegCountPerSeg
+ */
+ static class SegCountGlobal implements SegCounter {
+ private final CountSlotAcc[] allCounts;
+ private final CountSlotAcc[] activeCounts;
+
+ public SegCountGlobal(CountSlotAcc[] allCounts) {
+ this.allCounts = allCounts;
+ this.activeCounts = Arrays.copyOf(allCounts, allCounts.length);
+ }
+
+ @Override
+ public void map(int allIdx, int activeIdx) {
+ activeCounts[activeIdx] = allCounts[allIdx];
+ }
+
+ @Override
+ public final void incrementCount(int globalOrd, int inc, int maxIdx) {
+ int i = maxIdx;
+ do {
+ activeCounts[i].incrementCount(globalOrd, inc);
+ } while (i-- > 0);
+ }
+ }
+
+ /**
+ * This class is designed to count over per-segment term ords ({@link SegCountGlobal} provides equivalent functionality for
+ * global term ords).
+ *
+ * @see SegCountGlobal
+ */
+ static class SegCountPerSeg implements SegCounter {
+ protected final int[][] allSegCounts;
+ private final int[][] activeSegCounts;
+ private final boolean[] seen;
+
+ public SegCountPerSeg(int[][] allSegCounts, boolean[] seen, int segMax, int size) {
+ this.allSegCounts = allSegCounts;
+ this.activeSegCounts = Arrays.copyOf(this.allSegCounts, size);
+ this.seen = seen;
+ }
+
+ @Override
+ public final void map(int allIdx, int activeIdx) {
+ activeSegCounts[activeIdx] = allSegCounts[allIdx];
+ }
+
+ @Override
+ public final void incrementCount(int segOrd, int inc, int maxIdx) {
+ seen[segOrd] = true;
+ int i = maxIdx;
+ do {
+ activeSegCounts[i][segOrd] += inc;
+ } while (i-- > 0);
+ }
+
+ /**
+ * Maps accumulated per-segment term ords to global term ords and increments global slots on the specified countAccs
+ * accordingly. The index of each CountSlotAcc in the specified countAccs array must correspond to the
+ * static index of its associated count accumulation doc domain and per-seg count array.
+ *
+ * @param countAccs - global-scope CountSlotAccs (one for each domain) to be incremented for the most recently accumulated
+ * segment
+ * @param toGlobal - mapping of per-segment term ords to global term ords for the most recently accumulated segment
+ * @param maxSegOrd - the max per-seg term ord for the most recently accumulated segment
+ */
+ public void register(CountSlotAcc[] countAccs, LongValues toGlobal, int maxSegOrd) {
+ int segOrd = maxSegOrd;
+ final int maxIdx = countAccs.length - 1;
+ for (;;) {
+ if (seen[segOrd]) {
+ int i = maxIdx;
+ int slot = toGlobal == null ? segOrd : (int)toGlobal.get(segOrd);
+ do {
+ final int inc = allSegCounts[i][segOrd];
+ if (inc > 0) {
+ countAccs[i].incrementCount(slot, inc);
+ }
+ } while (i-- > 0);
+ }
+ if (--segOrd < 0) {
+ break;
+ }
+ }
+ }
+ }
+
+}
diff --git a/solr/core/src/java/org/apache/solr/search/facet/SweepDISI.java b/solr/core/src/java/org/apache/solr/search/facet/SweepDISI.java
new file mode 100644
index 0000000..94c4261
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/search/facet/SweepDISI.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.search.facet;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
+import org.apache.solr.search.facet.SlotAcc.SweepCountAccStruct;
+
+public abstract class SweepDISI extends DocIdSetIterator implements SweepCountAware {
+
+ public final int size;
+ final CountSlotAcc[] countAccs;
+
+ public SweepDISI(int size, CountSlotAcc[] countAccs) {
+ this.size = size;
+ this.countAccs = countAccs;
+ }
+
+ private static boolean addAcc(SweepCountAccStruct entry, DocIdSetIterator[] subIterators, CountSlotAcc[] activeCountAccs, LeafReaderContext subCtx, int idx) throws IOException {
+ final DocIdSet docIdSet = entry.docSet.getTopFilter().getDocIdSet(subCtx, null);
+ if (docIdSet == null || (subIterators[idx] = docIdSet.iterator()) == null) {
+ return false;
+ }
+ activeCountAccs[idx] = entry.countAcc;
+ return true;
+ }
+
+ static SweepDISI newInstance(SweepCountAccStruct base, List<SweepCountAccStruct> others, DocIdSetIterator[] subIterators, CountSlotAcc[] activeCountAccs, LeafReaderContext subCtx) throws IOException {
+ int activeCt = 0;
+ final int baseIdx;
+ if (base == null || !addAcc(base, subIterators, activeCountAccs, subCtx, activeCt)) {
+ baseIdx = -1;
+ } else {
+ baseIdx = activeCt++;
+ }
+ for (SweepCountAccStruct entry : others) {
+ if (addAcc(entry, subIterators, activeCountAccs, subCtx, activeCt)) {
+ activeCt++;
+ }
+ }
+ switch (activeCt) {
+ case 0:
+ return null;
+ case 1:
+ return new SingletonDISI(subIterators[0], activeCountAccs, baseIdx >= 0); // solr docsets already exclude any deleted docs
+ default:
+ return new UnionDISI(subIterators, activeCountAccs, activeCt, baseIdx);
+ }
+ }
+
+ @Override
+ public int docID() {
+ throw new UnsupportedOperationException("Not supported.");
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ throw new UnsupportedOperationException("Not supported.");
+ }
+
+ @Override
+ public long cost() {
+ throw new UnsupportedOperationException("Not supported.");
+ }
+
+}
diff --git a/solr/core/src/java/org/apache/solr/search/facet/SweepDocIterator.java b/solr/core/src/java/org/apache/solr/search/facet/SweepDocIterator.java
new file mode 100644
index 0000000..7478d10
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/search/facet/SweepDocIterator.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.search.facet;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.solr.search.DocIterator;
+import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
+import org.apache.solr.search.facet.SlotAcc.SweepCountAccStruct;
+
+abstract class SweepDocIterator implements DocIterator, SweepCountAware {
+
+ public final int size;
+
+ public SweepDocIterator(int size) {
+ this.size = size;
+ }
+
+ static class SweepIteratorAndCounts {
+ final SweepDocIterator iter;
+ final CountSlotAcc[] countAccs;
+ public SweepIteratorAndCounts(SweepDocIterator iter, CountSlotAcc[] countAccs) {
+ this.iter = iter;
+ this.countAccs = countAccs;
+ }
+ }
+
+ static SweepIteratorAndCounts newInstance(SweepCountAccStruct base, List<SweepCountAccStruct> others) throws IOException {
+ final int activeCt;
+ SweepCountAccStruct entry;
+ if (base == null) {
+ activeCt = others.size();
+ entry = others.get(0);
+ } else {
+ activeCt = others.size() + 1;
+ entry = base;
+ }
+ if (activeCt == 1) {
+ final CountSlotAcc[] countAccs = new CountSlotAcc[] {entry.countAcc};
+ return new SweepIteratorAndCounts(new SingletonDocIterator(entry.docSet.iterator(), base != null), countAccs);
+ } else {
+ final DocIterator[] subIterators = new DocIterator[activeCt];
+ final CountSlotAcc[] countAccs = new CountSlotAcc[activeCt];
+ Iterator<SweepCountAccStruct> othersIter = others.iterator();
+ int i = 0;
+ for (;;) {
+ subIterators[i] = entry.docSet.iterator();
+ countAccs[i] = entry.countAcc;
+ if (++i == activeCt) {
+ break;
+ }
+ entry = othersIter.next();
+ }
+ return new SweepIteratorAndCounts(new UnionDocIterator(subIterators, base == null ? -1 : 0), countAccs);
+ }
+ }
+
+ @Override
+ public float score() {
+ throw new UnsupportedOperationException("Not supported.");
+ }
+
+ @Override
+ public Integer next() {
+ throw new UnsupportedOperationException("Not supported.");
+ }
+
+ @Override
+ public abstract int registerCounts(SegCounter segCounts); // override to not throw IOException
+
+}
diff --git a/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java b/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java
index 04f88f9..ee86fe0 100644
--- a/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java
+++ b/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java
@@ -41,11 +41,15 @@ import org.apache.solr.index.SlowCompositeReaderWrapper;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.TrieField;
import org.apache.solr.search.BitDocSet;
-import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrCache;
import org.apache.solr.search.SolrIndexSearcher;
+import org.apache.solr.search.facet.SweepCountAware.SegCountGlobal;
+import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
import org.apache.solr.search.facet.SlotAcc.SlotContext;
+import org.apache.solr.search.facet.SlotAcc.SweepCountAccStruct;
+import org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc;
+import org.apache.solr.search.facet.SweepDocIterator.SweepIteratorAndCounts;
import org.apache.solr.uninverting.DocTermOrds;
import org.apache.solr.util.TestInjection;
import org.slf4j.Logger;
@@ -315,7 +319,7 @@ public class UnInvertedField extends DocTermOrds {
- private void getCounts(FacetFieldProcessorByArrayUIF processor, SlotAcc.CountSlotAcc counts) throws IOException {
+ private void getCounts(FacetFieldProcessorByArrayUIF processor) throws IOException {
DocSet docs = processor.fcontext.base;
int baseSize = docs.size();
int maxDoc = searcher.maxDoc();
@@ -325,9 +329,12 @@ public class UnInvertedField extends DocTermOrds {
return;
}
+ SweepCountAccStruct baseCountAccStruct = SweepingCountSlotAcc.baseStructOf(processor);
+ final List<SweepCountAccStruct> others = SweepingCountSlotAcc.otherStructsOf(processor);
+
final int[] index = this.index;
- boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0 && docs instanceof BitDocSet;
+ boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0 && docs instanceof BitDocSet && baseCountAccStruct != null;
if (doNegative) {
FixedBitSet bs = ((BitDocSet) docs).getBits().clone();
@@ -337,21 +344,34 @@ public class UnInvertedField extends DocTermOrds {
docs = new BitDocSet(bs, maxDoc - baseSize);
// simply negating will mean that we have deleted docs in the set.
// that should be OK, as their entries in our table should be empty.
+ baseCountAccStruct = new SweepCountAccStruct(baseCountAccStruct, docs);
}
// For the biggest terms, do straight set intersections
for (TopTerm tt : bigTerms.values()) {
// TODO: counts could be deferred if sorting by index order
- counts.incrementCount(tt.termNum, searcher.numDocs(tt.termQuery, docs));
+ final int termOrd = tt.termNum;
+ Iterator<SweepCountAccStruct> othersIter = others.iterator();
+ SweepCountAccStruct entry = baseCountAccStruct != null ? baseCountAccStruct : othersIter.next();
+ for (;;) {
+ entry.countAcc.incrementCount(termOrd, searcher.numDocs(tt.termQuery, entry.docSet));
+ if (!othersIter.hasNext()) {
+ break;
+ }
+ entry = othersIter.next();
+ }
}
// TODO: we could short-circuit counting altogether for sorted faceting
// where we already have enough terms from the bigTerms
if (termInstances > 0) {
- DocIterator iter = docs.iterator();
+ final SweepIteratorAndCounts iterAndCounts = SweepDocIterator.newInstance(baseCountAccStruct, others);
+ final SweepDocIterator iter = iterAndCounts.iter;
+ final SegCountGlobal counts = new SegCountGlobal(iterAndCounts.countAccs);
while (iter.hasNext()) {
int doc = iter.nextDoc();
+ int maxIdx = iter.registerCounts(counts);
int code = index[doc];
if ((code & 0x80000000)!=0) {
@@ -368,7 +388,7 @@ public class UnInvertedField extends DocTermOrds {
}
if (delta == 0) break;
tnum += delta - TNUM_OFFSET;
- counts.incrementCount(tnum,1);
+ counts.incrementCount(tnum, 1, maxIdx);
}
} else {
int tnum = 0;
@@ -378,7 +398,7 @@ public class UnInvertedField extends DocTermOrds {
if ((code & 0x80) == 0) {
if (delta == 0) break;
tnum += delta - TNUM_OFFSET;
- counts.incrementCount(tnum,1);
+ counts.incrementCount(tnum, 1, maxIdx);
delta = 0;
}
code >>>= 8;
@@ -388,9 +408,10 @@ public class UnInvertedField extends DocTermOrds {
}
if (doNegative) {
+ final CountSlotAcc baseCounts = processor.countAcc;
for (int i=0; i<numTermsInField; i++) {
// counts[i] = maxTermCounts[i] - counts[i];
- counts.incrementCount(i, maxTermCounts[i] - (int) counts.getCount(i)*2);
+ baseCounts.incrementCount(i, maxTermCounts[i] - (int) baseCounts.getCount(i)*2);
}
}
@@ -409,7 +430,7 @@ public class UnInvertedField extends DocTermOrds {
public void collectDocs(FacetFieldProcessorByArrayUIF processor) throws IOException {
if (processor.collectAcc==null && processor.allBucketsAcc == null && processor.startTermIndex == 0 && processor.endTermIndex >= numTermsInField) {
- getCounts(processor, processor.countAcc);
+ getCounts(processor);
return;
}
@@ -427,15 +448,22 @@ public class UnInvertedField extends DocTermOrds {
DocSet docs = processor.fcontext.base;
int uniqueTerms = 0;
- final SlotAcc.CountSlotAcc countAcc = processor.countAcc;
+ final CountSlotAcc countAcc = processor.countAcc;
+ final SweepCountAccStruct baseCountAccStruct = SweepingCountSlotAcc.baseStructOf(processor);
+ final List<SweepCountAccStruct> others = SweepingCountSlotAcc.otherStructsOf(processor);
for (TopTerm tt : bigTerms.values()) {
if (tt.termNum >= startTermIndex && tt.termNum < endTermIndex) {
// handle the biggest terms
- DocSet intersection = searcher.getDocSet(tt.termQuery, docs);
+ DocSet termSet = searcher.getDocSet(tt.termQuery);
+ DocSet intersection = termSet.intersection(docs);
int collected = processor.collectFirstPhase(intersection, tt.termNum - startTermIndex,
slotNum -> { return new SlotContext(tt.termQuery); });
- countAcc.incrementCount(tt.termNum - startTermIndex, collected);
+ final int termOrd = tt.termNum - startTermIndex;
+ countAcc.incrementCount(termOrd, collected);
+ for (SweepCountAccStruct entry : others) {
+ entry.countAcc.incrementCount(termOrd, termSet.intersectionSize(entry.docSet));
+ }
if (collected > 0) {
uniqueTerms++;
}
@@ -455,9 +483,14 @@ public class UnInvertedField extends DocTermOrds {
// TODO: handle facet.prefix here!!!
- DocIterator iter = docs.iterator();
+ SweepIteratorAndCounts sweepIterAndCounts = SweepDocIterator.newInstance(baseCountAccStruct, others);
+ final SweepDocIterator iter = sweepIterAndCounts.iter;
+ final CountSlotAcc[] countAccs = sweepIterAndCounts.countAccs;
+ final SegCountGlobal counts = new SegCountGlobal(countAccs);
while (iter.hasNext()) {
int doc = iter.nextDoc();
+ int maxIdx = iter.registerCounts(counts);
+ boolean collectBase = iter.collectBase();
if (doc >= adjustedMax) {
do {
@@ -495,8 +528,10 @@ public class UnInvertedField extends DocTermOrds {
int arrIdx = tnum - startTermIndex;
if (arrIdx < 0) continue;
if (arrIdx >= nTerms) break;
- countAcc.incrementCount(arrIdx, 1);
- processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext);
+ counts.incrementCount(arrIdx, 1, maxIdx);
+ if (collectBase) {
+ processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext);
+ }
}
} else {
int tnum = 0;
@@ -509,8 +544,10 @@ public class UnInvertedField extends DocTermOrds {
int arrIdx = tnum - startTermIndex;
if (arrIdx >= 0) {
if (arrIdx >= nTerms) break;
- countAcc.incrementCount(arrIdx, 1);
- processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext);
+ counts.incrementCount(arrIdx, 1, maxIdx);
+ if (collectBase) {
+ processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext);
+ }
}
delta = 0;
}
diff --git a/solr/core/src/java/org/apache/solr/search/facet/UnionDISI.java b/solr/core/src/java/org/apache/solr/search/facet/UnionDISI.java
new file mode 100644
index 0000000..8bd1968
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/search/facet/UnionDISI.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.search.facet;
+
+import java.io.IOException;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.PriorityQueue;
+import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
+
+final class UnionDISI extends SweepDISI {
+
+ final int maxIdx;
+ private final SubIterStruct baseSub;
+ private boolean collectBase;
+ private final PriorityQueue<SubIterStruct> queue;
+ private SubIterStruct top;
+ private int docId = -1;
+
+ private static final class SubIterStruct {
+ private final DocIdSetIterator sub;
+ private final int index;
+ private int docId;
+ public SubIterStruct(DocIdSetIterator sub, int index) throws IOException {
+ this.sub = sub;
+ this.index = index;
+ nextDoc();
+ }
+ public void nextDoc() throws IOException {
+ docId = sub.nextDoc();
+ }
+ }
+ UnionDISI(DocIdSetIterator[] subIterators, CountSlotAcc[] countAccs, int size, int baseIdx) throws IOException {
+ super(size, countAccs);
+ this.maxIdx = size - 1;
+ queue = new PriorityQueue<SubIterStruct>(size) {
+ @Override
+ protected boolean lessThan(SubIterStruct a, SubIterStruct b) {
+ return a.docId < b.docId;
+ }
+ };
+ int i = maxIdx;
+ SubIterStruct tmpBaseSub = null;
+ do {
+ final SubIterStruct subIterStruct = new SubIterStruct(subIterators[i], i);
+ queue.add(subIterStruct);
+ if (i == baseIdx) {
+ tmpBaseSub = subIterStruct;
+ }
+ } while (i-- > 0);
+ baseSub = tmpBaseSub;
+ top = queue.top();
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ if (top.docId == docId) {
+ do {
+ top.nextDoc();
+ } while ((top = queue.updateTop()).docId == docId);
+ }
+ if (baseSub != null) {
+ collectBase = false;
+ }
+ return docId = top.docId;
+ }
+
+ @Override
+ public boolean collectBase() {
+ assert top.docId != docId : "must call registerCounts() before collectBase()";
+ return collectBase;
+ }
+
+ @Override
+ public int registerCounts(SegCounter segCounter) throws IOException {
+ int i = -1;
+ do {
+ if (!collectBase && top == baseSub) {
+ collectBase = true;
+ }
+ segCounter.map(top.index, ++i);
+ top.nextDoc();
+ } while ((top = queue.updateTop()).docId == docId);
+ return i;
+ }
+
+}
diff --git a/solr/core/src/java/org/apache/solr/search/facet/UnionDocIterator.java b/solr/core/src/java/org/apache/solr/search/facet/UnionDocIterator.java
new file mode 100644
index 0000000..448a49c
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/search/facet/UnionDocIterator.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.search.facet;
+
+import java.io.IOException;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.solr.search.DocIterator;
+import org.apache.lucene.util.PriorityQueue;
+
+final class UnionDocIterator extends SweepDocIterator {
+
+ private final int maxIdx;
+ private final SubIterStruct baseSub;
+ private boolean collectBase;
+ private final PriorityQueue<SubIterStruct> queue;
+ private SubIterStruct top;
+ private int docId = -1;
+
+ private static final class SubIterStruct {
+ private final DocIterator sub;
+ private final int index;
+ private int docId;
+ public SubIterStruct(DocIterator sub, int index) throws IOException {
+ this.sub = sub;
+ this.index = index;
+ nextDoc();
+ }
+ public void nextDoc() {
+ docId = sub.hasNext() ? sub.nextDoc() : DocIdSetIterator.NO_MORE_DOCS;
+ }
+ }
+ UnionDocIterator(DocIterator[] subIterators, int baseIdx) throws IOException {
+ super(subIterators.length);
+ this.maxIdx = size - 1;
+ queue = new PriorityQueue<SubIterStruct>(size) {
+ @Override
+ protected boolean lessThan(SubIterStruct a, SubIterStruct b) {
+ return a.docId < b.docId;
+ }
+ };
+ SubIterStruct tmpBase = null;
+ int i = maxIdx;
+ do {
+ SubIterStruct subIterStruct = new SubIterStruct(subIterators[i], i);
+ queue.add(subIterStruct);
+ if (i == baseIdx) {
+ tmpBase = subIterStruct;
+ }
+ } while (i-- > 0);
+ this.baseSub = tmpBase;
+ top = queue.top();
+ }
+
+ @Override
+ public int nextDoc() {
+ if (top.docId == docId) {
+ do {
+ top.nextDoc();
+ } while ((top = queue.updateTop()).docId == docId);
+ }
+ collectBase = false;
+ return docId = top.docId;
+ }
+
+ @Override
+ public boolean hasNext() {
+ if (top.docId == docId) {
+ do {
+ top.nextDoc();
+ } while ((top = queue.updateTop()).docId == docId);
+ }
+ return top.docId != DocIdSetIterator.NO_MORE_DOCS;
+ }
+
+ @Override
+ public boolean collectBase() {
+ assert top.docId != docId : "must call registerCounts() before collectBase()";
+ return collectBase;
+ }
+
+ @Override
+ public int registerCounts(SegCounter segCounts) {
+ int i = -1;
+ do {
+ if (!collectBase && top == baseSub) {
+ collectBase = true;
+ }
+ segCounts.map(top.index, ++i);
+ top.nextDoc();
+ } while ((top = queue.updateTop()).docId == docId);
+ return i;
+ }
+}
diff --git a/solr/core/src/test/org/apache/solr/search/facet/TestCloudJSONFacetSKGEquiv.java b/solr/core/src/test/org/apache/solr/search/facet/TestCloudJSONFacetSKGEquiv.java
index 276dcb9..eb68662 100644
--- a/solr/core/src/test/org/apache/solr/search/facet/TestCloudJSONFacetSKGEquiv.java
+++ b/solr/core/src/test/org/apache/solr/search/facet/TestCloudJSONFacetSKGEquiv.java
@@ -48,6 +48,7 @@ import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import static org.apache.solr.search.facet.FacetField.FacetMethod;
+import static org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc.SWEEP_COLLECTION_DEBUG_KEY;
import org.noggit.JSONUtil;
import org.noggit.JSONWriter;
@@ -58,12 +59,11 @@ import org.junit.BeforeClass;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
/**
* <p>
* A randomized test of nested facets using the <code>relatedness()</code> function, that asserts the
* results are consistent and equivilent regardless of what <code>method</code> (ie: FacetFieldProcessor)
- * is requested.
+ * and/or <code>{@value RelatednessAgg#SWEEP_COLLECTION}</code> option is requested.
* </p>
* <p>
* This test is based on {@link TestCloudJSONFacetSKG} but does <em>not</em>
@@ -280,6 +280,212 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
assertEquals(FacetFieldProcessorByHashDV.class.getSimpleName(), debug.get("processor"));
}
}
+
+ /**
+ * Sanity check that our method of varying the <code>{@value RelatednessAgg#SWEEP_COLLECTION}</code> in conjunction with the
+ * <code>method</code> params works and can be verified by inspecting the debug output of basic requests.
+ */
+ public void testWhiteboxSanitySweepDebug() throws Exception {
+ // NOTE: json.facet debugging output can be wonky, particularly when dealing with cloud
+ // so for these queries we keep it simple:
+ // - only one "top" facet per request
+ // - no refinement
+ // even with those constraints in place, a single facet can (may/sometimes?) produce multiple debug
+ // blocks - apparently due to shard merging? So...
+ // - only inspect the "first" debug NamedList in the results
+ //
+
+ final SolrParams baseParams = params("rows","0",
+ "debug","true", // SOLR-14451
+ // *:* is the only "safe" query for this test,
+ // to ensure we always have at least one bucket for every facet
+ // so we can be confident in getting the debug we expect...
+ "q", "*:*",
+ "fore", multiStrField(7)+":11",
+ "back", "*:*");
+
+ // simple individual facet that sorts on an skg stat...
+ //
+ // all results we test should be the same even if there is another 'skg_extra' stat,
+ // it shouldn't be involved in the sweeping at all.
+ for (Facet extra : Arrays.asList(null, new RelatednessFacet(multiStrField(2)+":9", null))) {
+ // choose a single value string so we know both 'dv' (sweep) and 'dvhash' (no sweep) can be specified
+ final TermFacet f = new TermFacet(soloStrField(9), 10, 0, "skg desc", null);
+ if (null != extra) {
+ f.subFacets.put("skg_extra", extra);
+ }
+ final Map<String,TermFacet> facets = new LinkedHashMap<>();
+ facets.put("str", f);
+
+ final SolrParams facetParams
+ = SolrParams.wrapDefaults(params("method_val", "dv",
+ "json.facet", Facet.toJSONFacetParamValue(facets)),
+ baseParams);
+
+ // both default sweep option and explicit sweep should give same results...
+ for (SolrParams sweepParams : Arrays.asList(params(),
+ params("sweep_key", RelatednessAgg.SWEEP_COLLECTION,
+ "sweep_val", "true"))) {
+ final SolrParams params = SolrParams.wrapDefaults(sweepParams, facetParams);
+
+ final NamedList<Object> debug = getFacetDebug(params);
+ assertEquals(FacetFieldProcessorByArrayDV.class.getSimpleName(), debug.get("processor"));
+ @SuppressWarnings("unchecked")
+ final NamedList<Object> sweep_debug = (NamedList<Object>) debug.get(SWEEP_COLLECTION_DEBUG_KEY);
+ assertNotNull(sweep_debug);
+ assertEquals("count", sweep_debug.get("base"));
+ assertEquals(Arrays.asList("skg!fg","skg!bg"), sweep_debug.get("accs"));
+ assertEquals(Arrays.asList("skg"), sweep_debug.get("mapped"));
+ }
+ { // 'dv' will always *try* to sweep, but disabling on stat should mean debug is mostly empty...
+ final SolrParams params = SolrParams.wrapDefaults(params("sweep_key", RelatednessAgg.SWEEP_COLLECTION,
+ "sweep_val", "false"),
+ facetParams);
+ final NamedList<Object> debug = getFacetDebug(params);
+ assertEquals(FacetFieldProcessorByArrayDV.class.getSimpleName(), debug.get("processor"));
+ @SuppressWarnings("unchecked")
+ final NamedList<Object> sweep_debug = (NamedList<Object>) debug.get(SWEEP_COLLECTION_DEBUG_KEY);
+ assertNotNull(sweep_debug);
+ assertEquals("count", sweep_debug.get("base"));
+ assertEquals(Collections.emptyList(), sweep_debug.get("accs"));
+ assertEquals(Collections.emptyList(), sweep_debug.get("mapped"));
+ }
+ { // if we override 'dv' with 'hashdv' which doesn't sweep, our sweep debug should be empty,
+ // even if the skg stat does ask for sweeping explicitly...
+ final SolrParams params = SolrParams.wrapDefaults(params("method_val", "dvhash",
+ "sweep_key", RelatednessAgg.SWEEP_COLLECTION,
+ "sweep_val", "true"),
+ facetParams);
+ final NamedList<Object> debug = getFacetDebug(params);
+ assertEquals(FacetFieldProcessorByHashDV.class.getSimpleName(), debug.get("processor"));
+ assertNull(debug.get(SWEEP_COLLECTION_DEBUG_KEY));
+ }
+ }
+
+ // simple facet that sorts on an skg stat but uses prelim_sort on count
+ //
+ // all results we test should be the same even if there is another 'skg_extra' stat,
+ // neither skg should be involved in the sweeping at all.
+ for (Facet extra : Arrays.asList(null, new RelatednessFacet(multiStrField(2)+":9", null))) {
+ // choose a single value string so we know both 'dv' (sweep) and 'dvhash' (no sweep) can be specified
+ final TermFacet f = new TermFacet(soloStrField(9), map("limit", 3, "overrequest", 0,
+ "sort", "skg desc",
+ "prelim_sort", "count asc"));
+ if (null != extra) {
+ f.subFacets.put("skg_extra", extra);
+ }
+ final Map<String,TermFacet> facets = new LinkedHashMap<>();
+ facets.put("str", f);
+
+ final SolrParams facetParams
+ = SolrParams.wrapDefaults(params("method_val", "dv",
+ "json.facet", Facet.toJSONFacetParamValue(facets)),
+ baseParams);
+
+ // default sweep as well as any explicit sweep=true/false values should give same results: no sweeping
+ for (SolrParams sweepParams : Arrays.asList(params(),
+ params("sweep_key", RelatednessAgg.SWEEP_COLLECTION,
+ "sweep_val", "false"),
+ params("sweep_key", RelatednessAgg.SWEEP_COLLECTION,
+ "sweep_val", "true"))) {
+ final SolrParams params = SolrParams.wrapDefaults(sweepParams, facetParams);
+
+ final NamedList<Object> debug = getFacetDebug(params);
+ assertEquals(FacetFieldProcessorByArrayDV.class.getSimpleName(), debug.get("processor"));
+ @SuppressWarnings("unchecked")
+ final NamedList<Object> sweep_debug = (NamedList<Object>) debug.get(SWEEP_COLLECTION_DEBUG_KEY);
+ assertNotNull(sweep_debug);
+ assertEquals("count", sweep_debug.get("base"));
+ assertEquals(Collections.emptyList(), sweep_debug.get("accs"));
+ assertEquals(Collections.emptyList(), sweep_debug.get("mapped"));
+ }
+ }
+
+ { // single facet with infinite limit + multiple skgs...
+ // this should trigger MultiAcc collection, causing sweeping on both skg functions
+ //
+ // all results we test should be the same even if there is another 'min' stat,
+ // in each term facet. it shouldn't affect the sweeping/MultiAcc at all.
+ for (Facet extra : Arrays.asList(null, new SumFacet(multiIntField(2)))) {
+ final Map<String,TermFacet> facets = new LinkedHashMap<>();
+ final TermFacet facet = new TermFacet(soloStrField(9), -1, 0, "skg2 desc", null);
+ facet.subFacets.put("skg2", new RelatednessFacet(multiStrField(2)+":9", null));
+ if (null != extra) {
+ facet.subFacets.put("sum", extra);
+ }
+ facets.put("str", facet);
+ final SolrParams facetParams
+ = SolrParams.wrapDefaults(params("method_val", "dv",
+ "json.facet", Facet.toJSONFacetParamValue(facets)),
+ baseParams);
+
+ // both default sweep option and explicit sweep should give same results...
+ for (SolrParams sweepParams : Arrays.asList(params(),
+ params("sweep_key", RelatednessAgg.SWEEP_COLLECTION,
+ "sweep_val", "true"))) {
+ final SolrParams params = SolrParams.wrapDefaults(sweepParams, facetParams);
+
+ final NamedList<Object> debug = getFacetDebug(params);
+ assertEquals(FacetFieldProcessorByArrayDV.class.getSimpleName(), debug.get("processor"));
+ @SuppressWarnings("unchecked")
+ final NamedList<Object> sweep_debug = (NamedList<Object>) debug.get(SWEEP_COLLECTION_DEBUG_KEY);
+ assertNotNull(sweep_debug);
+ assertEquals("count", sweep_debug.get("base"));
+ assertEquals(Arrays.asList("skg!fg","skg!bg","skg2!fg","skg2!bg"), sweep_debug.get("accs"));
+ assertEquals(Arrays.asList("skg","skg2"), sweep_debug.get("mapped"));
+ }
+ }
+ }
+
+ // nested facets that both sort on an skg stat
+ // (set limit + overrequest tiny to keep multishard response manageable)
+ //
+ // all results we test should be the same even if there is another 'skg_extra' stat,
+ // in each term facet. they shouldn't be involved in the sweeping at all.
+ for (Facet extra : Arrays.asList(null, new RelatednessFacet(multiStrField(2)+":9", null))) {
+ // choose single value strings so we know both 'dv' (sweep) and 'dvhash' (no sweep) can be specified
+ // choose 'id' for the parent facet so we are guaranteed some child facets
+ final TermFacet parent = new TermFacet("id", 1, 0, "skg desc", false);
+ final TermFacet child = new TermFacet(soloStrField(7), 1, 0, "skg desc", false);
+ parent.subFacets.put("child", child);
+ if (null != extra) {
+ parent.subFacets.put("skg_extra", extra);
+ child.subFacets.put("skg_extra", extra);
+ }
+ final Map<String,TermFacet> facets = new LinkedHashMap<>();
+ facets.put("parent", parent);
+
+ final SolrParams facetParams
+ = SolrParams.wrapDefaults(params("method_val", "dv",
+ "json.facet", Facet.toJSONFacetParamValue(facets)),
+ baseParams);
+ // both default sweep option and explicit sweep should give same results...
+ for (SolrParams sweepParams : Arrays.asList(params(),
+ params("sweep_key", RelatednessAgg.SWEEP_COLLECTION,
+ "sweep_val", "true"))) {
+ final SolrParams params = SolrParams.wrapDefaults(sweepParams, facetParams);
+
+ final NamedList<Object> parentDebug = getFacetDebug(params);
+ assertEquals("id", parentDebug.get("field"));
+ assertNotNull(parentDebug.get("sub-facet"));
+ // may be multiples from diff shards, just use first one
+ @SuppressWarnings("unchecked")
+ final NamedList<Object> childDebug = ((List<NamedList<Object>>)parentDebug.get("sub-facet")).get(0);
+ assertEquals(soloStrField(7), childDebug.get("field"));
+
+ // these should all be true for both the parent and the child debug..
+ for (NamedList<Object> debug : Arrays.asList(parentDebug, childDebug)) {
+ assertEquals(FacetFieldProcessorByArrayDV.class.getSimpleName(), debug.get("processor"));
+ @SuppressWarnings("unchecked")
+ final NamedList<Object> sweep_debug = (NamedList<Object>) debug.get(SWEEP_COLLECTION_DEBUG_KEY);
+ assertNotNull(sweep_debug);
+ assertEquals("count", sweep_debug.get("base"));
+ assertEquals(Arrays.asList("skg!fg","skg!bg"), sweep_debug.get("accs"));
+ assertEquals(Arrays.asList("skg"), sweep_debug.get("mapped"));
+ }
+ }
+ }
+ }
/**
* returns the <b>FIRST</b> NamedList (under the implicit 'null' FacetQuery) in the "facet-trace" output
@@ -358,7 +564,7 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
}
}
- { // multi-valued facet field w/infinite limit and an extra (non-SKG) stat
+ { // multi-valued facet field w/infinite limit and an extra (non-SKG / non-sweeping) stat
final TermFacet xxx = new TermFacet(multiStrField(12), -1, 0, "count asc", false);
xxx.subFacets.put("sum", new SumFacet(multiIntField(4)));
final Map<String,TermFacet> facets = new LinkedHashMap<>();
@@ -414,7 +620,7 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
for (int limit : Arrays.asList(10, -1)) {
for (String sort : Arrays.asList("count desc", "skg desc", "index asc")) {
for (Boolean refine : Arrays.asList(false, true)) {
- { // 1 additional (non-SKG) stat
+ { // 1 additional (non-SKG / non-sweeping) stat
final TermFacet xxx = new TermFacet(facetFieldName, map("limit", limit,
"overrequest", 0,
"sort", sort,
@@ -440,7 +646,7 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
multiStrField(0) + ":46"),
multiStrField(5)+":9", "*:*");
}
- { // multiple SKGs and a multiple non-SKG stats
+ { // multiple SKGs and a multiple non-SKG / non-sweeping stats
final TermFacet xxx = new TermFacet(facetFieldName, map("limit", limit,
"overrequest", 0,
"sort", sort,
@@ -504,6 +710,8 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
/**
* Given a set of term facets, and top level query strings, asserts that
* the results of these queries are identical even when varying the <code>method_val</code> param
+ * and when varying the <code>{@value RelatednessAgg#SWEEP_COLLECTION}</code> param; either by explicitly setting to
+ * <code>true</code> or <code>false</code> or by changing the param key to not set it at all.
*/
private void assertFacetSKGsAreConsistent(final Map<String,TermFacet> facets,
final String query,
@@ -520,27 +728,33 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
@SuppressWarnings({"rawtypes"})
final NamedList expected = getFacetResponse(basicParams);
- // now loop over all processors and compare them to the "default"...
+ // now loop over all permutations of processors and sweep values and compare them to the "default"...
for (FacetMethod method : EnumSet.allOf(FacetMethod.class)) {
- ModifiableSolrParams options = params("method_val", method.toString().toLowerCase(Locale.ROOT));
+ for (Boolean sweep : Arrays.asList(true, false, null)) {
+ final ModifiableSolrParams options = params("method_val", method.toString().toLowerCase(Locale.ROOT));
+ if (null != sweep) {
+ options.add("sweep_key", RelatednessAgg.SWEEP_COLLECTION);
+ options.add("sweep_val", sweep.toString());
+ }
+
+ @SuppressWarnings({"rawtypes"})
+ final NamedList actual = getFacetResponse(SolrParams.wrapAppended(options, basicParams));
- @SuppressWarnings({"rawtypes"})
- final NamedList actual = getFacetResponse(SolrParams.wrapAppended(options, basicParams));
-
- // we can't rely on a trivial assertEquals() comparison...
- //
- // the order of the sub-facet keys can change between
- // processors. (notably: method:enum vs method:smart when sort:"index asc")
- //
- // NOTE: this doesn't ignore the order of the buckets,
- // it ignores the order of the keys in each bucket...
- final String pathToMismatch = BaseDistributedSearchTestCase.compare
- (expected, actual, 0,
- Collections.singletonMap("buckets", BaseDistributedSearchTestCase.UNORDERED));
- if (null != pathToMismatch) {
- log.error("{}: expected = {}", options, expected);
- log.error("{}: actual = {}", options, actual);
- fail("Mismatch: " + pathToMismatch + " using " + options);
+ // we can't rely on a trivial assertEquals() comparison...
+ //
+ // the order of the sub-facet keys can change between
+ // processors. (notably: method:enum vs method:smart when sort:"index asc")
+ //
+ // NOTE: this doesn't ignore the order of the buckets,
+ // it ignores the order of the keys in each bucket...
+ final String pathToMismatch = BaseDistributedSearchTestCase.compare
+ (expected, actual, 0,
+ Collections.singletonMap("buckets", BaseDistributedSearchTestCase.UNORDERED));
+ if (null != pathToMismatch) {
+ log.error("{}: expected = {}", options, expected);
+ log.error("{}: actual = {}", options, actual);
+ fail("Mismatch: " + pathToMismatch + " using " + options);
+ }
}
}
} catch (AssertionError e) {
@@ -617,6 +831,10 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
* unless they are 'null' in which case <code>$fore</code> and <code>$back</code> refs will be used
* in their place, and must be set as request params (this allows "random" facets to still easily
* trigger the "nested facets re-using the same fore/back set for SKG situation)
+ *
+ * The JSON for all of these facets includes a <code>${sweep_key:xxx}</code> (which will be ignored
+ * by default) and <code>${sweep_val:yyy}</code> which may be set as params on each request to override the
+ * implicit default sweeping behavior of the underlying SKGAcc.
*/
private static final class RelatednessFacet implements Facet, Writable {
public final Map<String,Object> jsonData = new LinkedHashMap<>();
@@ -641,7 +859,7 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
// we don't allow these to be overridden by options, so set them now...
jsonData.put("type", "func");
jsonData.put("func", "relatedness("+f+","+b+")");
-
+ jsonData.put("${sweep_key:xxx}","${sweep_val:yyy}");
}
@Override
public void write(JSONWriter writer) {
diff --git a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
index d12c27f..8a851d0 100644
--- a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
+++ b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
@@ -764,6 +764,54 @@ public class TestJsonFacets extends SolrTestCaseHS {
}
@Test
+ public void testSKGSweepMultiAcc() throws Exception {
+ Client client = Client.localClient();
+ indexSimple(client);
+
+ // simple single level facet w/skg & trivial non-sweeping stat using various sorts & (re)sorting
+ for (String sort : Arrays.asList("sort:'index asc'",
+ "sort:'y desc'",
+ "sort:'z desc'",
+ "sort:'skg desc'",
+ "prelim_sort:'count desc', sort:'index asc'",
+ "prelim_sort:'count desc', sort:'y desc'",
+ "prelim_sort:'count desc', sort:'z desc'",
+ "prelim_sort:'count desc', sort:'skg desc'")) {
+ // the relatedness score of each of our cat_s values is (conviniently) also alphabetical order,
+ // (and the same order as 'sum(num_i) desc' & 'min(num_i) desc')
+ //
+ // So all of these re/sort options should produce identical output
+ // - Testing "index" sort allows the randomized use of "stream" processor as default to be tested.
+ // - Testing (re)sorts on other stats sanity checks code paths where relatedness() is a "defered" Agg
+
+ for (String sweep : Arrays.asList("true", "false")) {
+ // results should be the same even if we disable sweeping...
+ assertJQ(req("q", "cat_s:[* TO *]", "rows", "0",
+ "fore", "where_s:NY", "back", "*:*",
+ "json.facet", ""
+ + "{x: { type: terms, field: 'cat_s', "+sort+", limit:-1, "
+ + " facet: { skg: { type: 'func', func:'relatedness($fore,$back)', "
+ +" "+RelatednessAgg.SWEEP_COLLECTION+": "+sweep+" },"
+ + " y:'sum(num_i)', "
+ +" z:'min(num_i)' } } }")
+ , "facets=={count:5, x:{ buckets:["
+ + " { val:'A', count:2, y:5.0, z:2, "
+ + " skg : { relatedness: 0.00554, "
+ + " foreground_popularity: 0.16667,"
+ + " background_popularity: 0.33333, },"
+ + " }, "
+ + " { val:'B', count:3, y:-3.0, z:-5, "
+ + " skg : { relatedness: 0.0, " // perfectly average and uncorrolated
+ + " foreground_popularity: 0.16667,"
+ + " background_popularity: 0.5 },"
+ + " } ] } } "
+ );
+ }
+ }
+ }
+
+
+ @Test
public void testRepeatedNumerics() throws Exception {
Client client = Client.localClient();
String field = "num_is"; // docValues of multi-valued points field can contain duplicate values... make sure they don't mess up our counts.
diff --git a/solr/solr-ref-guide/src/json-facet-api.adoc b/solr/solr-ref-guide/src/json-facet-api.adoc
index 5f636ad..bc336c3 100644
--- a/solr/solr-ref-guide/src/json-facet-api.adoc
+++ b/solr/solr-ref-guide/src/json-facet-api.adoc
@@ -919,6 +919,10 @@ NOTE: While it's very common to define the Background Set as `\*:*`, or some oth
When using the extended `type:func` syntax for specifying a `relatedness()` aggregation, an optional `min_popularity` (float) option can be used to specify a lower bound on the `foreground_popularity` and `background_popularity` values, that must be met in order for the `relatedness` score to be valid -- If this `min_popularity` is not met, then the `relatedness` score will be `-Infinity`.
+The default implementation for calculating `relatedness()` domain correlation depends on the type of facet being calculated. Generic domain correlation is calculated per-term, by selectively retrieving a DocSet for each bucket-associated query (consulting the `filterCache`) and calculating DocSet intersections with "foreground" and "background" sets. For term facets (especially over high-cardinality fields) this approach can lead to `filterCache` thrashing; accordingly, `relatedness()` o [...]
+
+NOTE: Disabling sweep collection for `relatedness()` stats over low-cardinality fields may yield a performance benefit, provided the `filterCache` is sufficiently large to accommodate an entry for each value in the associated field without inducing thrashing for anticipated use patterns. A reasonable heuristic is that fields of cardinality less than 1,000 _may_ benefit from disabling sweep. This heuristic is _not_ used to determine default behavior, particularly because non-sweep collect [...]
+
[source,json]
----
{ "type": "func",