You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by gs...@apache.org on 2021/08/23 21:54:29 UTC
[lucene-solr] branch branch_8x updated: LUCENE-5309: Optimize facet
counting for single-valued SSDV / StringValueFacetCounts (#2558)
This is an automated email from the ASF dual-hosted git repository.
gsmiller pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/branch_8x by this push:
new d6316a7 LUCENE-5309: Optimize facet counting for single-valued SSDV / StringValueFacetCounts (#2558)
d6316a7 is described below
commit d6316a7f9442e2494829ce52a80ddbb861a19287
Author: Greg Miller <gs...@gmail.com>
AuthorDate: Mon Aug 23 14:54:12 2021 -0700
LUCENE-5309: Optimize facet counting for single-valued SSDV / StringValueFacetCounts (#2558)
---
lucene/CHANGES.txt | 2 +
.../org/apache/lucene/index/MultiDocValues.java | 23 +--
.../lucene/facet/StringValueFacetCounts.java | 89 ++++++---
.../ConcurrentSortedSetDocValuesFacetCounts.java | 71 ++++---
.../sortedset/SortedSetDocValuesFacetCounts.java | 75 +++++---
.../lucene/facet/TestStringValueFacetCounts.java | 89 +++++----
.../sortedset/TestSortedSetDocValuesFacets.java | 214 +++++++++++++--------
7 files changed, 354 insertions(+), 209 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index f2dfd8b..2f3730d 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -65,6 +65,8 @@ Improvements
This prevents caching a query clause when it is much more expensive than
running the top-level query. (Julie Tibshirani)
+* LUCENE-5309: Optimize facet counting for single-valued SSDV / StringValueFacetCounts. (Greg Miller)
+
Optimizations
---------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
index d515b6d..d47589b 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
@@ -85,7 +85,6 @@ public class MultiDocValues {
if (newDocID == NO_MORE_DOCS) {
currentValues = null;
- continue;
} else {
docID = currentLeaf.docBase + newDocID;
return docID;
@@ -216,7 +215,6 @@ public class MultiDocValues {
if (newDocID == NO_MORE_DOCS) {
currentValues = null;
- continue;
} else {
docID = currentLeaf.docBase + newDocID;
return docID;
@@ -335,7 +333,6 @@ public class MultiDocValues {
if (newDocID == NO_MORE_DOCS) {
currentValues = null;
- continue;
} else {
docID = currentLeaf.docBase + newDocID;
return docID;
@@ -428,7 +425,6 @@ public class MultiDocValues {
boolean anyReal = false;
final SortedNumericDocValues[] values = new SortedNumericDocValues[size];
- final int[] starts = new int[size+1];
long totalCost = 0;
for (int i = 0; i < size; i++) {
LeafReaderContext context = leaves.get(i);
@@ -439,10 +435,8 @@ public class MultiDocValues {
anyReal = true;
}
values[i] = v;
- starts[i] = context.docBase;
totalCost += v.cost();
}
- starts[size] = r.maxDoc();
if (anyReal == false) {
return null;
@@ -473,7 +467,6 @@ public class MultiDocValues {
if (newDocID == NO_MORE_DOCS) {
currentValues = null;
- continue;
} else {
docID = currentLeaf.docBase + newDocID;
return docID;
@@ -644,9 +637,9 @@ public class MultiDocValues {
*/
public static class MultiSortedDocValues extends SortedDocValues {
/** docbase for each leaf: parallel with {@link #values} */
- public final int docStarts[];
+ public final int[] docStarts;
/** leaf values */
- public final SortedDocValues values[];
+ public final SortedDocValues[] values;
/** ordinal map mapping ords from <code>values</code> to global ord space */
public final OrdinalMap mapping;
private final long totalCost;
@@ -657,7 +650,8 @@ public class MultiDocValues {
private int docID = -1;
/** Creates a new MultiSortedDocValues over <code>values</code> */
- public MultiSortedDocValues(SortedDocValues values[], int docStarts[], OrdinalMap mapping, long totalCost) throws IOException {
+ public MultiSortedDocValues(
+ SortedDocValues[] values, int[] docStarts, OrdinalMap mapping, long totalCost) {
assert docStarts.length == values.length + 1;
this.values = values;
this.docStarts = docStarts;
@@ -687,7 +681,6 @@ public class MultiDocValues {
if (newDocID == NO_MORE_DOCS) {
currentValues = null;
- continue;
} else {
docID = currentDocStart + newDocID;
return docID;
@@ -771,9 +764,9 @@ public class MultiDocValues {
*/
public static class MultiSortedSetDocValues extends SortedSetDocValues {
/** docbase for each leaf: parallel with {@link #values} */
- public final int docStarts[];
+ public final int[] docStarts;
/** leaf values */
- public final SortedSetDocValues values[];
+ public final SortedSetDocValues[] values;
/** ordinal map mapping ords from <code>values</code> to global ord space */
public final OrdinalMap mapping;
private final long totalCost;
@@ -784,7 +777,8 @@ public class MultiDocValues {
private int docID = -1;
/** Creates a new MultiSortedSetDocValues over <code>values</code> */
- public MultiSortedSetDocValues(SortedSetDocValues values[], int docStarts[], OrdinalMap mapping, long totalCost) throws IOException {
+ public MultiSortedSetDocValues(
+ SortedSetDocValues[] values, int[] docStarts, OrdinalMap mapping, long totalCost) {
assert docStarts.length == values.length + 1;
this.values = values;
this.docStarts = docStarts;
@@ -814,7 +808,6 @@ public class MultiDocValues {
if (newDocID == NO_MORE_DOCS) {
currentValues = null;
- continue;
} else {
docID = currentDocStart + newDocID;
return docID;
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java
index c6592ba..c9b5975 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java
@@ -28,6 +28,7 @@ import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.ConjunctionDISI;
import org.apache.lucene.search.DocIdSetIterator;
@@ -326,16 +327,21 @@ public class StringValueFacetCounts extends Facets {
}
private void countOneSegment(
- SortedSetDocValues segValues, int segmentOrd, FacetsCollector.MatchingDocs hits)
+ SortedSetDocValues multiValues, int segmentOrd, FacetsCollector.MatchingDocs hits)
throws IOException {
+ // It's slightly more efficient to work against SortedDocValues if the field is actually
+ // single-valued (see: LUCENE-5309)
+ SortedDocValues singleValues = DocValues.unwrapSingleton(multiValues);
+ DocIdSetIterator valuesIt = singleValues != null ? singleValues : multiValues;
+
// Intersect hits with doc values unless we're "counting all," in which case we'll iterate
// all doc values for this segment:
DocIdSetIterator it;
if (hits == null) {
- it = segValues;
+ it = valuesIt;
} else {
- it = ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), segValues));
+ it = ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt));
}
// TODO: yet another option is to count all segs
@@ -350,16 +356,23 @@ public class StringValueFacetCounts extends Facets {
if (ordinalMap == null) {
// If there's no ordinal map we don't need to map segment ordinals to globals, so counting
// is very straight-forward:
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- int term = (int) segValues.nextOrd();
- boolean countedDocInTotal = false;
- while (term != SortedSetDocValues.NO_MORE_ORDS) {
- increment(term);
- if (countedDocInTotal == false) {
- totalDocCount++;
+ if (singleValues != null) {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ increment(singleValues.ordValue());
+ totalDocCount++;
+ }
+ } else {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ int term = (int) multiValues.nextOrd();
+ boolean countedDocInTotal = false;
+ while (term != SortedSetDocValues.NO_MORE_ORDS) {
+ increment(term);
+ if (countedDocInTotal == false) {
+ totalDocCount++;
+ countedDocInTotal = true;
+ }
+ term = (int) multiValues.nextOrd();
}
- countedDocInTotal = true;
- term = (int) segValues.nextOrd();
}
}
} else {
@@ -367,20 +380,27 @@ public class StringValueFacetCounts extends Facets {
// depending on how many hits we have to count relative to how many unique doc val ordinals
// there are in this segment:
final LongValues ordMap = ordinalMap.getGlobalOrds(segmentOrd);
- int segmentCardinality = (int) segValues.getValueCount();
+ int segmentCardinality = (int) multiValues.getValueCount();
if (hits != null && hits.totalHits < segmentCardinality / 10) {
// Remap every ord to global ord as we iterate:
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- int term = (int) segValues.nextOrd();
- boolean countedDocInTotal = false;
- while (term != SortedSetDocValues.NO_MORE_ORDS) {
- increment((int) ordMap.get(term));
- if (countedDocInTotal == false) {
- totalDocCount++;
+ if (singleValues != null) {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ increment((int) ordMap.get(singleValues.ordValue()));
+ totalDocCount++;
+ }
+ } else {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ int term = (int) multiValues.nextOrd();
+ boolean countedDocInTotal = false;
+ while (term != SortedSetDocValues.NO_MORE_ORDS) {
+ increment((int) ordMap.get(term));
+ if (countedDocInTotal == false) {
+ totalDocCount++;
+ countedDocInTotal = true;
+ }
+ term = (int) multiValues.nextOrd();
}
- countedDocInTotal = true;
- term = (int) segValues.nextOrd();
}
}
} else {
@@ -389,16 +409,23 @@ public class StringValueFacetCounts extends Facets {
// we expect to visit a large percentage of the unique ordinals (lots of hits relative
// to the segment cardinality), so we count the segment densely:
final int[] segCounts = new int[segmentCardinality];
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- int term = (int) segValues.nextOrd();
- boolean countedDocInTotal = false;
- while (term != SortedSetDocValues.NO_MORE_ORDS) {
- segCounts[term]++;
- if (countedDocInTotal == false) {
- totalDocCount++;
+ if (singleValues != null) {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ segCounts[singleValues.ordValue()]++;
+ totalDocCount++;
+ }
+ } else {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ int term = (int) multiValues.nextOrd();
+ boolean countedDocInTotal = false;
+ while (term != SortedSetDocValues.NO_MORE_ORDS) {
+ segCounts[term]++;
+ if (countedDocInTotal == false) {
+ totalDocCount++;
+ countedDocInTotal = true;
+ }
+ term = (int) multiValues.nextOrd();
}
- countedDocInTotal = true;
- term = (int) segValues.nextOrd();
}
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java
index 4e4a01c..31a7f66 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java
@@ -37,6 +37,7 @@ import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.facet.TopOrdAndIntQueue;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.OrdRange;
+import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
@@ -44,6 +45,7 @@ import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.ConjunctionDISI;
import org.apache.lucene.search.DocIdSetIterator;
@@ -100,7 +102,7 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
return getDim(dim, ordRange, topN);
}
- private final FacetResult getDim(String dim, OrdRange ordRange, int topN) throws IOException {
+ private FacetResult getDim(String dim, OrdRange ordRange, int topN) throws IOException {
TopOrdAndIntQueue q = null;
@@ -165,12 +167,17 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
@Override
public Void call() throws IOException {
- SortedSetDocValues segValues = leafReader.getSortedSetDocValues(field);
- if (segValues == null) {
+ SortedSetDocValues multiValues = DocValues.getSortedSet(leafReader, field);
+ if (multiValues == null) {
// nothing to count here
return null;
}
+ // It's slightly more efficient to work against SortedDocValues if the field is actually
+ // single-valued (see: LUCENE-5309)
+ SortedDocValues singleValues = DocValues.unwrapSingleton(multiValues);
+ DocIdSetIterator valuesIt = singleValues != null ? singleValues : multiValues;
+
// TODO: yet another option is to count all segs
// first, only in seg-ord space, and then do a
// merge-sort-PQ in the end to only "resolve to
@@ -183,34 +190,46 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
DocIdSetIterator it;
if (hits == null) {
// count all
- it = segValues;
+ it = valuesIt;
} else {
- it = ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), segValues));
+ it = ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt));
}
if (ordinalMap != null) {
final LongValues ordMap = ordinalMap.getGlobalOrds(segOrd);
- int numSegOrds = (int) segValues.getValueCount();
+ int numSegOrds = (int) multiValues.getValueCount();
if (hits != null && hits.totalHits < numSegOrds/10) {
// Remap every ord to global ord as we iterate:
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- int term = (int) segValues.nextOrd();
- while (term != SortedSetDocValues.NO_MORE_ORDS) {
- counts.incrementAndGet((int) ordMap.get(term));
- term = (int) segValues.nextOrd();
+ if (singleValues != null) {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ counts.incrementAndGet((int) ordMap.get(singleValues.ordValue()));
+ }
+ } else {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ int term = (int) multiValues.nextOrd();
+ while (term != SortedSetDocValues.NO_MORE_ORDS) {
+ counts.incrementAndGet((int) ordMap.get(term));
+ term = (int) multiValues.nextOrd();
+ }
}
}
} else {
// First count in seg-ord space:
final int[] segCounts = new int[numSegOrds];
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- int term = (int) segValues.nextOrd();
- while (term != SortedSetDocValues.NO_MORE_ORDS) {
- segCounts[term]++;
- term = (int) segValues.nextOrd();
+ if (singleValues != null) {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ segCounts[singleValues.ordValue()]++;
+ }
+ } else {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ int term = (int) multiValues.nextOrd();
+ while (term != SortedSetDocValues.NO_MORE_ORDS) {
+ segCounts[term]++;
+ term = (int) multiValues.nextOrd();
+ }
}
}
@@ -225,11 +244,17 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
} else {
// No ord mapping (e.g., single segment index):
// just aggregate directly into counts:
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- int term = (int) segValues.nextOrd();
- while (term != SortedSetDocValues.NO_MORE_ORDS) {
- counts.incrementAndGet(term);
- term = (int) segValues.nextOrd();
+ if (singleValues != null) {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ counts.incrementAndGet(singleValues.ordValue());
+ }
+ } else {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ int term = (int) multiValues.nextOrd();
+ while (term != SortedSetDocValues.NO_MORE_ORDS) {
+ counts.incrementAndGet(term);
+ term = (int) multiValues.nextOrd();
+ }
}
}
}
@@ -239,7 +264,7 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
}
/** Does all the "real work" of tallying up the counts. */
- private final void count(List<MatchingDocs> matchingDocs) throws IOException, InterruptedException {
+ private void count(List<MatchingDocs> matchingDocs) throws IOException, InterruptedException {
OrdinalMap ordinalMap;
@@ -279,7 +304,7 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
}
/** Does all the "real work" of tallying up the counts. */
- private final void countAll() throws IOException, InterruptedException {
+ private void countAll() throws IOException, InterruptedException {
//System.out.println("ssdv count");
OrdinalMap ordinalMap;
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
index 6df4334..06f47fd 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
@@ -32,6 +32,7 @@ import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.facet.TopOrdAndIntQueue;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.OrdRange;
+import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
@@ -39,6 +40,7 @@ import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.ConjunctionDISI;
import org.apache.lucene.search.DocIdSetIterator;
@@ -106,7 +108,7 @@ public class SortedSetDocValuesFacetCounts extends Facets {
return getDim(dim, ordRange, topN);
}
- private final FacetResult getDim(String dim, OrdRange ordRange, int topN) throws IOException {
+ private FacetResult getDim(String dim, OrdRange ordRange, int topN) throws IOException {
TopOrdAndIntQueue q = null;
@@ -157,17 +159,22 @@ public class SortedSetDocValuesFacetCounts extends Facets {
}
private void countOneSegment(OrdinalMap ordinalMap, LeafReader reader, int segOrd, MatchingDocs hits) throws IOException {
- SortedSetDocValues segValues = reader.getSortedSetDocValues(field);
- if (segValues == null) {
+ SortedSetDocValues multiValues = DocValues.getSortedSet(reader, field);
+ if (multiValues == null) {
// nothing to count
return;
}
+ // It's slightly more efficient to work against SortedDocValues if the field is actually
+ // single-valued (see: LUCENE-5309)
+ SortedDocValues singleValues = DocValues.unwrapSingleton(multiValues);
+ DocIdSetIterator valuesIt = singleValues != null ? singleValues : multiValues;
+
DocIdSetIterator it;
if (hits == null) {
- it = segValues;
+ it = valuesIt;
} else {
- it = ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), segValues));
+ it = ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt));
}
// TODO: yet another option is to count all segs
@@ -182,17 +189,22 @@ public class SortedSetDocValuesFacetCounts extends Facets {
if (ordinalMap != null) {
final LongValues ordMap = ordinalMap.getGlobalOrds(segOrd);
- int numSegOrds = (int) segValues.getValueCount();
+ int numSegOrds = (int) multiValues.getValueCount();
if (hits != null && hits.totalHits < numSegOrds/10) {
//System.out.println(" remap as-we-go");
// Remap every ord to global ord as we iterate:
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- int term = (int) segValues.nextOrd();
- while (term != SortedSetDocValues.NO_MORE_ORDS) {
- //System.out.println(" segOrd=" + segOrd + " ord=" + term + " globalOrd=" + ordinalMap.getGlobalOrd(segOrd, term));
- counts[(int) ordMap.get(term)]++;
- term = (int) segValues.nextOrd();
+ if (singleValues != null) {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ counts[(int) ordMap.get(singleValues.ordValue())]++;
+ }
+ } else {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ int term = (int) multiValues.nextOrd();
+ while (term != SortedSetDocValues.NO_MORE_ORDS) {
+ counts[(int) ordMap.get(term)]++;
+ term = (int) multiValues.nextOrd();
+ }
}
}
} else {
@@ -200,12 +212,17 @@ public class SortedSetDocValuesFacetCounts extends Facets {
// First count in seg-ord space:
final int[] segCounts = new int[numSegOrds];
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- int term = (int) segValues.nextOrd();
- while (term != SortedSetDocValues.NO_MORE_ORDS) {
- //System.out.println(" ord=" + term);
- segCounts[term]++;
- term = (int) segValues.nextOrd();
+ if (singleValues != null) {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ segCounts[singleValues.ordValue()]++;
+ }
+ } else {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ int term = (int) multiValues.nextOrd();
+ while (term != SortedSetDocValues.NO_MORE_ORDS) {
+ segCounts[term]++;
+ term = (int) multiValues.nextOrd();
+ }
}
}
@@ -221,20 +238,26 @@ public class SortedSetDocValuesFacetCounts extends Facets {
} else {
// No ord mapping (e.g., single segment index):
// just aggregate directly into counts:
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- int term = (int) segValues.nextOrd();
- while (term != SortedSetDocValues.NO_MORE_ORDS) {
- counts[term]++;
- term = (int) segValues.nextOrd();
+ if (singleValues != null) {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ counts[singleValues.ordValue()]++;
+ }
+ } else {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ int term = (int) multiValues.nextOrd();
+ while (term != SortedSetDocValues.NO_MORE_ORDS) {
+ counts[term]++;
+ term = (int) multiValues.nextOrd();
+ }
}
}
}
-
+
}
/** Does all the "real work" of tallying up the counts. */
- private final void count(List<MatchingDocs> matchingDocs) throws IOException {
+ private void count(List<MatchingDocs> matchingDocs) throws IOException {
//System.out.println("ssdv count");
OrdinalMap ordinalMap;
@@ -265,7 +288,7 @@ public class SortedSetDocValuesFacetCounts extends Facets {
}
/** Does all the "real work" of tallying up the counts. */
- private final void countAll() throws IOException {
+ private void countAll() throws IOException {
//System.out.println("ssdv count");
OrdinalMap ordinalMap;
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestStringValueFacetCounts.java b/lucene/facet/src/test/org/apache/lucene/facet/TestStringValueFacetCounts.java
index 180a80c..950b9e7 100644
--- a/lucene/facet/src/test/org/apache/lucene/facet/TestStringValueFacetCounts.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/TestStringValueFacetCounts.java
@@ -248,51 +248,62 @@ public class TestStringValueFacetCounts extends FacetTestCase {
public void testRandom() throws Exception {
- Directory dir = newDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
-
- // Build up test data
- String[] tokens = getRandomTokens(50); // 50 random values to pick from
- int numDocs = atLeast(1000);
- int expectedTotalDocCount = 0;
- Map<String, Integer> expected = new HashMap<>();
- for (int i = 0; i < numDocs; i++) {
- Document doc = new Document();
- int valCount = random().nextInt(5); // each doc can have up to 5 values
- Set<String> docVals = new HashSet<>();
- for (int j = 0; j < valCount; j++) {
- int tokenIdx = random().nextInt(tokens.length);
- String val = tokens[tokenIdx];
- // values should only be counted once per document
- if (docVals.contains(val) == false) {
- expected.put(val, expected.getOrDefault(val, 0) + 1);
+ int fullIterations = LuceneTestCase.TEST_NIGHTLY ? 20 : 3;
+ for (int iter = 0; iter < fullIterations; iter++) {
+ Directory dir = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+
+ // Build up test data
+ String[] tokens = getRandomTokens(50); // 50 random values to pick from
+ int numDocs = atLeast(1000);
+ int expectedTotalDocCount = 0;
+ Map<String, Integer> expected = new HashMap<>();
+ for (int i = 0; i < numDocs; i++) {
+ Document doc = new Document();
+ // Intent: sometimes restrict docs to be single-valued, but mostly allow up to 5 values.
+ // NOTE(review): nextInt's bound is exclusive: 5 gives 0-4 values and 1 always gives 0:
+ int maxValuesPerDoc;
+ if (random().nextInt(10) < 8) {
+ maxValuesPerDoc = 5;
+ } else {
+ maxValuesPerDoc = 1;
+ }
+ int valCount = random().nextInt(maxValuesPerDoc);
+ Set<String> docVals = new HashSet<>();
+ for (int j = 0; j < valCount; j++) {
+ int tokenIdx = random().nextInt(tokens.length);
+ String val = tokens[tokenIdx];
+ // values should only be counted once per document
+ if (docVals.contains(val) == false) {
+ expected.put(val, expected.getOrDefault(val, 0) + 1);
+ }
+ docVals.add(val);
+ doc.add(new SortedSetDocValuesField("field", new BytesRef(val)));
+ }
+ // only docs with at least one value in the field should be counted in the total
+ if (docVals.isEmpty() == false) {
+ expectedTotalDocCount++;
+ }
+ writer.addDocument(doc);
+ if (random().nextInt(10) == 0) {
+ writer.commit(); // sometimes commit
}
- docVals.add(val);
- doc.add(new SortedSetDocValuesField("field", new BytesRef(val)));
- }
- // only docs with at least one value in the field should be counted in the total
- if (docVals.isEmpty() == false) {
- expectedTotalDocCount++;
- }
- writer.addDocument(doc);
- if (random().nextInt(10) == 0) {
- writer.commit(); // sometimes commit
}
- }
- IndexSearcher searcher = newSearcher(writer.getReader());
- writer.close();
+ IndexSearcher searcher = newSearcher(writer.getReader());
+ writer.close();
- // run iterations with random values of topN
- int iterations = LuceneTestCase.TEST_NIGHTLY ? 10_000 : 50;
- int[] topNs = new int[iterations];
- for (int i = 0; i < iterations; i++) {
- topNs[i] = atLeast(1);
- }
+ // run iterations with random values of topN
+ int iterations = LuceneTestCase.TEST_NIGHTLY ? 10_000 : 50;
+ int[] topNs = new int[iterations];
+ for (int i = 0; i < iterations; i++) {
+ topNs[i] = atLeast(1);
+ }
- checkFacetResult(expected, expectedTotalDocCount, searcher, topNs);
+ checkFacetResult(expected, expectedTotalDocCount, searcher, topNs);
- IOUtils.close(searcher.getIndexReader(), dir);
+ IOUtils.close(searcher.getIndexReader(), dir);
+ }
}
private void checkFacetResult(
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java b/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java
index a9a388d..ab4f182 100644
--- a/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java
@@ -46,6 +46,7 @@ import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.NamedThreadFactory;
import org.apache.lucene.util.TestUtil;
@@ -101,6 +102,59 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
IOUtils.close(searcher.getIndexReader(), dir);
}
+ public void testBasicSingleValued() throws Exception {
+ Directory dir = newDirectory();
+
+ FacetsConfig config = new FacetsConfig();
+ config.setMultiValued("a", false);
+ RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+
+ Document doc = new Document();
+ doc.add(new SortedSetDocValuesFacetField("a", "foo"));
+ doc.add(new SortedSetDocValuesFacetField("b", "bar"));
+ writer.addDocument(config.build(doc));
+ doc = new Document();
+ doc.add(new SortedSetDocValuesFacetField("a", "foo"));
+ writer.addDocument(config.build(doc));
+ if (random().nextBoolean()) {
+ writer.commit();
+ }
+
+ doc = new Document();
+ doc.add(new SortedSetDocValuesFacetField("a", "baz"));
+ writer.addDocument(config.build(doc));
+
+ // NRT open
+ IndexSearcher searcher = newSearcher(writer.getReader());
+
+ // Per-top-reader state:
+ SortedSetDocValuesReaderState state =
+ new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader());
+
+ ExecutorService exec = randomExecutorServiceOrNull();
+ Facets facets = getAllFacets(searcher, state, exec);
+
+ assertEquals(
+ "dim=a path=[] value=3 childCount=2\n foo (2)\n baz (1)\n",
+ facets.getTopChildren(10, "a").toString());
+ assertEquals(
+ "dim=b path=[] value=1 childCount=1\n bar (1)\n",
+ facets.getTopChildren(10, "b").toString());
+
+ // DrillDown:
+ DrillDownQuery q = new DrillDownQuery(config);
+ q.add("a", "foo");
+ q.add("b", "bar");
+ TopDocs hits = searcher.search(q, 1);
+ assertEquals(1, hits.totalHits.value);
+
+ if (exec != null) {
+ exec.shutdownNow();
+ }
+ writer.close();
+ IOUtils.close(searcher.getIndexReader(), dir);
+ }
+
public void testDrillDownOptions() throws Exception {
Directory dir = newDirectory();
@@ -339,99 +393,109 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
}
+ // Randomized test: indexes random docs with facet dims, then for random "content" tokens compares
+ // the facet results from getAllDims against counts tallied by scanning the test docs directly.
public void testRandom() throws Exception {
- String[] tokens = getRandomTokens(10);
- Directory indexDir = newDirectory();
- Directory taxoDir = newDirectory();
-
- RandomIndexWriter w = new RandomIndexWriter(random(), indexDir);
- FacetsConfig config = new FacetsConfig();
- int numDocs = atLeast(1000);
- int numDims = TestUtil.nextInt(random(), 1, 7);
- List<TestDoc> testDocs = getRandomDocs(tokens, numDocs, numDims);
- for(TestDoc testDoc : testDocs) {
- Document doc = new Document();
- doc.add(newStringField("content", testDoc.content, Field.Store.NO));
- for(int j=0;j<numDims;j++) {
- if (testDoc.dims[j] != null) {
- doc.add(new SortedSetDocValuesFacetField("dim" + j, testDoc.dims[j]));
+ // Repeat the whole build-index-and-verify cycle: 20 times when TEST_NIGHTLY is set, 3 otherwise.
+ int fullIterations = LuceneTestCase.TEST_NIGHTLY ? 20 : 3;
+ for (int fullIter = 0; fullIter < fullIterations; fullIter++) {
+ String[] tokens = getRandomTokens(10);
+ Directory indexDir = newDirectory();
+ // NOTE(review): taxoDir is only closed at the end of the iteration; nothing else in this
+ // method uses it.
+ Directory taxoDir = newDirectory();
+
+ RandomIndexWriter w = new RandomIndexWriter(random(), indexDir);
+ FacetsConfig config = new FacetsConfig();
+ int numDocs = atLeast(1000);
+ // Most of the time allow up to 7 dims per doc, but occasionally limit all docs to a single
+ // dim:
+ int numDims;
+ if (random().nextInt(10) < 8) {
+ numDims = TestUtil.nextInt(random(), 1, 7);
+ } else {
+ numDims = 1;
+ }
+ List<TestDoc> testDocs = getRandomDocs(tokens, numDocs, numDims);
+ // Index each test doc: its content token plus one facet field per non-null dim value.
+ for (TestDoc testDoc : testDocs) {
+ Document doc = new Document();
+ doc.add(newStringField("content", testDoc.content, Field.Store.NO));
+ for (int j = 0; j < numDims; j++) {
+ if (testDoc.dims[j] != null) {
+ doc.add(new SortedSetDocValuesFacetField("dim" + j, testDoc.dims[j]));
+ }
}
+ w.addDocument(config.build(doc));
}
- w.addDocument(config.build(doc));
- }
- // NRT open
- IndexSearcher searcher = newSearcher(w.getReader());
-
- // Per-top-reader state:
- SortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader());
- ExecutorService exec = randomExecutorServiceOrNull();
+ // NRT open: the reader is obtained directly from the writer
+ IndexSearcher searcher = newSearcher(w.getReader());
- int iters = atLeast(100);
- for(int iter=0;iter<iters;iter++) {
- String searchToken = tokens[random().nextInt(tokens.length)];
- if (VERBOSE) {
- System.out.println("\nTEST: iter content=" + searchToken);
- }
- FacetsCollector fc = new FacetsCollector();
- FacetsCollector.search(searcher, new TermQuery(new Term("content", searchToken)), 10, fc);
- Facets facets;
- if (exec != null) {
- facets = new ConcurrentSortedSetDocValuesFacetCounts(state, fc, exec);
- } else {
- facets = new SortedSetDocValuesFacetCounts(state, fc);
- }
+ // Per-top-reader state:
+ SortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader());
+ ExecutorService exec = randomExecutorServiceOrNull();
- // Slow, yet hopefully bug-free, faceting:
- @SuppressWarnings({"rawtypes","unchecked"}) Map<String,Integer>[] expectedCounts = new HashMap[numDims];
- for(int i=0;i<numDims;i++) {
- expectedCounts[i] = new HashMap<>();
- }
+ // Each inner iteration picks one random content token and verifies the facets of the docs
+ // matching it.
+ int iters = atLeast(100);
+ for (int iter = 0; iter < iters; iter++) {
+ String searchToken = tokens[random().nextInt(tokens.length)];
+ if (VERBOSE) {
+ System.out.println("\nTEST: iter content=" + searchToken);
+ }
+ FacetsCollector fc = new FacetsCollector();
+ FacetsCollector.search(searcher, new TermQuery(new Term("content", searchToken)), 10, fc);
+ // Use the concurrent implementation only when an executor was provided:
+ Facets facets;
+ if (exec != null) {
+ facets = new ConcurrentSortedSetDocValuesFacetCounts(state, fc, exec);
+ } else {
+ facets = new SortedSetDocValuesFacetCounts(state, fc);
+ }
- for(TestDoc doc : testDocs) {
- if (doc.content.equals(searchToken)) {
- for(int j=0;j<numDims;j++) {
- if (doc.dims[j] != null) {
- Integer v = expectedCounts[j].get(doc.dims[j]);
- if (v == null) {
- expectedCounts[j].put(doc.dims[j], 1);
- } else {
- expectedCounts[j].put(doc.dims[j], v.intValue() + 1);
+ // Slow, yet hopefully bug-free, faceting:
+ // (one label -> count map per dim, tallied by scanning every test doc whose content matches)
+ @SuppressWarnings({"rawtypes", "unchecked"}) Map<String, Integer>[] expectedCounts = new HashMap[numDims];
+ for (int i = 0; i < numDims; i++) {
+ expectedCounts[i] = new HashMap<>();
+ }
+
+ for (TestDoc doc : testDocs) {
+ if (doc.content.equals(searchToken)) {
+ for (int j = 0; j < numDims; j++) {
+ if (doc.dims[j] != null) {
+ Integer v = expectedCounts[j].get(doc.dims[j]);
+ if (v == null) {
+ expectedCounts[j].put(doc.dims[j], 1);
+ } else {
+ expectedCounts[j].put(doc.dims[j], v.intValue() + 1);
+ }
}
}
}
}
- }
- List<FacetResult> expected = new ArrayList<>();
- for(int i=0;i<numDims;i++) {
- List<LabelAndValue> labelValues = new ArrayList<>();
- int totCount = 0;
- for(Map.Entry<String,Integer> ent : expectedCounts[i].entrySet()) {
- labelValues.add(new LabelAndValue(ent.getKey(), ent.getValue()));
- totCount += ent.getValue();
- }
- sortLabelValues(labelValues);
- if (totCount > 0) {
- expected.add(new FacetResult("dim" + i, new String[0], totCount, labelValues.toArray(new LabelAndValue[labelValues.size()]), labelValues.size()));
+ // Turn the per-dim tallies into the expected FacetResult list.
+ List<FacetResult> expected = new ArrayList<>();
+ for (int i = 0; i < numDims; i++) {
+ List<LabelAndValue> labelValues = new ArrayList<>();
+ int totCount = 0;
+ for (Map.Entry<String, Integer> ent : expectedCounts[i].entrySet()) {
+ labelValues.add(new LabelAndValue(ent.getKey(), ent.getValue()));
+ totCount += ent.getValue();
+ }
+ sortLabelValues(labelValues);
+ // Dims with no matching docs are left out of the expected results:
+ if (totCount > 0) {
+ expected.add(new FacetResult("dim" + i, new String[0], totCount, labelValues.toArray(new LabelAndValue[labelValues.size()]), labelValues.size()));
+ }
}
- }
- // Sort by highest value, tie break by value:
- sortFacetResults(expected);
+ // Sort by highest value, tie break by value:
+ sortFacetResults(expected);
- List<FacetResult> actual = facets.getAllDims(10);
+ List<FacetResult> actual = facets.getAllDims(10);
- // Messy: fixup ties
- //sortTies(actual);
+ // Messy: fixup ties
+ //sortTies(actual);
- assertEquals(expected, actual);
- }
+ assertEquals(expected, actual);
+ }
- if (exec != null) {
- exec.shutdownNow();
+ // Per-iteration cleanup: stop the executor (if any), then close writer, reader and directories.
+ if (exec != null) {
+ exec.shutdownNow();
+ }
+ w.close();
+ IOUtils.close(searcher.getIndexReader(), indexDir, taxoDir);
}
- w.close();
- IOUtils.close(searcher.getIndexReader(), indexDir, taxoDir);
}
private static Facets getAllFacets(IndexSearcher searcher, SortedSetDocValuesReaderState state,