You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by gs...@apache.org on 2021/08/23 17:01:29 UTC
[lucene] branch main updated: LUCENE-5309: Optimize facet counting for single-valued SSDV / StringValueFacetCounts (#255)
This is an automated email from the ASF dual-hosted git repository.
gsmiller pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new 46fa09d LUCENE-5309: Optimize facet counting for single-valued SSDV / StringValueFacetCounts (#255)
46fa09d is described below
commit 46fa09d265b7ad7945bf06d13c5ffa4ee99407a6
Author: Greg Miller <gs...@gmail.com>
AuthorDate: Mon Aug 23 10:01:23 2021 -0700
LUCENE-5309: Optimize facet counting for single-valued SSDV / StringValueFacetCounts (#255)
---
lucene/CHANGES.txt | 2 +
.../org/apache/lucene/index/MultiDocValues.java | 23 +--
.../lucene/facet/StringValueFacetCounts.java | 89 +++++---
.../ConcurrentSortedSetDocValuesFacetCounts.java | 72 ++++---
.../sortedset/SortedSetDocValuesFacetCounts.java | 72 ++++---
.../lucene/facet/TestStringValueFacetCounts.java | 89 ++++----
.../sortedset/TestSortedSetDocValuesFacets.java | 228 +++++++++++++--------
7 files changed, 358 insertions(+), 217 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index ada3634..a1e25bf 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -415,6 +415,8 @@ Improvements
This prevents caching a query clause when it is much more expensive than
running the top-level query. (Julie Tibshirani)
+* LUCENE-5309: Optimize facet counting for single-valued SSDV / StringValueFacetCounts. (Greg Miller)
+
Optimizations
---------------------
* LUCENE-9996: Improved memory efficiency of IndexWriter's RAM buffer, in
diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
index 8d1c143..a598142 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
@@ -82,7 +82,6 @@ public class MultiDocValues {
if (newDocID == NO_MORE_DOCS) {
currentValues = null;
- continue;
} else {
docID = currentLeaf.docBase + newDocID;
return docID;
@@ -221,7 +220,6 @@ public class MultiDocValues {
if (newDocID == NO_MORE_DOCS) {
currentValues = null;
- continue;
} else {
docID = currentLeaf.docBase + newDocID;
return docID;
@@ -350,7 +348,6 @@ public class MultiDocValues {
if (newDocID == NO_MORE_DOCS) {
currentValues = null;
- continue;
} else {
docID = currentLeaf.docBase + newDocID;
return docID;
@@ -452,7 +449,6 @@ public class MultiDocValues {
boolean anyReal = false;
final SortedNumericDocValues[] values = new SortedNumericDocValues[size];
- final int[] starts = new int[size + 1];
long totalCost = 0;
for (int i = 0; i < size; i++) {
LeafReaderContext context = leaves.get(i);
@@ -463,10 +459,8 @@ public class MultiDocValues {
anyReal = true;
}
values[i] = v;
- starts[i] = context.docBase;
totalCost += v.cost();
}
- starts[size] = r.maxDoc();
if (anyReal == false) {
return null;
@@ -497,7 +491,6 @@ public class MultiDocValues {
if (newDocID == NO_MORE_DOCS) {
currentValues = null;
- continue;
} else {
docID = currentLeaf.docBase + newDocID;
return docID;
@@ -680,9 +673,9 @@ public class MultiDocValues {
*/
public static class MultiSortedDocValues extends SortedDocValues {
/** docbase for each leaf: parallel with {@link #values} */
- public final int docStarts[];
+ public final int[] docStarts;
/** leaf values */
- public final SortedDocValues values[];
+ public final SortedDocValues[] values;
/** ordinal map mapping ords from <code>values</code> to global ord space */
public final OrdinalMap mapping;
@@ -695,8 +688,7 @@ public class MultiDocValues {
/** Creates a new MultiSortedDocValues over <code>values</code> */
public MultiSortedDocValues(
- SortedDocValues values[], int docStarts[], OrdinalMap mapping, long totalCost)
- throws IOException {
+ SortedDocValues[] values, int[] docStarts, OrdinalMap mapping, long totalCost) {
assert docStarts.length == values.length + 1;
this.values = values;
this.docStarts = docStarts;
@@ -726,7 +718,6 @@ public class MultiDocValues {
if (newDocID == NO_MORE_DOCS) {
currentValues = null;
- continue;
} else {
docID = currentDocStart + newDocID;
return docID;
@@ -819,9 +810,9 @@ public class MultiDocValues {
*/
public static class MultiSortedSetDocValues extends SortedSetDocValues {
/** docbase for each leaf: parallel with {@link #values} */
- public final int docStarts[];
+ public final int[] docStarts;
/** leaf values */
- public final SortedSetDocValues values[];
+ public final SortedSetDocValues[] values;
/** ordinal map mapping ords from <code>values</code> to global ord space */
public final OrdinalMap mapping;
@@ -834,8 +825,7 @@ public class MultiDocValues {
/** Creates a new MultiSortedSetDocValues over <code>values</code> */
public MultiSortedSetDocValues(
- SortedSetDocValues values[], int docStarts[], OrdinalMap mapping, long totalCost)
- throws IOException {
+ SortedSetDocValues[] values, int[] docStarts, OrdinalMap mapping, long totalCost) {
assert docStarts.length == values.length + 1;
this.values = values;
this.docStarts = docStarts;
@@ -865,7 +855,6 @@ public class MultiDocValues {
if (newDocID == NO_MORE_DOCS) {
currentValues = null;
- continue;
} else {
docID = currentDocStart + newDocID;
return docID;
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java
index 6100d5f..8dc223c 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java
@@ -28,6 +28,7 @@ import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.ConjunctionUtils;
import org.apache.lucene.search.DocIdSetIterator;
@@ -326,16 +327,21 @@ public class StringValueFacetCounts extends Facets {
}
private void countOneSegment(
- SortedSetDocValues segValues, int segmentOrd, FacetsCollector.MatchingDocs hits)
+ SortedSetDocValues multiValues, int segmentOrd, FacetsCollector.MatchingDocs hits)
throws IOException {
+ // It's slightly more efficient to work against SortedDocValues if the field is actually
+ // single-valued (see: LUCENE-5309)
+ SortedDocValues singleValues = DocValues.unwrapSingleton(multiValues);
+ DocIdSetIterator valuesIt = singleValues != null ? singleValues : multiValues;
+
// Intersect hits with doc values unless we're "counting all," in which case we'll iterate
// all doc values for this segment:
DocIdSetIterator it;
if (hits == null) {
- it = segValues;
+ it = valuesIt;
} else {
- it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), segValues));
+ it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt));
}
// TODO: yet another option is to count all segs
@@ -350,16 +356,23 @@ public class StringValueFacetCounts extends Facets {
if (ordinalMap == null) {
// If there's no ordinal map we don't need to map segment ordinals to globals, so counting
// is very straight-forward:
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- int term = (int) segValues.nextOrd();
- boolean countedDocInTotal = false;
- while (term != SortedSetDocValues.NO_MORE_ORDS) {
- increment(term);
- if (countedDocInTotal == false) {
- totalDocCount++;
+ if (singleValues != null) {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ increment(singleValues.ordValue());
+ totalDocCount++;
+ }
+ } else {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ int term = (int) multiValues.nextOrd();
+ boolean countedDocInTotal = false;
+ while (term != SortedSetDocValues.NO_MORE_ORDS) {
+ increment(term);
+ if (countedDocInTotal == false) {
+ totalDocCount++;
+ countedDocInTotal = true;
+ }
+ term = (int) multiValues.nextOrd();
}
- countedDocInTotal = true;
- term = (int) segValues.nextOrd();
}
}
} else {
@@ -367,20 +380,27 @@ public class StringValueFacetCounts extends Facets {
// depending on how many hits we have to count relative to how many unique doc val ordinals
// there are in this segment:
final LongValues ordMap = ordinalMap.getGlobalOrds(segmentOrd);
- int segmentCardinality = (int) segValues.getValueCount();
+ int segmentCardinality = (int) multiValues.getValueCount();
if (hits != null && hits.totalHits < segmentCardinality / 10) {
// Remap every ord to global ord as we iterate:
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- int term = (int) segValues.nextOrd();
- boolean countedDocInTotal = false;
- while (term != SortedSetDocValues.NO_MORE_ORDS) {
- increment((int) ordMap.get(term));
- if (countedDocInTotal == false) {
- totalDocCount++;
+ if (singleValues != null) {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ increment((int) ordMap.get(singleValues.ordValue()));
+ totalDocCount++;
+ }
+ } else {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ int term = (int) multiValues.nextOrd();
+ boolean countedDocInTotal = false;
+ while (term != SortedSetDocValues.NO_MORE_ORDS) {
+ increment((int) ordMap.get(term));
+ if (countedDocInTotal == false) {
+ totalDocCount++;
+ countedDocInTotal = true;
+ }
+ term = (int) multiValues.nextOrd();
}
- countedDocInTotal = true;
- term = (int) segValues.nextOrd();
}
}
} else {
@@ -389,16 +409,23 @@ public class StringValueFacetCounts extends Facets {
// we expect to visit a large percentage of the unique ordinals (lots of hits relative
// to the segment cardinality), so we count the segment densely:
final int[] segCounts = new int[segmentCardinality];
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- int term = (int) segValues.nextOrd();
- boolean countedDocInTotal = false;
- while (term != SortedSetDocValues.NO_MORE_ORDS) {
- segCounts[term]++;
- if (countedDocInTotal == false) {
- totalDocCount++;
+ if (singleValues != null) {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ segCounts[singleValues.ordValue()]++;
+ totalDocCount++;
+ }
+ } else {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ int term = (int) multiValues.nextOrd();
+ boolean countedDocInTotal = false;
+ while (term != SortedSetDocValues.NO_MORE_ORDS) {
+ segCounts[term]++;
+ if (countedDocInTotal == false) {
+ totalDocCount++;
+ countedDocInTotal = true;
+ }
+ term = (int) multiValues.nextOrd();
}
- countedDocInTotal = true;
- term = (int) segValues.nextOrd();
}
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java
index 40f0bef..59efa80 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java
@@ -36,6 +36,7 @@ import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.facet.TopOrdAndIntQueue;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.OrdRange;
+import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
@@ -43,6 +44,7 @@ import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.ConjunctionUtils;
import org.apache.lucene.search.DocIdSetIterator;
@@ -103,7 +105,7 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
return getDim(dim, ordRange, topN);
}
- private final FacetResult getDim(String dim, OrdRange ordRange, int topN) throws IOException {
+ private FacetResult getDim(String dim, OrdRange ordRange, int topN) throws IOException {
TopOrdAndIntQueue q = null;
@@ -168,12 +170,17 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
@Override
public Void call() throws IOException {
- SortedSetDocValues segValues = leafReader.getSortedSetDocValues(field);
- if (segValues == null) {
+ SortedSetDocValues multiValues = DocValues.getSortedSet(leafReader, field);
+ if (multiValues == null) {
// nothing to count here
return null;
}
+ // It's slightly more efficient to work against SortedDocValues if the field is actually
+ // single-valued (see: LUCENE-5309)
+ SortedDocValues singleValues = DocValues.unwrapSingleton(multiValues);
+ DocIdSetIterator valuesIt = singleValues != null ? singleValues : multiValues;
+
// TODO: yet another option is to count all segs
// first, only in seg-ord space, and then do a
// merge-sort-PQ in the end to only "resolve to
@@ -186,34 +193,46 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
DocIdSetIterator it;
if (hits == null) {
// count all
- it = segValues;
+ it = valuesIt;
} else {
- it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), segValues));
+ it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt));
}
if (ordinalMap != null) {
final LongValues ordMap = ordinalMap.getGlobalOrds(segOrd);
- int numSegOrds = (int) segValues.getValueCount();
+ int numSegOrds = (int) multiValues.getValueCount();
if (hits != null && hits.totalHits < numSegOrds / 10) {
// Remap every ord to global ord as we iterate:
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- int term = (int) segValues.nextOrd();
- while (term != SortedSetDocValues.NO_MORE_ORDS) {
- counts.incrementAndGet((int) ordMap.get(term));
- term = (int) segValues.nextOrd();
+ if (singleValues != null) {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ counts.incrementAndGet((int) ordMap.get(singleValues.ordValue()));
+ }
+ } else {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ int term = (int) multiValues.nextOrd();
+ while (term != SortedSetDocValues.NO_MORE_ORDS) {
+ counts.incrementAndGet((int) ordMap.get(term));
+ term = (int) multiValues.nextOrd();
+ }
}
}
} else {
// First count in seg-ord space:
final int[] segCounts = new int[numSegOrds];
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- int term = (int) segValues.nextOrd();
- while (term != SortedSetDocValues.NO_MORE_ORDS) {
- segCounts[term]++;
- term = (int) segValues.nextOrd();
+ if (singleValues != null) {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ segCounts[singleValues.ordValue()]++;
+ }
+ } else {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ int term = (int) multiValues.nextOrd();
+ while (term != SortedSetDocValues.NO_MORE_ORDS) {
+ segCounts[term]++;
+ term = (int) multiValues.nextOrd();
+ }
}
}
@@ -228,11 +247,17 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
} else {
// No ord mapping (e.g., single segment index):
// just aggregate directly into counts:
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- int term = (int) segValues.nextOrd();
- while (term != SortedSetDocValues.NO_MORE_ORDS) {
- counts.incrementAndGet(term);
- term = (int) segValues.nextOrd();
+ if (singleValues != null) {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ counts.incrementAndGet(singleValues.ordValue());
+ }
+ } else {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ int term = (int) multiValues.nextOrd();
+ while (term != SortedSetDocValues.NO_MORE_ORDS) {
+ counts.incrementAndGet(term);
+ term = (int) multiValues.nextOrd();
+ }
}
}
}
@@ -242,8 +267,7 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
}
/** Does all the "real work" of tallying up the counts. */
- private final void count(List<MatchingDocs> matchingDocs)
- throws IOException, InterruptedException {
+ private void count(List<MatchingDocs> matchingDocs) throws IOException, InterruptedException {
OrdinalMap ordinalMap;
@@ -286,7 +310,7 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
}
/** Does all the "real work" of tallying up the counts. */
- private final void countAll() throws IOException, InterruptedException {
+ private void countAll() throws IOException, InterruptedException {
OrdinalMap ordinalMap;
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
index a3b72bc..f0a573e 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
@@ -31,6 +31,7 @@ import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.facet.TopOrdAndIntQueue;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.OrdRange;
+import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
@@ -38,6 +39,7 @@ import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.ConjunctionUtils;
import org.apache.lucene.search.DocIdSetIterator;
@@ -101,7 +103,7 @@ public class SortedSetDocValuesFacetCounts extends Facets {
return getDim(dim, ordRange, topN);
}
- private final FacetResult getDim(String dim, OrdRange ordRange, int topN) throws IOException {
+ private FacetResult getDim(String dim, OrdRange ordRange, int topN) throws IOException {
TopOrdAndIntQueue q = null;
@@ -151,17 +153,22 @@ public class SortedSetDocValuesFacetCounts extends Facets {
private void countOneSegment(
OrdinalMap ordinalMap, LeafReader reader, int segOrd, MatchingDocs hits) throws IOException {
- SortedSetDocValues segValues = reader.getSortedSetDocValues(field);
- if (segValues == null) {
+ SortedSetDocValues multiValues = DocValues.getSortedSet(reader, field);
+ if (multiValues == null) {
// nothing to count
return;
}
+ // It's slightly more efficient to work against SortedDocValues if the field is actually
+ // single-valued (see: LUCENE-5309)
+ SortedDocValues singleValues = DocValues.unwrapSingleton(multiValues);
+ DocIdSetIterator valuesIt = singleValues != null ? singleValues : multiValues;
+
DocIdSetIterator it;
if (hits == null) {
- it = segValues;
+ it = valuesIt;
} else {
- it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), segValues));
+ it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt));
}
// TODO: yet another option is to count all segs
@@ -176,26 +183,37 @@ public class SortedSetDocValuesFacetCounts extends Facets {
if (ordinalMap != null) {
final LongValues ordMap = ordinalMap.getGlobalOrds(segOrd);
- int numSegOrds = (int) segValues.getValueCount();
+ int numSegOrds = (int) multiValues.getValueCount();
if (hits != null && hits.totalHits < numSegOrds / 10) {
// Remap every ord to global ord as we iterate:
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- int term = (int) segValues.nextOrd();
- while (term != SortedSetDocValues.NO_MORE_ORDS) {
- // ordinalMap.getGlobalOrd(segOrd, term));
- counts[(int) ordMap.get(term)]++;
- term = (int) segValues.nextOrd();
+ if (singleValues != null) {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ counts[(int) ordMap.get(singleValues.ordValue())]++;
+ }
+ } else {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ int term = (int) multiValues.nextOrd();
+ while (term != SortedSetDocValues.NO_MORE_ORDS) {
+ counts[(int) ordMap.get(term)]++;
+ term = (int) multiValues.nextOrd();
+ }
}
}
} else {
// First count in seg-ord space:
final int[] segCounts = new int[numSegOrds];
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- int term = (int) segValues.nextOrd();
- while (term != SortedSetDocValues.NO_MORE_ORDS) {
- segCounts[term]++;
- term = (int) segValues.nextOrd();
+ if (singleValues != null) {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ segCounts[singleValues.ordValue()]++;
+ }
+ } else {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ int term = (int) multiValues.nextOrd();
+ while (term != SortedSetDocValues.NO_MORE_ORDS) {
+ segCounts[term]++;
+ term = (int) multiValues.nextOrd();
+ }
}
}
@@ -211,18 +229,24 @@ public class SortedSetDocValuesFacetCounts extends Facets {
} else {
// No ord mapping (e.g., single segment index):
// just aggregate directly into counts:
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- int term = (int) segValues.nextOrd();
- while (term != SortedSetDocValues.NO_MORE_ORDS) {
- counts[term]++;
- term = (int) segValues.nextOrd();
+ if (singleValues != null) {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ counts[singleValues.ordValue()]++;
+ }
+ } else {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ int term = (int) multiValues.nextOrd();
+ while (term != SortedSetDocValues.NO_MORE_ORDS) {
+ counts[term]++;
+ term = (int) multiValues.nextOrd();
+ }
}
}
}
}
/** Does all the "real work" of tallying up the counts. */
- private final void count(List<MatchingDocs> matchingDocs) throws IOException {
+ private void count(List<MatchingDocs> matchingDocs) throws IOException {
OrdinalMap ordinalMap;
@@ -253,7 +277,7 @@ public class SortedSetDocValuesFacetCounts extends Facets {
}
/** Does all the "real work" of tallying up the counts. */
- private final void countAll() throws IOException {
+ private void countAll() throws IOException {
OrdinalMap ordinalMap;
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestStringValueFacetCounts.java b/lucene/facet/src/test/org/apache/lucene/facet/TestStringValueFacetCounts.java
index 886c3da..3d27f28 100644
--- a/lucene/facet/src/test/org/apache/lucene/facet/TestStringValueFacetCounts.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/TestStringValueFacetCounts.java
@@ -236,51 +236,62 @@ public class TestStringValueFacetCounts extends FacetTestCase {
public void testRandom() throws Exception {
- Directory dir = newDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
-
- // Build up test data
- String[] tokens = getRandomTokens(50); // 50 random values to pick from
- int numDocs = atLeast(1000);
- int expectedTotalDocCount = 0;
- Map<String, Integer> expected = new HashMap<>();
- for (int i = 0; i < numDocs; i++) {
- Document doc = new Document();
- int valCount = random().nextInt(5); // each doc can have up to 5 values
- Set<String> docVals = new HashSet<>();
- for (int j = 0; j < valCount; j++) {
- int tokenIdx = random().nextInt(tokens.length);
- String val = tokens[tokenIdx];
- // values should only be counted once per document
- if (docVals.contains(val) == false) {
- expected.put(val, expected.getOrDefault(val, 0) + 1);
+ int fullIterations = LuceneTestCase.TEST_NIGHTLY ? 20 : 3;
+ for (int iter = 0; iter < fullIterations; iter++) {
+ Directory dir = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+
+ // Build up test data
+ String[] tokens = getRandomTokens(50); // 50 random values to pick from
+ int numDocs = atLeast(1000);
+ int expectedTotalDocCount = 0;
+ Map<String, Integer> expected = new HashMap<>();
+ for (int i = 0; i < numDocs; i++) {
+ Document doc = new Document();
+ // Sometimes we restrict docs to be single-valued, but most of the time they can have up to
+ // 5:
+ int maxValuesPerDoc;
+ if (random().nextInt(10) < 8) {
+ maxValuesPerDoc = 5;
+ } else {
+ maxValuesPerDoc = 1;
+ }
+ int valCount = random().nextInt(maxValuesPerDoc);
+ Set<String> docVals = new HashSet<>();
+ for (int j = 0; j < valCount; j++) {
+ int tokenIdx = random().nextInt(tokens.length);
+ String val = tokens[tokenIdx];
+ // values should only be counted once per document
+ if (docVals.contains(val) == false) {
+ expected.put(val, expected.getOrDefault(val, 0) + 1);
+ }
+ docVals.add(val);
+ doc.add(new SortedSetDocValuesField("field", new BytesRef(val)));
+ }
+ // only docs with at least one value in the field should be counted in the total
+ if (docVals.isEmpty() == false) {
+ expectedTotalDocCount++;
+ }
+ writer.addDocument(doc);
+ if (random().nextInt(10) == 0) {
+ writer.commit(); // sometimes commit
}
- docVals.add(val);
- doc.add(new SortedSetDocValuesField("field", new BytesRef(val)));
- }
- // only docs with at least one value in the field should be counted in the total
- if (docVals.isEmpty() == false) {
- expectedTotalDocCount++;
- }
- writer.addDocument(doc);
- if (random().nextInt(10) == 0) {
- writer.commit(); // sometimes commit
}
- }
- IndexSearcher searcher = newSearcher(writer.getReader());
- writer.close();
+ IndexSearcher searcher = newSearcher(writer.getReader());
+ writer.close();
- // run iterations with random values of topN
- int iterations = LuceneTestCase.TEST_NIGHTLY ? 10_000 : 50;
- int[] topNs = new int[iterations];
- for (int i = 0; i < iterations; i++) {
- topNs[i] = atLeast(1);
- }
+ // run iterations with random values of topN
+ int iterations = LuceneTestCase.TEST_NIGHTLY ? 10_000 : 50;
+ int[] topNs = new int[iterations];
+ for (int i = 0; i < iterations; i++) {
+ topNs[i] = atLeast(1);
+ }
- checkFacetResult(expected, expectedTotalDocCount, searcher, topNs);
+ checkFacetResult(expected, expectedTotalDocCount, searcher, topNs);
- IOUtils.close(searcher.getIndexReader(), dir);
+ IOUtils.close(searcher.getIndexReader(), dir);
+ }
}
private void checkFacetResult(
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java b/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java
index 83caaac..59fc41e 100644
--- a/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java
@@ -45,6 +45,7 @@ import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.NamedThreadFactory;
import org.apache.lucene.util.TestUtil;
@@ -105,6 +106,59 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
IOUtils.close(searcher.getIndexReader(), dir);
}
+ public void testBasicSingleValued() throws Exception {
+ Directory dir = newDirectory();
+
+ FacetsConfig config = new FacetsConfig();
+ config.setMultiValued("a", false);
+ RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+
+ Document doc = new Document();
+ doc.add(new SortedSetDocValuesFacetField("a", "foo"));
+ doc.add(new SortedSetDocValuesFacetField("b", "bar"));
+ writer.addDocument(config.build(doc));
+ doc = new Document();
+ doc.add(new SortedSetDocValuesFacetField("a", "foo"));
+ writer.addDocument(config.build(doc));
+ if (random().nextBoolean()) {
+ writer.commit();
+ }
+
+ doc = new Document();
+ doc.add(new SortedSetDocValuesFacetField("a", "baz"));
+ writer.addDocument(config.build(doc));
+
+ // NRT open
+ IndexSearcher searcher = newSearcher(writer.getReader());
+
+ // Per-top-reader state:
+ SortedSetDocValuesReaderState state =
+ new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader());
+
+ ExecutorService exec = randomExecutorServiceOrNull();
+ Facets facets = getAllFacets(searcher, state, exec);
+
+ assertEquals(
+ "dim=a path=[] value=3 childCount=2\n foo (2)\n baz (1)\n",
+ facets.getTopChildren(10, "a").toString());
+ assertEquals(
+ "dim=b path=[] value=1 childCount=1\n bar (1)\n",
+ facets.getTopChildren(10, "b").toString());
+
+ // DrillDown:
+ DrillDownQuery q = new DrillDownQuery(config);
+ q.add("a", "foo");
+ q.add("b", "bar");
+ TopDocs hits = searcher.search(q, 1);
+ assertEquals(1, hits.totalHits.value);
+
+ if (exec != null) {
+ exec.shutdownNow();
+ }
+ writer.close();
+ IOUtils.close(searcher.getIndexReader(), dir);
+ }
+
public void testDrillDownOptions() throws Exception {
Directory dir = newDirectory();
@@ -352,107 +406,117 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
}
public void testRandom() throws Exception {
- String[] tokens = getRandomTokens(10);
- Directory indexDir = newDirectory();
- Directory taxoDir = newDirectory();
-
- RandomIndexWriter w = new RandomIndexWriter(random(), indexDir);
- FacetsConfig config = new FacetsConfig();
- int numDocs = atLeast(1000);
- int numDims = TestUtil.nextInt(random(), 1, 7);
- List<TestDoc> testDocs = getRandomDocs(tokens, numDocs, numDims);
- for (TestDoc testDoc : testDocs) {
- Document doc = new Document();
- doc.add(newStringField("content", testDoc.content, Field.Store.NO));
- for (int j = 0; j < numDims; j++) {
- if (testDoc.dims[j] != null) {
- doc.add(new SortedSetDocValuesFacetField("dim" + j, testDoc.dims[j]));
+ int fullIterations = LuceneTestCase.TEST_NIGHTLY ? 20 : 3; // build-and-verify rounds: 20 when TEST_NIGHTLY is set, otherwise 3
+ for (int fullIter = 0; fullIter < fullIterations; fullIter++) { // every round starts from fresh directories and a fresh writer
+ String[] tokens = getRandomTokens(10);
+ Directory indexDir = newDirectory();
+ Directory taxoDir = newDirectory(); // NOTE(review): only created here and closed at the end of the round; no other use in this method
+
+ RandomIndexWriter w = new RandomIndexWriter(random(), indexDir);
+ FacetsConfig config = new FacetsConfig();
+ int numDocs = atLeast(1000);
+ // Most of the time allow up to 7 dims per doc, but occasionally limit all docs to a single
+ // dim:
+ int numDims;
+ if (random().nextInt(10) < 8) { // true for 8 of the 10 possible values (0..9)
+ numDims = TestUtil.nextInt(random(), 1, 7);
+ } else {
+ numDims = 1;
+ }
+ List<TestDoc> testDocs = getRandomDocs(tokens, numDocs, numDims);
+ for (TestDoc testDoc : testDocs) {
+ Document doc = new Document();
+ doc.add(newStringField("content", testDoc.content, Field.Store.NO));
+ for (int j = 0; j < numDims; j++) {
+ if (testDoc.dims[j] != null) { // a null entry means this doc gets no facet field for dim j
+ doc.add(new SortedSetDocValuesFacetField("dim" + j, testDoc.dims[j]));
+ }
}
+ w.addDocument(config.build(doc));
}
- w.addDocument(config.build(doc));
- }
- // NRT open
- IndexSearcher searcher = newSearcher(w.getReader());
+ // NRT open: the reader is taken from the writer (w.getReader()) before w is closed
+ IndexSearcher searcher = newSearcher(w.getReader());
- // Per-top-reader state:
- SortedSetDocValuesReaderState state =
- new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader());
- ExecutorService exec = randomExecutorServiceOrNull();
+ // Per-top-reader state:
+ SortedSetDocValuesReaderState state =
+ new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader());
+ ExecutorService exec = randomExecutorServiceOrNull();
- int iters = atLeast(100);
- for (int iter = 0; iter < iters; iter++) {
- String searchToken = tokens[random().nextInt(tokens.length)];
- if (VERBOSE) {
- System.out.println("\nTEST: iter content=" + searchToken);
- }
- FacetsCollector fc = new FacetsCollector();
- FacetsCollector.search(searcher, new TermQuery(new Term("content", searchToken)), 10, fc);
- Facets facets;
- if (exec != null) {
- facets = new ConcurrentSortedSetDocValuesFacetCounts(state, fc, exec);
- } else {
- facets = new SortedSetDocValuesFacetCounts(state, fc);
- }
+ int iters = atLeast(100); // number of random single-token searches verified against this round's index
+ for (int iter = 0; iter < iters; iter++) {
+ String searchToken = tokens[random().nextInt(tokens.length)];
+ if (VERBOSE) {
+ System.out.println("\nTEST: iter content=" + searchToken);
+ }
+ FacetsCollector fc = new FacetsCollector();
+ FacetsCollector.search(searcher, new TermQuery(new Term("content", searchToken)), 10, fc);
+ Facets facets;
+ if (exec != null) { // an executor is available: count with the concurrent implementation
+ facets = new ConcurrentSortedSetDocValuesFacetCounts(state, fc, exec);
+ } else {
+ facets = new SortedSetDocValuesFacetCounts(state, fc);
+ }
- // Slow, yet hopefully bug-free, faceting:
- @SuppressWarnings({"rawtypes", "unchecked"})
- Map<String, Integer>[] expectedCounts = new HashMap[numDims];
- for (int i = 0; i < numDims; i++) {
- expectedCounts[i] = new HashMap<>();
- }
+ // Slow, yet hopefully bug-free, faceting: recount per dim directly from testDocs.
+ @SuppressWarnings({"rawtypes", "unchecked"})
+ Map<String, Integer>[] expectedCounts = new HashMap[numDims];
+ for (int i = 0; i < numDims; i++) {
+ expectedCounts[i] = new HashMap<>();
+ }
- for (TestDoc doc : testDocs) {
- if (doc.content.equals(searchToken)) {
- for (int j = 0; j < numDims; j++) {
- if (doc.dims[j] != null) {
- Integer v = expectedCounts[j].get(doc.dims[j]);
- if (v == null) {
- expectedCounts[j].put(doc.dims[j], 1);
- } else {
- expectedCounts[j].put(doc.dims[j], v.intValue() + 1);
+ for (TestDoc doc : testDocs) {
+ if (doc.content.equals(searchToken)) { // mirrors the TermQuery on "content" used for the real search
+ for (int j = 0; j < numDims; j++) {
+ if (doc.dims[j] != null) {
+ Integer v = expectedCounts[j].get(doc.dims[j]);
+ if (v == null) {
+ expectedCounts[j].put(doc.dims[j], 1);
+ } else {
+ expectedCounts[j].put(doc.dims[j], v.intValue() + 1);
+ }
}
}
}
}
- }
- List<FacetResult> expected = new ArrayList<>();
- for (int i = 0; i < numDims; i++) {
- List<LabelAndValue> labelValues = new ArrayList<>();
- int totCount = 0;
- for (Map.Entry<String, Integer> ent : expectedCounts[i].entrySet()) {
- labelValues.add(new LabelAndValue(ent.getKey(), ent.getValue()));
- totCount += ent.getValue();
- }
- sortLabelValues(labelValues);
- if (totCount > 0) {
- expected.add(
- new FacetResult(
- "dim" + i,
- new String[0],
- totCount,
- labelValues.toArray(new LabelAndValue[labelValues.size()]),
- labelValues.size()));
+ List<FacetResult> expected = new ArrayList<>();
+ for (int i = 0; i < numDims; i++) {
+ List<LabelAndValue> labelValues = new ArrayList<>();
+ int totCount = 0;
+ for (Map.Entry<String, Integer> ent : expectedCounts[i].entrySet()) {
+ labelValues.add(new LabelAndValue(ent.getKey(), ent.getValue()));
+ totCount += ent.getValue();
+ }
+ sortLabelValues(labelValues);
+ if (totCount > 0) { // a dim with nothing counted contributes no expected FacetResult
+ expected.add(
+ new FacetResult(
+ "dim" + i,
+ new String[0],
+ totCount,
+ labelValues.toArray(new LabelAndValue[labelValues.size()]),
+ labelValues.size()));
+ }
}
- }
- // Sort by highest value, tie break by value:
- sortFacetResults(expected);
+ // Sort by highest value, tie break by value:
+ sortFacetResults(expected);
- List<FacetResult> actual = facets.getAllDims(10);
+ List<FacetResult> actual = facets.getAllDims(10);
- // Messy: fixup ties
- // sortTies(actual);
+ // Messy: fixup ties
+ // sortTies(actual);
- assertEquals(expected, actual);
- }
+ assertEquals(expected, actual);
+ }
- if (exec != null) {
- exec.shutdownNow();
+ if (exec != null) {
+ exec.shutdownNow();
+ }
+ w.close();
+ IOUtils.close(searcher.getIndexReader(), indexDir, taxoDir);
}
- w.close();
- IOUtils.close(searcher.getIndexReader(), indexDir, taxoDir);
}
public void testNonExistentDimension() throws Exception {