You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2022/04/21 16:48:37 UTC
[lucene] 01/02: LUCENE-10517: Improve performance of SortedSetDV faceting by iterating on class types (#812)
This is an automated email from the ASF dual-hosted git repository.
jpountz pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
commit 26d015301df60096f9e61260311da65df51d3a3c
Author: Chris Hegarty <62...@users.noreply.github.com>
AuthorDate: Thu Apr 21 17:39:53 2022 +0100
LUCENE-10517: Improve performance of SortedSetDV faceting by iterating on class types (#812)
---
.../lucene/facet/StringValueFacetCounts.java | 97 +++++++++++++++++-
.../ConcurrentSortedSetDocValuesFacetCounts.java | 110 +++++++++++++++++----
.../sortedset/SortedSetDocValuesFacetCounts.java | 89 ++++++++++++++++-
.../facet/taxonomy/FastTaxonomyFacetCounts.java | 44 ++++++---
4 files changed, 300 insertions(+), 40 deletions(-)
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java
index 6876d36c9a7..3486dbf66e1 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java
@@ -306,7 +306,12 @@ public class StringValueFacetCounts extends Facets {
assert ordinalMap == null;
LeafReaderContext context = leaves.get(0);
- countOneSegment(docValues, context.ord, null, context.reader().getLiveDocs());
+ Bits liveDocs = context.reader().getLiveDocs();
+ if (liveDocs == null) {
+ countOneSegmentNHLD(docValues, context.ord);
+ } else {
+ countOneSegment(docValues, context.ord, null, liveDocs);
+ }
} else {
// Since we have more than one segment, we should have a non-null ordinalMap and docValues
// should be a MultiSortedSetDocValues instance:
@@ -318,7 +323,12 @@ public class StringValueFacetCounts extends Facets {
for (int i = 0; i < numLeaves; i++) {
LeafReaderContext context = leaves.get(i);
- countOneSegment(multiValues.values[i], context.ord, null, context.reader().getLiveDocs());
+ Bits liveDocs = context.reader().getLiveDocs();
+ if (liveDocs == null) {
+ countOneSegmentNHLD(multiValues.values[i], context.ord);
+ } else {
+ countOneSegment(multiValues.values[i], context.ord, null, liveDocs);
+ }
}
}
}
@@ -339,7 +349,8 @@ public class StringValueFacetCounts extends Facets {
// all doc values for this segment:
DocIdSetIterator it;
if (hits == null) {
- it = (liveDocs != null) ? FacetUtils.liveDocsDISI(valuesIt, liveDocs) : valuesIt;
+ assert liveDocs != null;
+ it = FacetUtils.liveDocsDISI(valuesIt, liveDocs);
} else {
it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt));
}
@@ -440,6 +451,86 @@ public class StringValueFacetCounts extends Facets {
}
}
+ // Variant of countOneSegment, that has No Hits or Live Docs
+ private void countOneSegmentNHLD(SortedSetDocValues multiValues, int segmentOrd)
+ throws IOException {
+
+ // It's slightly more efficient to work against SortedDocValues if the field is actually
+ // single-valued (see: LUCENE-5309)
+ SortedDocValues singleValues = DocValues.unwrapSingleton(multiValues);
+
+ if (ordinalMap == null) {
+ // If there's no ordinal map we don't need to map segment ordinals to globals, so counting
+ // is very straight-forward:
+ if (singleValues != null) {
+ for (int doc = singleValues.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = singleValues.nextDoc()) {
+ increment(singleValues.ordValue());
+ totalDocCount++;
+ }
+ } else {
+ for (int doc = multiValues.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = multiValues.nextDoc()) {
+ boolean countedDocInTotal = false;
+ for (int term = (int) multiValues.nextOrd();
+ term != SortedSetDocValues.NO_MORE_ORDS;
+ term = (int) multiValues.nextOrd()) {
+ increment(term);
+ if (countedDocInTotal == false) {
+ totalDocCount++;
+ countedDocInTotal = true;
+ }
+ }
+ }
+ }
+ } else {
+ // We need to map segment ordinals to globals. We have two different approaches to this
+ // depending on how many hits we have to count relative to how many unique doc val ordinals
+ // there are in this segment:
+ final LongValues ordMap = ordinalMap.getGlobalOrds(segmentOrd);
+ int segmentCardinality = (int) multiValues.getValueCount();
+
+ // First count in seg-ord space.
+ // At this point, we're either counting all ordinals or our heuristic suggests that
+ // we expect to visit a large percentage of the unique ordinals (lots of hits relative
+ // to the segment cardinality), so we count the segment densely:
+ final int[] segCounts = new int[segmentCardinality];
+ if (singleValues != null) {
+ for (int doc = singleValues.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = singleValues.nextDoc()) {
+ segCounts[singleValues.ordValue()]++;
+ totalDocCount++;
+ }
+ } else {
+ for (int doc = multiValues.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = multiValues.nextDoc()) {
+ boolean countedDocInTotal = false;
+ for (int term = (int) multiValues.nextOrd();
+ term != SortedSetDocValues.NO_MORE_ORDS;
+ term = (int) multiValues.nextOrd()) {
+ segCounts[term]++;
+ if (countedDocInTotal == false) {
+ totalDocCount++;
+ countedDocInTotal = true;
+ }
+ }
+ }
+ }
+
+ // Then, migrate to global ords:
+ for (int ord = 0; ord < segmentCardinality; ord++) {
+ int count = segCounts[ord];
+ if (count != 0) {
+ increment((int) ordMap.get(ord), count);
+ }
+ }
+ }
+ }
+
private void increment(int ordinal) {
increment(ordinal, 1);
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java
index c1ce41533ea..89c49cf1b13 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java
@@ -259,15 +259,39 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
if (hits != null && hits.totalHits < numSegOrds / 10) {
// Remap every ord to global ord as we iterate:
if (singleValues != null) {
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- counts.incrementAndGet((int) ordMap.get(singleValues.ordValue()));
+ if (singleValues == it) {
+ for (int doc = singleValues.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = singleValues.nextDoc()) {
+ counts.incrementAndGet((int) ordMap.get(singleValues.ordValue()));
+ }
+ } else {
+ for (int doc = it.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = it.nextDoc()) {
+ counts.incrementAndGet((int) ordMap.get(singleValues.ordValue()));
+ }
}
} else {
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- int term = (int) multiValues.nextOrd();
- while (term != SortedSetDocValues.NO_MORE_ORDS) {
- counts.incrementAndGet((int) ordMap.get(term));
- term = (int) multiValues.nextOrd();
+ if (multiValues == it) {
+ for (int doc = multiValues.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = multiValues.nextDoc()) {
+ for (int term = (int) multiValues.nextOrd();
+ term != SortedSetDocValues.NO_MORE_ORDS;
+ term = (int) multiValues.nextOrd()) {
+ counts.incrementAndGet((int) ordMap.get(term));
+ }
+ }
+ } else {
+ for (int doc = it.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = it.nextDoc()) {
+ for (int term = (int) multiValues.nextOrd();
+ term != SortedSetDocValues.NO_MORE_ORDS;
+ term = (int) multiValues.nextOrd()) {
+ counts.incrementAndGet((int) ordMap.get(term));
+ }
}
}
}
@@ -276,15 +300,39 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
// First count in seg-ord space:
final int[] segCounts = new int[numSegOrds];
if (singleValues != null) {
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- segCounts[singleValues.ordValue()]++;
+ if (singleValues == it) {
+ for (int doc = singleValues.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = singleValues.nextDoc()) {
+ segCounts[singleValues.ordValue()]++;
+ }
+ } else {
+ for (int doc = it.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = it.nextDoc()) {
+ segCounts[singleValues.ordValue()]++;
+ }
}
} else {
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- int term = (int) multiValues.nextOrd();
- while (term != SortedSetDocValues.NO_MORE_ORDS) {
- segCounts[term]++;
- term = (int) multiValues.nextOrd();
+ if (multiValues == it) {
+ for (int doc = multiValues.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = multiValues.nextDoc()) {
+ for (int term = (int) multiValues.nextOrd();
+ term != SortedSetDocValues.NO_MORE_ORDS;
+ term = (int) multiValues.nextOrd()) {
+ segCounts[term]++;
+ }
+ }
+ } else {
+ for (int doc = it.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = it.nextDoc()) {
+ for (int term = (int) multiValues.nextOrd();
+ term != SortedSetDocValues.NO_MORE_ORDS;
+ term = (int) multiValues.nextOrd()) {
+ segCounts[term]++;
+ }
}
}
}
@@ -301,15 +349,35 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
// No ord mapping (e.g., single segment index):
// just aggregate directly into counts:
if (singleValues != null) {
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- counts.incrementAndGet(singleValues.ordValue());
+ if (singleValues == it) {
+ for (int doc = singleValues.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = singleValues.nextDoc()) {
+ counts.incrementAndGet(singleValues.ordValue());
+ }
+ } else {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ counts.incrementAndGet(singleValues.ordValue());
+ }
}
} else {
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- int term = (int) multiValues.nextOrd();
- while (term != SortedSetDocValues.NO_MORE_ORDS) {
- counts.incrementAndGet(term);
- term = (int) multiValues.nextOrd();
+ if (multiValues == it) {
+ for (int doc = multiValues.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = multiValues.nextDoc()) {
+ for (int term = (int) multiValues.nextOrd();
+ term != SortedSetDocValues.NO_MORE_ORDS;
+ term = (int) multiValues.nextOrd()) {
+ counts.incrementAndGet(term);
+ }
+ }
+ } else {
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ for (int term = (int) multiValues.nextOrd();
+ term != SortedSetDocValues.NO_MORE_ORDS;
+ term = (int) multiValues.nextOrd()) {
+ counts.incrementAndGet(term);
+ }
}
}
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
index c0f3daecc1e..356d752ffdd 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
@@ -279,6 +279,83 @@ public class SortedSetDocValuesFacetCounts extends Facets {
return childOrdsResult.dimCount;
}
+ // Variant of countOneSegment, that has No Hits or Live Docs
+ private void countOneSegmentNHLD(OrdinalMap ordinalMap, LeafReader reader, int segOrd)
+ throws IOException {
+ SortedSetDocValues multiValues = DocValues.getSortedSet(reader, field);
+ if (multiValues == null) {
+ // nothing to count
+ return;
+ }
+
+ // It's slightly more efficient to work against SortedDocValues if the field is actually
+ // single-valued (see: LUCENE-5309)
+ SortedDocValues singleValues = DocValues.unwrapSingleton(multiValues);
+
+ // TODO: yet another option is to count all segs
+ // first, only in seg-ord space, and then do a
+ // merge-sort-PQ in the end to only "resolve to
+ // global" those seg ords that can compete, if we know
+ // we just want top K? ie, this is the same algo
+ // that'd be used for merging facets across shards
+ // (distributed faceting). but this has much higher
+ // temp ram req'ts (sum of number of ords across all
+ // segs)
+ if (ordinalMap != null) {
+ final LongValues ordMap = ordinalMap.getGlobalOrds(segOrd);
+ int numSegOrds = (int) multiValues.getValueCount();
+
+ // First count in seg-ord space:
+ final int[] segCounts = new int[numSegOrds];
+ if (singleValues != null) {
+ for (int doc = singleValues.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = singleValues.nextDoc()) {
+ segCounts[singleValues.ordValue()]++;
+ }
+ } else {
+ for (int doc = multiValues.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = multiValues.nextDoc()) {
+ int term = (int) multiValues.nextOrd();
+ while (term != SortedSetDocValues.NO_MORE_ORDS) {
+ segCounts[term]++;
+ term = (int) multiValues.nextOrd();
+ }
+ }
+ }
+
+ // Then, migrate to global ords:
+ for (int ord = 0; ord < numSegOrds; ord++) {
+ int count = segCounts[ord];
+ if (count != 0) {
+ // ordinalMap.getGlobalOrd(segOrd, ord));
+ counts[(int) ordMap.get(ord)] += count;
+ }
+ }
+ } else {
+ // No ord mapping (e.g., single segment index):
+ // just aggregate directly into counts:
+ if (singleValues != null) {
+ for (int doc = singleValues.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = singleValues.nextDoc()) {
+ counts[singleValues.ordValue()]++;
+ }
+ } else {
+ for (int doc = multiValues.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = multiValues.nextDoc()) {
+ int term = (int) multiValues.nextOrd();
+ while (term != SortedSetDocValues.NO_MORE_ORDS) {
+ counts[term]++;
+ term = (int) multiValues.nextOrd();
+ }
+ }
+ }
+ }
+ }
+
private void countOneSegment(
OrdinalMap ordinalMap, LeafReader reader, int segOrd, MatchingDocs hits, Bits liveDocs)
throws IOException {
@@ -295,7 +372,8 @@ public class SortedSetDocValuesFacetCounts extends Facets {
DocIdSetIterator it;
if (hits == null) {
- it = (liveDocs != null) ? FacetUtils.liveDocsDISI(valuesIt, liveDocs) : valuesIt;
+ assert liveDocs != null;
+ it = FacetUtils.liveDocsDISI(valuesIt, liveDocs);
} else {
it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt));
}
@@ -421,8 +499,13 @@ public class SortedSetDocValuesFacetCounts extends Facets {
for (LeafReaderContext context : state.getReader().leaves()) {
- countOneSegment(
- ordinalMap, context.reader(), context.ord, null, context.reader().getLiveDocs());
+ Bits liveDocs = context.reader().getLiveDocs();
+ if (liveDocs == null) {
+ countOneSegmentNHLD(ordinalMap, context.reader(), context.ord);
+ } else {
+ countOneSegment(
+ ordinalMap, context.reader(), context.ord, null, context.reader().getLiveDocs());
+ }
}
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java
index 1adb3993333..b1daa753917 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java
@@ -127,23 +127,41 @@ public class FastTaxonomyFacetCounts extends IntTaxonomyFacets {
NumericDocValues singleValued = DocValues.unwrapSingleton(multiValued);
if (singleValued != null) {
- for (int doc = singleValued.nextDoc();
- doc != DocIdSetIterator.NO_MORE_DOCS;
- doc = singleValued.nextDoc()) {
- if (liveDocs != null && liveDocs.get(doc) == false) {
- continue;
+ if (liveDocs == null) {
+ for (int doc = singleValued.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = singleValued.nextDoc()) {
+ values[(int) singleValued.longValue()]++;
+ }
+ } else {
+ for (int doc = singleValued.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = singleValued.nextDoc()) {
+ if (liveDocs.get(doc) == false) {
+ continue;
+ }
+ values[(int) singleValued.longValue()]++;
}
- values[(int) singleValued.longValue()]++;
}
} else {
- for (int doc = multiValued.nextDoc();
- doc != DocIdSetIterator.NO_MORE_DOCS;
- doc = multiValued.nextDoc()) {
- if (liveDocs != null && liveDocs.get(doc) == false) {
- continue;
+ if (liveDocs == null) {
+ for (int doc = multiValued.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = multiValued.nextDoc()) {
+ for (int i = 0; i < multiValued.docValueCount(); i++) {
+ values[(int) multiValued.nextValue()]++;
+ }
}
- for (int i = 0; i < multiValued.docValueCount(); i++) {
- values[(int) multiValued.nextValue()]++;
+ } else {
+ for (int doc = multiValued.nextDoc();
+ doc != DocIdSetIterator.NO_MORE_DOCS;
+ doc = multiValued.nextDoc()) {
+ if (liveDocs.get(doc) == false) {
+ continue;
+ }
+ for (int i = 0; i < multiValued.docValueCount(); i++) {
+ values[(int) multiValued.nextValue()]++;
+ }
}
}
}