You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2022/04/21 16:48:37 UTC
[lucene] 01/02: LUCENE-10517: Improve performance of SortedSetDV faceting by iterating on class types (#812)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git

commit 26d015301df60096f9e61260311da65df51d3a3c
Author: Chris Hegarty <62...@users.noreply.github.com>
AuthorDate: Thu Apr 21 17:39:53 2022 +0100

    LUCENE-10517: Improve performance of SortedSetDV faceting by iterating on class types (#812)
---
 .../lucene/facet/StringValueFacetCounts.java       |  97 +++++++++++++++++-
 .../ConcurrentSortedSetDocValuesFacetCounts.java   | 110 +++++++++++++++++----
 .../sortedset/SortedSetDocValuesFacetCounts.java   |  89 ++++++++++++++++-
 .../facet/taxonomy/FastTaxonomyFacetCounts.java    |  44 ++++++---
 4 files changed, 300 insertions(+), 40 deletions(-)

diff --git a/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java
index 6876d36c9a7..3486dbf66e1 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java
@@ -306,7 +306,12 @@ public class StringValueFacetCounts extends Facets {
       assert ordinalMap == null;
 
       LeafReaderContext context = leaves.get(0);
-      countOneSegment(docValues, context.ord, null, context.reader().getLiveDocs());
+      Bits liveDocs = context.reader().getLiveDocs();
+      if (liveDocs == null) {
+        countOneSegmentNHLD(docValues, context.ord);
+      } else {
+        countOneSegment(docValues, context.ord, null, liveDocs);
+      }
     } else {
       // Since we have more than one segment, we should have a non-null ordinalMap and docValues
       // should be a MultiSortedSetDocValues instance:
@@ -318,7 +323,12 @@ public class StringValueFacetCounts extends Facets {
 
       for (int i = 0; i < numLeaves; i++) {
         LeafReaderContext context = leaves.get(i);
-        countOneSegment(multiValues.values[i], context.ord, null, context.reader().getLiveDocs());
+        Bits liveDocs = context.reader().getLiveDocs();
+        if (liveDocs == null) {
+          countOneSegmentNHLD(multiValues.values[i], context.ord);
+        } else {
+          countOneSegment(multiValues.values[i], context.ord, null, liveDocs);
+        }
       }
     }
   }
@@ -339,7 +349,8 @@ public class StringValueFacetCounts extends Facets {
     // all doc values for this segment:
     DocIdSetIterator it;
     if (hits == null) {
-      it = (liveDocs != null) ? FacetUtils.liveDocsDISI(valuesIt, liveDocs) : valuesIt;
+      assert liveDocs != null;
+      it = FacetUtils.liveDocsDISI(valuesIt, liveDocs);
     } else {
       it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt));
     }
@@ -440,6 +451,86 @@ public class StringValueFacetCounts extends Facets {
     }
   }
 
+  // Variant of countOneSegment, that has No Hits or Live Docs
+  private void countOneSegmentNHLD(SortedSetDocValues multiValues, int segmentOrd)
+      throws IOException {
+
+    // It's slightly more efficient to work against SortedDocValues if the field is actually
+    // single-valued (see: LUCENE-5309)
+    SortedDocValues singleValues = DocValues.unwrapSingleton(multiValues);
+
+    if (ordinalMap == null) {
+      // If there's no ordinal map we don't need to map segment ordinals to globals, so counting
+      // is very straight-forward:
+      if (singleValues != null) {
+        for (int doc = singleValues.nextDoc();
+            doc != DocIdSetIterator.NO_MORE_DOCS;
+            doc = singleValues.nextDoc()) {
+          increment(singleValues.ordValue());
+          totalDocCount++;
+        }
+      } else {
+        for (int doc = multiValues.nextDoc();
+            doc != DocIdSetIterator.NO_MORE_DOCS;
+            doc = multiValues.nextDoc()) {
+          boolean countedDocInTotal = false;
+          for (int term = (int) multiValues.nextOrd();
+              term != SortedSetDocValues.NO_MORE_ORDS;
+              term = (int) multiValues.nextOrd()) {
+            increment(term);
+            if (countedDocInTotal == false) {
+              totalDocCount++;
+              countedDocInTotal = true;
+            }
+          }
+        }
+      }
+    } else {
+      // We need to map segment ordinals to globals. We have two different approaches to this
+      // depending on how many hits we have to count relative to how many unique doc val ordinals
+      // there are in this segment:
+      final LongValues ordMap = ordinalMap.getGlobalOrds(segmentOrd);
+      int segmentCardinality = (int) multiValues.getValueCount();
+
+      // First count in seg-ord space.
+      // At this point, we're either counting all ordinals or our heuristic suggests that
+      // we expect to visit a large percentage of the unique ordinals (lots of hits relative
+      // to the segment cardinality), so we count the segment densely:
+      final int[] segCounts = new int[segmentCardinality];
+      if (singleValues != null) {
+        for (int doc = singleValues.nextDoc();
+            doc != DocIdSetIterator.NO_MORE_DOCS;
+            doc = singleValues.nextDoc()) {
+          segCounts[singleValues.ordValue()]++;
+          totalDocCount++;
+        }
+      } else {
+        for (int doc = multiValues.nextDoc();
+            doc != DocIdSetIterator.NO_MORE_DOCS;
+            doc = multiValues.nextDoc()) {
+          boolean countedDocInTotal = false;
+          for (int term = (int) multiValues.nextOrd();
+              term != SortedSetDocValues.NO_MORE_ORDS;
+              term = (int) multiValues.nextOrd()) {
+            segCounts[term]++;
+            if (countedDocInTotal == false) {
+              totalDocCount++;
+              countedDocInTotal = true;
+            }
+          }
+        }
+      }
+
+      // Then, migrate to global ords:
+      for (int ord = 0; ord < segmentCardinality; ord++) {
+        int count = segCounts[ord];
+        if (count != 0) {
+          increment((int) ordMap.get(ord), count);
+        }
+      }
+    }
+  }
+
   private void increment(int ordinal) {
     increment(ordinal, 1);
   }
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java
index c1ce41533ea..89c49cf1b13 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java
@@ -259,15 +259,39 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
         if (hits != null && hits.totalHits < numSegOrds / 10) {
           // Remap every ord to global ord as we iterate:
           if (singleValues != null) {
-            for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
-              counts.incrementAndGet((int) ordMap.get(singleValues.ordValue()));
+            if (singleValues == it) {
+              for (int doc = singleValues.nextDoc();
+                  doc != DocIdSetIterator.NO_MORE_DOCS;
+                  doc = singleValues.nextDoc()) {
+                counts.incrementAndGet((int) ordMap.get(singleValues.ordValue()));
+              }
+            } else {
+              for (int doc = it.nextDoc();
+                  doc != DocIdSetIterator.NO_MORE_DOCS;
+                  doc = it.nextDoc()) {
+                counts.incrementAndGet((int) ordMap.get(singleValues.ordValue()));
+              }
             }
           } else {
-            for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
-              int term = (int) multiValues.nextOrd();
-              while (term != SortedSetDocValues.NO_MORE_ORDS) {
-                counts.incrementAndGet((int) ordMap.get(term));
-                term = (int) multiValues.nextOrd();
+            if (multiValues == it) {
+              for (int doc = multiValues.nextDoc();
+                  doc != DocIdSetIterator.NO_MORE_DOCS;
+                  doc = multiValues.nextDoc()) {
+                for (int term = (int) multiValues.nextOrd();
+                    term != SortedSetDocValues.NO_MORE_ORDS;
+                    term = (int) multiValues.nextOrd()) {
+                  counts.incrementAndGet((int) ordMap.get(term));
+                }
+              }
+            } else {
+              for (int doc = it.nextDoc();
+                  doc != DocIdSetIterator.NO_MORE_DOCS;
+                  doc = it.nextDoc()) {
+                for (int term = (int) multiValues.nextOrd();
+                    term != SortedSetDocValues.NO_MORE_ORDS;
+                    term = (int) multiValues.nextOrd()) {
+                  counts.incrementAndGet((int) ordMap.get(term));
+                }
               }
             }
           }
@@ -276,15 +300,39 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
           // First count in seg-ord space:
           final int[] segCounts = new int[numSegOrds];
           if (singleValues != null) {
-            for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
-              segCounts[singleValues.ordValue()]++;
+            if (singleValues == it) {
+              for (int doc = singleValues.nextDoc();
+                  doc != DocIdSetIterator.NO_MORE_DOCS;
+                  doc = singleValues.nextDoc()) {
+                segCounts[singleValues.ordValue()]++;
+              }
+            } else {
+              for (int doc = it.nextDoc();
+                  doc != DocIdSetIterator.NO_MORE_DOCS;
+                  doc = it.nextDoc()) {
+                segCounts[singleValues.ordValue()]++;
+              }
             }
           } else {
-            for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
-              int term = (int) multiValues.nextOrd();
-              while (term != SortedSetDocValues.NO_MORE_ORDS) {
-                segCounts[term]++;
-                term = (int) multiValues.nextOrd();
+            if (multiValues == it) {
+              for (int doc = multiValues.nextDoc();
+                  doc != DocIdSetIterator.NO_MORE_DOCS;
+                  doc = multiValues.nextDoc()) {
+                for (int term = (int) multiValues.nextOrd();
+                    term != SortedSetDocValues.NO_MORE_ORDS;
+                    term = (int) multiValues.nextOrd()) {
+                  segCounts[term]++;
+                }
+              }
+            } else {
+              for (int doc = it.nextDoc();
+                  doc != DocIdSetIterator.NO_MORE_DOCS;
+                  doc = it.nextDoc()) {
+                for (int term = (int) multiValues.nextOrd();
+                    term != SortedSetDocValues.NO_MORE_ORDS;
+                    term = (int) multiValues.nextOrd()) {
+                  segCounts[term]++;
+                }
               }
             }
           }
@@ -301,15 +349,35 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
         // No ord mapping (e.g., single segment index):
         // just aggregate directly into counts:
         if (singleValues != null) {
-          for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
-            counts.incrementAndGet(singleValues.ordValue());
+          if (singleValues == it) {
+            for (int doc = singleValues.nextDoc();
+                doc != DocIdSetIterator.NO_MORE_DOCS;
+                doc = singleValues.nextDoc()) {
+              counts.incrementAndGet(singleValues.ordValue());
+            }
+          } else {
+            for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+              counts.incrementAndGet(singleValues.ordValue());
+            }
           }
         } else {
-          for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
-            int term = (int) multiValues.nextOrd();
-            while (term != SortedSetDocValues.NO_MORE_ORDS) {
-              counts.incrementAndGet(term);
-              term = (int) multiValues.nextOrd();
+          if (multiValues == it) {
+            for (int doc = multiValues.nextDoc();
+                doc != DocIdSetIterator.NO_MORE_DOCS;
+                doc = multiValues.nextDoc()) {
+              for (int term = (int) multiValues.nextOrd();
+                  term != SortedSetDocValues.NO_MORE_ORDS;
+                  term = (int) multiValues.nextOrd()) {
+                counts.incrementAndGet(term);
+              }
+            }
+          } else {
+            for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+              for (int term = (int) multiValues.nextOrd();
+                  term != SortedSetDocValues.NO_MORE_ORDS;
+                  term = (int) multiValues.nextOrd()) {
+                counts.incrementAndGet(term);
+              }
             }
           }
         }
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
index c0f3daecc1e..356d752ffdd 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
@@ -279,6 +279,83 @@ public class SortedSetDocValuesFacetCounts extends Facets {
     return childOrdsResult.dimCount;
   }
 
+  // Variant of countOneSegment, that has No Hits or Live Docs
+  private void countOneSegmentNHLD(OrdinalMap ordinalMap, LeafReader reader, int segOrd)
+      throws IOException {
+    SortedSetDocValues multiValues = DocValues.getSortedSet(reader, field);
+    if (multiValues == null) {
+      // nothing to count
+      return;
+    }
+
+    // It's slightly more efficient to work against SortedDocValues if the field is actually
+    // single-valued (see: LUCENE-5309)
+    SortedDocValues singleValues = DocValues.unwrapSingleton(multiValues);
+
+    // TODO: yet another option is to count all segs
+    // first, only in seg-ord space, and then do a
+    // merge-sort-PQ in the end to only "resolve to
+    // global" those seg ords that can compete, if we know
+    // we just want top K?  ie, this is the same algo
+    // that'd be used for merging facets across shards
+    // (distributed faceting).  but this has much higher
+    // temp ram req'ts (sum of number of ords across all
+    // segs)
+    if (ordinalMap != null) {
+      final LongValues ordMap = ordinalMap.getGlobalOrds(segOrd);
+      int numSegOrds = (int) multiValues.getValueCount();
+
+      // First count in seg-ord space:
+      final int[] segCounts = new int[numSegOrds];
+      if (singleValues != null) {
+        for (int doc = singleValues.nextDoc();
+            doc != DocIdSetIterator.NO_MORE_DOCS;
+            doc = singleValues.nextDoc()) {
+          segCounts[singleValues.ordValue()]++;
+        }
+      } else {
+        for (int doc = multiValues.nextDoc();
+            doc != DocIdSetIterator.NO_MORE_DOCS;
+            doc = multiValues.nextDoc()) {
+          int term = (int) multiValues.nextOrd();
+          while (term != SortedSetDocValues.NO_MORE_ORDS) {
+            segCounts[term]++;
+            term = (int) multiValues.nextOrd();
+          }
+        }
+      }
+
+      // Then, migrate to global ords:
+      for (int ord = 0; ord < numSegOrds; ord++) {
+        int count = segCounts[ord];
+        if (count != 0) {
+          // ordinalMap.getGlobalOrd(segOrd, ord));
+          counts[(int) ordMap.get(ord)] += count;
+        }
+      }
+    } else {
+      // No ord mapping (e.g., single segment index):
+      // just aggregate directly into counts:
+      if (singleValues != null) {
+        for (int doc = singleValues.nextDoc();
+            doc != DocIdSetIterator.NO_MORE_DOCS;
+            doc = singleValues.nextDoc()) {
+          counts[singleValues.ordValue()]++;
+        }
+      } else {
+        for (int doc = multiValues.nextDoc();
+            doc != DocIdSetIterator.NO_MORE_DOCS;
+            doc = multiValues.nextDoc()) {
+          int term = (int) multiValues.nextOrd();
+          while (term != SortedSetDocValues.NO_MORE_ORDS) {
+            counts[term]++;
+            term = (int) multiValues.nextOrd();
+          }
+        }
+      }
+    }
+  }
+
   private void countOneSegment(
       OrdinalMap ordinalMap, LeafReader reader, int segOrd, MatchingDocs hits, Bits liveDocs)
       throws IOException {
@@ -295,7 +372,8 @@ public class SortedSetDocValuesFacetCounts extends Facets {
 
     DocIdSetIterator it;
     if (hits == null) {
-      it = (liveDocs != null) ? FacetUtils.liveDocsDISI(valuesIt, liveDocs) : valuesIt;
+      assert liveDocs != null;
+      it = FacetUtils.liveDocsDISI(valuesIt, liveDocs);
     } else {
       it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt));
     }
@@ -421,8 +499,13 @@ public class SortedSetDocValuesFacetCounts extends Facets {
 
     for (LeafReaderContext context : state.getReader().leaves()) {
 
-      countOneSegment(
-          ordinalMap, context.reader(), context.ord, null, context.reader().getLiveDocs());
+      Bits liveDocs = context.reader().getLiveDocs();
+      if (liveDocs == null) {
+        countOneSegmentNHLD(ordinalMap, context.reader(), context.ord);
+      } else {
+        countOneSegment(
+            ordinalMap, context.reader(), context.ord, null, context.reader().getLiveDocs());
+      }
     }
   }
 
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java
index 1adb3993333..b1daa753917 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java
@@ -127,23 +127,41 @@ public class FastTaxonomyFacetCounts extends IntTaxonomyFacets {
       NumericDocValues singleValued = DocValues.unwrapSingleton(multiValued);
 
       if (singleValued != null) {
-        for (int doc = singleValued.nextDoc();
-            doc != DocIdSetIterator.NO_MORE_DOCS;
-            doc = singleValued.nextDoc()) {
-          if (liveDocs != null && liveDocs.get(doc) == false) {
-            continue;
+        if (liveDocs == null) {
+          for (int doc = singleValued.nextDoc();
+              doc != DocIdSetIterator.NO_MORE_DOCS;
+              doc = singleValued.nextDoc()) {
+            values[(int) singleValued.longValue()]++;
+          }
+        } else {
+          for (int doc = singleValued.nextDoc();
+              doc != DocIdSetIterator.NO_MORE_DOCS;
+              doc = singleValued.nextDoc()) {
+            if (liveDocs.get(doc) == false) {
+              continue;
+            }
+            values[(int) singleValued.longValue()]++;
           }
-          values[(int) singleValued.longValue()]++;
         }
       } else {
-        for (int doc = multiValued.nextDoc();
-            doc != DocIdSetIterator.NO_MORE_DOCS;
-            doc = multiValued.nextDoc()) {
-          if (liveDocs != null && liveDocs.get(doc) == false) {
-            continue;
+        if (liveDocs == null) {
+          for (int doc = multiValued.nextDoc();
+              doc != DocIdSetIterator.NO_MORE_DOCS;
+              doc = multiValued.nextDoc()) {
+            for (int i = 0; i < multiValued.docValueCount(); i++) {
+              values[(int) multiValued.nextValue()]++;
+            }
           }
-          for (int i = 0; i < multiValued.docValueCount(); i++) {
-            values[(int) multiValued.nextValue()]++;
+        } else {
+          for (int doc = multiValued.nextDoc();
+              doc != DocIdSetIterator.NO_MORE_DOCS;
+              doc = multiValued.nextDoc()) {
+            if (liveDocs.get(doc) == false) {
+              continue;
+            }
+            for (int i = 0; i < multiValued.docValueCount(); i++) {
+              values[(int) multiValued.nextValue()]++;
+            }
           }
         }
       }