You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by GitBox <gi...@apache.org> on 2022/03/29 18:11:33 UTC
[GitHub] [pinot] Jackie-Jiang commented on a change in pull request #8411: accelerate counts over filters

Jackie-Jiang commented on a change in pull request #8411:
URL: https://github.com/apache/pinot/pull/8411#discussion_r837719142



##########
File path: pinot-core/src/main/java/org/apache/pinot/core/operator/filter/AndFilterOperator.java
##########
@@ -45,6 +47,39 @@ protected FilterBlock getNextBlock() {
     return new FilterBlock(new AndDocIdSet(filterBlockDocIdSets));
   }
 
+  @Override
+  public boolean canOptimizeCount() {
+    boolean allChildrenCanProduceBitmaps = true;
+    for (BaseFilterOperator child : _filterOperators) {
+      allChildrenCanProduceBitmaps &= (child.isResultMatchingAll() | child.canProduceBitmaps());
+    }
+    return allChildrenCanProduceBitmaps;
+  }
+
+  @Override
+  public int getNumMatchingDocs() {
+    List<BitmapCollection> children = new ArrayList<>(_filterOperators.size());
+    for (BaseFilterOperator child : _filterOperators) {
+      if (!child.isResultMatchingAll()) {

Review comment:
       Same here, we may remove this check

##########
File path: pinot-core/src/main/java/org/apache/pinot/core/operator/filter/AndFilterOperator.java
##########
@@ -45,6 +47,39 @@ protected FilterBlock getNextBlock() {
     return new FilterBlock(new AndDocIdSet(filterBlockDocIdSets));
   }
 
+  @Override
+  public boolean canOptimizeCount() {
+    boolean allChildrenCanProduceBitmaps = true;
+    for (BaseFilterOperator child : _filterOperators) {
+      allChildrenCanProduceBitmaps &= (child.isResultMatchingAll() | child.canProduceBitmaps());
+    }
+    return allChildrenCanProduceBitmaps;
+  }
+
+  @Override
+  public int getNumMatchingDocs() {
+    List<BitmapCollection> children = new ArrayList<>(_filterOperators.size());
+    for (BaseFilterOperator child : _filterOperators) {
+      if (!child.isResultMatchingAll()) {
+        children.add(child.getBitmaps());
+      }
+    }
+    switch (children.size()) {
+      case 0:
+        return _filterOperators.get(0).getNumMatchingDocs();
+      case 1:
+        return children.get(0).cardinality();

Review comment:
       These 2 branches are not possible to hit. We can change this switch to a if check

##########
File path: pinot-core/src/main/java/org/apache/pinot/core/operator/filter/OrFilterOperator.java
##########
@@ -32,6 +34,8 @@
   private final List<BaseFilterOperator> _filterOperators;
   private final int _numDocs;
 
+  private boolean _anyChildrenMatchAll = false;

Review comment:
       Same comment as the `AndFilterOperator`. All the children cannot have `isResultMatchingAll()` or `isResultEmpty()` as `false`

##########
File path: pinot-core/src/main/java/org/apache/pinot/core/operator/filter/SortedIndexBasedFilterOperator.java
##########
@@ -132,6 +133,96 @@ protected FilterBlock getNextBlock() {
     }
   }
 
+  @Override
+  public boolean canOptimizeCount() {
+    return true;
+  }
+
+  @Override
+  public int getNumMatchingDocs() {
+    int count = 0;
+    boolean exclusive = _predicateEvaluator.isExclusive();
+    if (_predicateEvaluator instanceof SortedDictionaryBasedRangePredicateEvaluator) {
+      // For RANGE predicate, use start/end document id to construct a new document id range
+      SortedDictionaryBasedRangePredicateEvaluator rangePredicateEvaluator =
+          (SortedDictionaryBasedRangePredicateEvaluator) _predicateEvaluator;
+      int startDocId = _sortedIndexReader.getDocIds(rangePredicateEvaluator.getStartDictId()).getLeft();
+      // NOTE: End dictionary id is exclusive in OfflineDictionaryBasedRangePredicateEvaluator.
+      int endDocId = _sortedIndexReader.getDocIds(rangePredicateEvaluator.getEndDictId() - 1).getRight();
+      count = endDocId - startDocId + 1;
+    } else {
+      int[] dictIds =
+          exclusive ? _predicateEvaluator.getNonMatchingDictIds() : _predicateEvaluator.getMatchingDictIds();
+      int numDictIds = dictIds.length;
+      // NOTE: PredicateEvaluator without matching/non-matching dictionary ids should not reach here.
+      Preconditions.checkState(numDictIds > 0);
+      if (numDictIds == 1) {
+        IntPair docIdRange = _sortedIndexReader.getDocIds(dictIds[0]);
+        count = docIdRange.getRight() - docIdRange.getLeft() + 1;
+      } else {
+        // Sort the dictIds in ascending order so that their respective docIdRanges are adjacent if they are adjacent
+        Arrays.sort(dictIds);

Review comment:
       Why do we need this? We can simply loop over all the dictIds, and add `docIdRange.getRight() - docIdRange.getLeft() + 1` to the count

##########
File path: pinot-core/src/main/java/org/apache/pinot/core/operator/filter/SortedIndexBasedFilterOperator.java
##########
@@ -132,6 +133,96 @@ protected FilterBlock getNextBlock() {
     }
   }
 
+  @Override
+  public boolean canOptimizeCount() {
+    return true;
+  }
+
+  @Override
+  public int getNumMatchingDocs() {
+    int count = 0;
+    boolean exclusive = _predicateEvaluator.isExclusive();
+    if (_predicateEvaluator instanceof SortedDictionaryBasedRangePredicateEvaluator) {
+      // For RANGE predicate, use start/end document id to construct a new document id range
+      SortedDictionaryBasedRangePredicateEvaluator rangePredicateEvaluator =
+          (SortedDictionaryBasedRangePredicateEvaluator) _predicateEvaluator;
+      int startDocId = _sortedIndexReader.getDocIds(rangePredicateEvaluator.getStartDictId()).getLeft();
+      // NOTE: End dictionary id is exclusive in OfflineDictionaryBasedRangePredicateEvaluator.
+      int endDocId = _sortedIndexReader.getDocIds(rangePredicateEvaluator.getEndDictId() - 1).getRight();
+      count = endDocId - startDocId + 1;
+    } else {
+      int[] dictIds =
+          exclusive ? _predicateEvaluator.getNonMatchingDictIds() : _predicateEvaluator.getMatchingDictIds();
+      int numDictIds = dictIds.length;
+      // NOTE: PredicateEvaluator without matching/non-matching dictionary ids should not reach here.
+      Preconditions.checkState(numDictIds > 0);
+      if (numDictIds == 1) {
+        IntPair docIdRange = _sortedIndexReader.getDocIds(dictIds[0]);
+        count = docIdRange.getRight() - docIdRange.getLeft() + 1;
+      } else {
+        // Sort the dictIds in ascending order so that their respective docIdRanges are adjacent if they are adjacent
+        Arrays.sort(dictIds);
+        IntPair lastDocIdRange = _sortedIndexReader.getDocIds(dictIds[0]);
+        for (int i = 1; i < numDictIds; i++) {
+          IntPair docIdRange = _sortedIndexReader.getDocIds(dictIds[i]);
+          if (docIdRange.getLeft() == lastDocIdRange.getRight() + 1) {
+            lastDocIdRange.setRight(docIdRange.getRight());
+          } else {
+            count += lastDocIdRange.getRight() - lastDocIdRange.getLeft() + 1;
+            lastDocIdRange = docIdRange;
+          }
+        }
+        count += lastDocIdRange.getRight() - lastDocIdRange.getLeft() + 1;
+      }
+    }
+    return exclusive ? _numDocs - count : count;
+  }
+
+  @Override
+  public boolean canProduceBitmaps() {
+    return true;
+  }
+
+  @Override
+  public BitmapCollection getBitmaps() {
+    MutableRoaringBitmap bitmap = new MutableRoaringBitmap();
+    boolean exclusive = _predicateEvaluator.isExclusive();
+    if (_predicateEvaluator instanceof SortedDictionaryBasedRangePredicateEvaluator) {
+      // For RANGE predicate, use start/end document id to construct a new document id range
+      SortedDictionaryBasedRangePredicateEvaluator rangePredicateEvaluator =
+          (SortedDictionaryBasedRangePredicateEvaluator) _predicateEvaluator;
+      int startDocId = _sortedIndexReader.getDocIds(rangePredicateEvaluator.getStartDictId()).getLeft();
+      // NOTE: End dictionary id is exclusive in OfflineDictionaryBasedRangePredicateEvaluator.
+      int endDocId = _sortedIndexReader.getDocIds(rangePredicateEvaluator.getEndDictId() - 1).getRight();
+      bitmap.add(startDocId, endDocId + 1L);
+    } else {
+      int[] dictIds =
+          exclusive ? _predicateEvaluator.getNonMatchingDictIds() : _predicateEvaluator.getMatchingDictIds();
+      int numDictIds = dictIds.length;
+      // NOTE: PredicateEvaluator without matching/non-matching dictionary ids should not reach here.
+      Preconditions.checkState(numDictIds > 0);
+      if (numDictIds == 1) {
+        IntPair docIdRange = _sortedIndexReader.getDocIds(dictIds[0]);
+        bitmap.add(docIdRange.getLeft(), docIdRange.getRight() + 1L);
+      } else {
+        // Sort the dictIds in ascending order so that their respective docIdRanges are adjacent if they are adjacent

Review comment:
       Will this be faster than adding each range individually?

##########
File path: pinot-core/src/main/java/org/apache/pinot/core/operator/filter/AndFilterOperator.java
##########
@@ -45,6 +47,39 @@ protected FilterBlock getNextBlock() {
     return new FilterBlock(new AndDocIdSet(filterBlockDocIdSets));
   }
 
+  @Override
+  public boolean canOptimizeCount() {
+    boolean allChildrenCanProduceBitmaps = true;
+    for (BaseFilterOperator child : _filterOperators) {
+      allChildrenCanProduceBitmaps &= (child.isResultMatchingAll() | child.canProduceBitmaps());

Review comment:
       All the child operators will always have `isResultEmpty()` and `isResultMatchingAll()` false, no need to have an extra check (may add an assert for readability).
   
   Shall we early terminate?
   ```
     for (BaseFilterOperator child : _filterOperators) {
       if (!child.canProduceBitmaps()) {
         return false;
       }
     }
     return true;
   ```

##########
File path: pinot-core/src/main/java/org/apache/pinot/core/operator/filter/BaseFilterOperator.java
##########
@@ -40,4 +43,128 @@ public boolean isResultEmpty() {
   public boolean isResultMatchingAll() {
     return false;
   }
+
+  /**
+   * Returns {@code true} if the filter has an optimized count implementation.
+   */
+  public boolean canOptimizeCount() {
+    return false;
+  }
+
+  /**
+   * @return the number of matching docs, or throws if it cannot produce this count.
+   */
+  public int getNumMatchingDocs() {
+    throw new UnsupportedOperationException();
+  }
+
+  /**
+   * @return true if the filter operator can produce a bitmap of docIds
+   */
+  public boolean canProduceBitmaps() {
+    return false;
+  }
+
+  /**
+   * @return bitmaps of matching docIds
+   */
+  public BaseFilterOperator.BitmapCollection getBitmaps() {
+    throw new UnsupportedOperationException();
+  }
+
+  public static class BitmapCollection {

Review comment:
       Can we add some tests on this class, especially when `_bitmaps` is empty? I feel some of the corner cases are not handled properly

##########
File path: pinot-core/src/main/java/org/apache/pinot/core/operator/filter/BaseFilterOperator.java
##########
@@ -40,4 +43,128 @@ public boolean isResultEmpty() {
   public boolean isResultMatchingAll() {
     return false;
   }
+
+  /**
+   * Returns {@code true} if the filter has an optimized count implementation.
+   */
+  public boolean canOptimizeCount() {
+    return false;
+  }
+
+  /**
+   * @return the number of matching docs, or throws if it cannot produce this count.
+   */
+  public int getNumMatchingDocs() {
+    throw new UnsupportedOperationException();
+  }
+
+  /**
+   * @return true if the filter operator can produce a bitmap of docIds
+   */
+  public boolean canProduceBitmaps() {
+    return false;
+  }
+
+  /**
+   * @return bitmaps of matching docIds
+   */
+  public BaseFilterOperator.BitmapCollection getBitmaps() {
+    throw new UnsupportedOperationException();
+  }
+
+  public static class BitmapCollection {
+    private final int _numDocs;
+    private boolean _inverted;
+    private final ImmutableRoaringBitmap[] _bitmaps;

Review comment:
       Is it possible to support both `RoaringBitmap` and `ImmutableRoaringBitmap` (using `ImmutableBitmapDataProvider`) in case some bitmaps are not from the inverted index, and based on my understanding, creating `MutableRoaringBitmap` is more costly than `RoaringBitmap`

##########
File path: pinot-core/src/main/java/org/apache/pinot/core/operator/filter/BitmapBasedFilterOperator.java
##########
@@ -100,6 +101,67 @@ protected FilterBlock getNextBlock() {
     }
   }
 
+  @Override
+  public boolean canOptimizeCount() {
+    return true;
+  }
+
+  @Override
+  public int getNumMatchingDocs() {
+    int count = 0;
+    if (_invertedIndexReader == null) {
+      count = _docIds.getCardinality();
+    } else {
+      int[] dictIds = _exclusive
+          ? _predicateEvaluator.getNonMatchingDictIds()
+          : _predicateEvaluator.getMatchingDictIds();
+      switch (dictIds.length) {
+        case 0:
+          break;
+        case 1: {
+          count = _invertedIndexReader.getDocIds(dictIds[0]).getCardinality();
+          break;
+        }
+        case 2: {
+          count = ImmutableRoaringBitmap.orCardinality(_invertedIndexReader.getDocIds(dictIds[0]),
+              _invertedIndexReader.getDocIds(dictIds[1]));
+          break;
+        }
+        default: {
+          // this could be optimised if the bitmaps are known to be disjoint (as in a single value bitmap index)

Review comment:
       We may get this info from `DataSource.getDataSourceMetadata().isSingleValue()`. IMO this is worth optimizing in this PR as it is the most common case

##########
File path: pinot-core/src/main/java/org/apache/pinot/core/operator/filter/BitmapBasedFilterOperator.java
##########
@@ -100,6 +101,67 @@ protected FilterBlock getNextBlock() {
     }
   }
 
+  @Override
+  public boolean canOptimizeCount() {
+    return true;
+  }
+
+  @Override
+  public int getNumMatchingDocs() {
+    int count = 0;
+    if (_invertedIndexReader == null) {

Review comment:
       (minor) Shall we keep the same if check condition in `getNumMatchingDocs()` and `getBitmaps()` for readability

##########
File path: pinot-core/src/main/java/org/apache/pinot/core/operator/filter/OrFilterOperator.java
##########
@@ -60,4 +64,37 @@ public String toExplainString() {
   public List<Operator> getChildOperators() {
     return new ArrayList<>(_filterOperators);
   }
+
+  @Override
+  public boolean canOptimizeCount() {
+    boolean allChildrenProduceBitmaps = true;
+    for (BaseFilterOperator child : _filterOperators) {
+      allChildrenProduceBitmaps &= child.canProduceBitmaps();
+      _anyChildrenMatchAll |= child.isResultMatchingAll();
+    }
+    return _anyChildrenMatchAll | allChildrenProduceBitmaps;
+  }
+
+  @Override
+  public int getNumMatchingDocs() {
+    if (_anyChildrenMatchAll) {
+      return _numDocs;
+    }
+    List<BitmapCollection> children = new ArrayList<>(_filterOperators.size());
+    for (BaseFilterOperator child : _filterOperators) {
+      children.add(child.getBitmaps());
+    }
+    switch (children.size()) {
+      case 1:
+        return children.get(0).cardinality();

Review comment:
       This branch should never be hit

##########
File path: pinot-core/src/main/java/org/apache/pinot/core/operator/filter/BitmapBasedFilterOperator.java
##########
@@ -100,6 +101,67 @@ protected FilterBlock getNextBlock() {
     }
   }
 
+  @Override
+  public boolean canOptimizeCount() {
+    return true;
+  }
+
+  @Override
+  public int getNumMatchingDocs() {
+    int count = 0;
+    if (_invertedIndexReader == null) {
+      count = _docIds.getCardinality();
+    } else {
+      int[] dictIds = _exclusive
+          ? _predicateEvaluator.getNonMatchingDictIds()
+          : _predicateEvaluator.getMatchingDictIds();
+      switch (dictIds.length) {
+        case 0:
+          break;
+        case 1: {
+          count = _invertedIndexReader.getDocIds(dictIds[0]).getCardinality();
+          break;
+        }
+        case 2: {
+          count = ImmutableRoaringBitmap.orCardinality(_invertedIndexReader.getDocIds(dictIds[0]),
+              _invertedIndexReader.getDocIds(dictIds[1]));
+          break;
+        }
+        default: {
+          // this could be optimised if the bitmaps are known to be disjoint (as in a single value bitmap index)
+          MutableRoaringBitmap bitmap = new MutableRoaringBitmap();
+          for (int dictId : dictIds) {
+            bitmap.or(_invertedIndexReader.getDocIds(dictId));
+          }
+          count = bitmap.getCardinality();
+          break;
+        }
+      }
+    }
+    return _exclusive ? _numDocs - count : count;
+  }
+
+  @Override
+  public boolean canProduceBitmaps() {
+    return true;
+  }
+
+  @Override
+  public BitmapCollection getBitmaps() {
+    if (_docIds == null) {
+      int[] dictIds = _exclusive
+          ? _predicateEvaluator.getNonMatchingDictIds()
+          : _predicateEvaluator.getMatchingDictIds();
+      ImmutableRoaringBitmap[] bitmaps = new ImmutableRoaringBitmap[dictIds.length];
+      for (int i = 0; i < dictIds.length; i++) {
+        bitmaps[i] = (_invertedIndexReader.getDocIds(dictIds[i]));

Review comment:
       For SV column, we may `or` the disjoint bitmaps before creating the `BitmapCollection`




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org