You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by GitBox <gi...@apache.org> on 2020/08/17 18:09:47 UTC
[GitHub] [incubator-pinot] Jackie-Jiang commented on a change in pull request #5872: Support for exact distinct count for non int data types

Jackie-Jiang commented on a change in pull request #5872:
URL: https://github.com/apache/incubator-pinot/pull/5872#discussion_r471661236



##########
File path: pinot-common/src/main/java/org/apache/pinot/common/function/AggregationFunctionType.java
##########
@@ -61,7 +61,8 @@
   PERCENTILEMV("percentileMV"),
   PERCENTILEESTMV("percentileEstMV"),
   PERCENTILETDIGESTMV("percentileTDigestMV"),
-  DISTINCT("distinct");
+  DISTINCT("distinct"),
+  DISTINCTRAWBLOOMFILTER("distinctRawBloomFilter");

Review comment:
       What is this for?

##########
File path: pinot-core/src/main/java/org/apache/pinot/core/common/ObjectSerDeUtils.java
##########
@@ -111,6 +127,14 @@ public static ObjectType getObjectType(Object value) {
         return ObjectType.Geometry;
       } else if (value instanceof RoaringBitmap) {
         return ObjectType.RoaringBitmap;
+      } else if (value instanceof LongSet) {
+        return ObjectType.LongSet;
+      } else if (value instanceof it.unimi.dsi.fastutil.floats.FloatSet) {

Review comment:
       ```suggestion
         } else if (value instanceof FloatSet) {
   ```

##########
File path: pinot-core/src/main/java/org/apache/pinot/core/common/ObjectSerDeUtils.java
##########
@@ -538,23 +695,8 @@ public RoaringBitmap deserialize(ByteBuffer byteBuffer) {
 
   // NOTE: DO NOT change the order, it has to be the same order as the ObjectType
   //@formatter:off
-  private static final ObjectSerDe[] SER_DES = {
-      STRING_SER_DE,
-      LONG_SER_DE,
-      DOUBLE_SER_DE,
-      DOUBLE_ARRAY_LIST_SER_DE,
-      AVG_PAIR_SER_DE,
-      MIN_MAX_RANGE_PAIR_SER_DE,
-      HYPER_LOG_LOG_SER_DE,
-      QUANTILE_DIGEST_SER_DE,
-      MAP_SER_DE,
-      INT_SET_SER_DE,
-      TDIGEST_SER_DE,
-      DISTINCT_TABLE_SER_DE,
-      DATA_SKETCH_SER_DE,
-      GEOMETRY_SER_DE,
-      ROARING_BITMAP_SER_DE
-  };
+  private static final ObjectSerDe[] SER_DES =

Review comment:
       Revert this reformatting (you may want to enable formatter markers in comments in your IDE).

##########
File path: pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/DistinctCountAggregationFunction.java
##########
@@ -233,41 +241,103 @@ public void aggregateGroupByMV(int length, int[][] groupKeysArray, GroupByResult
   }
 
   @Override
-  public IntOpenHashSet extractAggregationResult(AggregationResultHolder aggregationResultHolder) {
+  public AbstractCollection extractAggregationResult(AggregationResultHolder aggregationResultHolder) {
     Object result = aggregationResultHolder.getResult();
     if (result == null) {
-      return new IntOpenHashSet();
+      return emptyCollection();
     }
 
     if (result instanceof DictIdsWrapper) {
       // For dictionary-encoded expression, convert dictionary ids to hash code of the values
       return convertToValueSet((DictIdsWrapper) result);
     } else {
       // For non-dictionary-encoded expression, directly return the value set
-      return (IntOpenHashSet) result;
+      return (AbstractCollection) result;
     }
   }
 
+  private AbstractCollection emptyCollection() {
+    return new AbstractCollection() {

Review comment:
       I don't think this works for ser/de. You need to construct a type-specific set based on the data type.

##########
File path: pinot-core/src/main/java/org/apache/pinot/core/operator/query/DictionaryBasedAggregationOperator.java
##########
@@ -77,36 +83,42 @@ protected IntermediateResultsBlock getNextBlock() {
               .add(new MinMaxRangePair(dictionary.getDoubleValue(0), dictionary.getDoubleValue(dictionarySize - 1)));
           break;
         case DISTINCTCOUNT:
-          IntOpenHashSet set = new IntOpenHashSet(dictionarySize);
+          AbstractCollection set;
           switch (dictionary.getValueType()) {
             case INT:
+              set = new IntOpenHashSet(dictionarySize);
               for (int dictId = 0; dictId < dictionarySize; dictId++) {
                 set.add(dictionary.getIntValue(dictId));
               }
               break;
             case LONG:
+              set = new LongOpenHashSet(dictionarySize);
               for (int dictId = 0; dictId < dictionarySize; dictId++) {
-                set.add(Long.hashCode(dictionary.getLongValue(dictId)));
+                set.add(dictionary.getLongValue(dictId));
               }
               break;
             case FLOAT:
+              set = new FloatOpenHashSet(dictionarySize);
               for (int dictId = 0; dictId < dictionarySize; dictId++) {
-                set.add(Float.hashCode(dictionary.getFloatValue(dictId)));
+                set.add(dictionary.getFloatValue(dictId));
               }
               break;
             case DOUBLE:
+              set = new DoubleOpenHashSet(dictionarySize);
               for (int dictId = 0; dictId < dictionarySize; dictId++) {
-                set.add(Double.hashCode(dictionary.getDoubleValue(dictId)));
+                set.add(dictionary.getDoubleValue(dictId));
               }
               break;
             case STRING:
+              set = new ObjectOpenHashSet<ByteBuffer>(dictionarySize);
               for (int dictId = 0; dictId < dictionarySize; dictId++) {
-                set.add(dictionary.getStringValue(dictId).hashCode());
+                set.add(ByteBuffer.wrap(dictionary.getStringValue(dictId).getBytes(Charsets.UTF_8)));

Review comment:
       Suggest using `ByteArray` instead of `ByteBuffer` to store bytes

##########
File path: pinot-core/src/main/java/org/apache/pinot/core/common/ObjectSerDeUtils.java
##########
@@ -111,6 +127,14 @@ public static ObjectType getObjectType(Object value) {
         return ObjectType.Geometry;
       } else if (value instanceof RoaringBitmap) {
         return ObjectType.RoaringBitmap;
+      } else if (value instanceof LongSet) {
+        return ObjectType.LongSet;
+      } else if (value instanceof it.unimi.dsi.fastutil.floats.FloatSet) {
+        return ObjectType.FloatSet;
+      } else if (value instanceof it.unimi.dsi.fastutil.doubles.DoubleSet) {

Review comment:
       ```suggestion
         } else if (value instanceof DoubleSet) {
   ```

##########
File path: pinot-core/src/main/java/org/apache/pinot/core/operator/query/DictionaryBasedAggregationOperator.java
##########
@@ -77,36 +83,42 @@ protected IntermediateResultsBlock getNextBlock() {
               .add(new MinMaxRangePair(dictionary.getDoubleValue(0), dictionary.getDoubleValue(dictionarySize - 1)));
           break;
         case DISTINCTCOUNT:
-          IntOpenHashSet set = new IntOpenHashSet(dictionarySize);
+          AbstractCollection set;
           switch (dictionary.getValueType()) {
             case INT:
+              set = new IntOpenHashSet(dictionarySize);
               for (int dictId = 0; dictId < dictionarySize; dictId++) {
                 set.add(dictionary.getIntValue(dictId));
               }
               break;
             case LONG:
+              set = new LongOpenHashSet(dictionarySize);
               for (int dictId = 0; dictId < dictionarySize; dictId++) {
-                set.add(Long.hashCode(dictionary.getLongValue(dictId)));
+                set.add(dictionary.getLongValue(dictId));
               }
               break;
             case FLOAT:
+              set = new FloatOpenHashSet(dictionarySize);
               for (int dictId = 0; dictId < dictionarySize; dictId++) {
-                set.add(Float.hashCode(dictionary.getFloatValue(dictId)));
+                set.add(dictionary.getFloatValue(dictId));
               }
               break;
             case DOUBLE:
+              set = new DoubleOpenHashSet(dictionarySize);
               for (int dictId = 0; dictId < dictionarySize; dictId++) {
-                set.add(Double.hashCode(dictionary.getDoubleValue(dictId)));
+                set.add(dictionary.getDoubleValue(dictId));
               }
               break;
             case STRING:
+              set = new ObjectOpenHashSet<ByteBuffer>(dictionarySize);
               for (int dictId = 0; dictId < dictionarySize; dictId++) {
-                set.add(dictionary.getStringValue(dictId).hashCode());
+                set.add(ByteBuffer.wrap(dictionary.getStringValue(dictId).getBytes(Charsets.UTF_8)));

Review comment:
       Use `StringUtils.encodeUtf8()` to encode the string for better performance.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org