You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by GitBox <gi...@apache.org> on 2022/06/15 10:19:09 UTC

[GitHub] [pinot] richardstartin commented on a diff in pull request #8891: Optimize dictionary lookup for IN clause

richardstartin commented on code in PR #8891:
URL: https://github.com/apache/pinot/pull/8891#discussion_r897801870


##########
pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/NotInPredicateEvaluatorFactory.java:
##########
@@ -70,67 +68,80 @@ public static BaseDictionaryBasedPredicateEvaluator newDictionaryBasedEvaluator(
    */
   public static BaseRawValueBasedPredicateEvaluator newRawValueBasedEvaluator(NotInPredicate notInPredicate,
       DataType dataType) {
-    List<String> values = notInPredicate.getValues();
-    int hashSetSize = HashUtil.getMinHashSetSize(values.size());
     switch (dataType) {
       case INT: {
-        IntSet nonMatchingValues = new IntOpenHashSet(hashSetSize);
-        for (String value : values) {
-          nonMatchingValues.add(Integer.parseInt(value));
+        int[] intValues = notInPredicate.getIntValues();
+        IntSet nonMatchingValues = new IntOpenHashSet(HashUtil.getMinHashSetSize(intValues.length));
+        for (int value : intValues) {
+          nonMatchingValues.add(value);
         }
         return new IntRawValueBasedNotInPredicateEvaluator(notInPredicate, nonMatchingValues);
       }
       case LONG: {
-        LongSet nonMatchingValues = new LongOpenHashSet(hashSetSize);
-        for (String value : values) {
-          nonMatchingValues.add(Long.parseLong(value));
+        long[] longValues = notInPredicate.getLongValues();
+        LongSet nonMatchingValues = new LongOpenHashSet(HashUtil.getMinHashSetSize(longValues.length));
+        for (long value : longValues) {
+          nonMatchingValues.add(value);
         }
         return new LongRawValueBasedNotInPredicateEvaluator(notInPredicate, nonMatchingValues);
       }
       case FLOAT: {
-        FloatSet nonMatchingValues = new FloatOpenHashSet(hashSetSize);
-        for (String value : values) {
-          nonMatchingValues.add(Float.parseFloat(value));
+        float[] floatValues = notInPredicate.getFloatValues();
+        FloatSet nonMatchingValues = new FloatOpenHashSet(HashUtil.getMinHashSetSize(floatValues.length));
+        for (float value : floatValues) {
+          nonMatchingValues.add(value);
         }
         return new FloatRawValueBasedNotInPredicateEvaluator(notInPredicate, nonMatchingValues);
       }
       case DOUBLE: {
-        DoubleSet nonMatchingValues = new DoubleOpenHashSet(hashSetSize);
-        for (String value : values) {
-          nonMatchingValues.add(Double.parseDouble(value));
+        double[] doubleValues = notInPredicate.getDoubleValues();
+        DoubleSet nonMatchingValues = new DoubleOpenHashSet(HashUtil.getMinHashSetSize(doubleValues.length));
+        for (double value : doubleValues) {
+          nonMatchingValues.add(value);
         }
         return new DoubleRawValueBasedNotInPredicateEvaluator(notInPredicate, nonMatchingValues);
       }
       case BIG_DECIMAL: {
-        TreeSet<BigDecimal> nonMatchingValues = new TreeSet<>();
-        for (String value : values) {
-          nonMatchingValues.add(new BigDecimal(value));
-        }
+        BigDecimal[] bigDecimalValues = notInPredicate.getBigDecimalValues();
+        // NOTE: Use TreeSet because BigDecimal's compareTo() is not consistent with equals()
+        //       E.g. compareTo(3.0, 3) returns 0 but equals(3.0, 3) returns false
+        TreeSet<BigDecimal> nonMatchingValues = new TreeSet<>(Arrays.asList(bigDecimalValues));
         return new BigDecimalRawValueBasedNotInPredicateEvaluator(notInPredicate, nonMatchingValues);
       }
       case BOOLEAN: {
-        IntSet nonMatchingValues = new IntOpenHashSet(hashSetSize);
-        for (String value : values) {
-          nonMatchingValues.add(BooleanUtils.toInt(value));
+        int[] booleanValues = notInPredicate.getBooleanValues();
+        IntSet nonMatchingValues = new IntOpenHashSet(HashUtil.getMinHashSetSize(booleanValues.length));
+        for (int value : booleanValues) {
+          nonMatchingValues.add(value);
         }
         return new IntRawValueBasedNotInPredicateEvaluator(notInPredicate, nonMatchingValues);
       }
       case TIMESTAMP: {
-        LongSet nonMatchingValues = new LongOpenHashSet(hashSetSize);
-        for (String value : values) {
-          nonMatchingValues.add(TimestampUtils.toMillisSinceEpoch(value));
+        long[] timestampValues = notInPredicate.getTimestampValues();
+        LongSet nonMatchingValues = new LongOpenHashSet(HashUtil.getMinHashSetSize(timestampValues.length));
+        for (long value : timestampValues) {
+          nonMatchingValues.add(value);
         }
         return new LongRawValueBasedNotInPredicateEvaluator(notInPredicate, nonMatchingValues);
       }
       case STRING: {
-        Set<String> nonMatchingValues = new ObjectOpenHashSet<>(hashSetSize);
-        nonMatchingValues.addAll(values);
+        List<String> stringValues = notInPredicate.getValues();
+        Set<String> nonMatchingValues = new ObjectOpenHashSet<>(HashUtil.getMinHashSetSize(stringValues.size()));
+        // NOTE: Add value-by-value to avoid overhead
+        for (String value : stringValues) {
+          //noinspection UseBulkOperation
+          nonMatchingValues.add(value);
+        }
         return new StringRawValueBasedNotInPredicateEvaluator(notInPredicate, nonMatchingValues);
       }
       case BYTES: {
-        Set<ByteArray> nonMatchingValues = new ObjectOpenHashSet<>(hashSetSize);
-        for (String value : values) {
-          nonMatchingValues.add(BytesUtils.toByteArray(value));
+        ByteArray[] bytesValues = notInPredicate.getBytesValues();
+        Set<ByteArray> nonMatchingValues = new ObjectOpenHashSet<>(HashUtil.getMinHashSetSize(bytesValues.length));
+        // NOTE: Add value-by-value to avoid overhead
+        //noinspection ManualArrayToCollectionCopy

Review Comment:
   What is the overhead being avoided here? Have you compared with 
   
   ```java
           Set<ByteArray> nonMatchingValues = new ObjectOpenHashSet<>(Arrays.asList(notInPredicate.getBytesValues()));
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org