You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@druid.apache.org by fj...@apache.org on 2020/08/26 03:05:59 UTC

[druid] branch master updated: Remove NUMERIC_HASHING_THRESHOLD (#10313)

This is an automated email from the ASF dual-hosted git repository.

fjy pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/druid.git


The following commit(s) were added to refs/heads/master by this push:
     new a9de00d  Remove NUMERIC_HASHING_THRESHOLD (#10313)
a9de00d is described below

commit a9de00d43ab34f6d9c5c1ba749617b7f2e1f1559
Author: Suneet Saldanha <su...@apache.org>
AuthorDate: Tue Aug 25 20:05:39 2020 -0700

    Remove NUMERIC_HASHING_THRESHOLD (#10313)
    
    * Make NUMERIC_HASHING_THRESHOLD configurable
    
    Change the default numeric hashing threshold to 1 and make it configurable.
    
    Benchmarks attached to this PR show that binary searches are not more faster
    than doing a set contains check. The attached flamegraph shows the amount of
    time a query spent in the binary search. Given the benchmarks, we can expect
    to see roughly a 2x speed up in this part of the query which works out to
    ~ a 10% faster query in this instance.
    
    * Remove NUMERIC_HASHING_THRESHOLD
    
    * Remove stale docs
---
 .../apache/druid/benchmark/ContainsBenchmark.java  | 103 +++++++++++++++++++++
 .../org/apache/druid/query/filter/InDimFilter.java |  59 ++++++------
 .../apache/druid/query/topn/TopNQueryBuilder.java  |   2 +-
 .../segment/join/filter/JoinFilterAnalyzer.java    |   8 +-
 .../filter/FloatAndDoubleFilteringTest.java        |   9 +-
 .../druid/segment/filter/LongFilteringTest.java    |   9 +-
 .../druid/segment/filter/TimeFilteringTest.java    |   5 +-
 7 files changed, 145 insertions(+), 50 deletions(-)

diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/ContainsBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/ContainsBenchmark.java
new file mode 100644
index 0000000..4e6eb95
--- /dev/null
+++ b/benchmarks/src/test/java/org/apache/druid/benchmark/ContainsBenchmark.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.benchmark;
+
+import it.unimi.dsi.fastutil.longs.LongArraySet;
+import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.Arrays;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+
+@State(Scope.Benchmark)
+@BenchmarkMode(Mode.AverageTime)
+@Warmup(iterations = 5)
+@Measurement(iterations = 10)
+@Fork(value = 1)
+public class ContainsBenchmark
+{
+  private static final long[] LONGS;
+  private static final long[] SORTED_LONGS;
+  private static final LongOpenHashSet LONG_HASH_SET;
+  private static final LongArraySet LONG_ARRAY_SET;
+
+  private long worstSearchValue;
+  private long worstSearchValueBin;
+
+  static {
+    LONGS = new long[16];
+    for (int i = 0; i < LONGS.length; i++) {
+      LONGS[i] = ThreadLocalRandom.current().nextInt(Short.MAX_VALUE);
+    }
+
+    LONG_HASH_SET = new LongOpenHashSet(LONGS);
+    LONG_ARRAY_SET = new LongArraySet(LONGS);
+    SORTED_LONGS = Arrays.copyOf(LONGS, LONGS.length);
+
+    Arrays.sort(SORTED_LONGS);
+
+  }
+
+  @Setup
+  public void setUp()
+  {
+    worstSearchValue = LONGS[LONGS.length - 1];
+    worstSearchValueBin = SORTED_LONGS[(SORTED_LONGS.length - 1) >>> 1];
+  }
+
+  @Benchmark
+  @BenchmarkMode(Mode.AverageTime)
+  @OutputTimeUnit(TimeUnit.NANOSECONDS)
+  public void linearSearch(Blackhole blackhole)
+  {
+    boolean found = LONG_ARRAY_SET.contains(worstSearchValue);
+    blackhole.consume(found);
+  }
+
+  @Benchmark
+  @BenchmarkMode(Mode.AverageTime)
+  @OutputTimeUnit(TimeUnit.NANOSECONDS)
+  public void hashSetSearch(Blackhole blackhole)
+  {
+
+    boolean found = LONG_HASH_SET.contains(worstSearchValueBin);
+    blackhole.consume(found);
+  }
+
+  @Benchmark
+  @BenchmarkMode(Mode.AverageTime)
+  @OutputTimeUnit(TimeUnit.NANOSECONDS)
+  public void binarySearch(Blackhole blackhole)
+  {
+    boolean found = Arrays.binarySearch(SORTED_LONGS, worstSearchValueBin) >= 0;
+    blackhole.consume(found);
+  }
+}
diff --git a/processing/src/main/java/org/apache/druid/query/filter/InDimFilter.java b/processing/src/main/java/org/apache/druid/query/filter/InDimFilter.java
index 8b0520c..063d6d3 100644
--- a/processing/src/main/java/org/apache/druid/query/filter/InDimFilter.java
+++ b/processing/src/main/java/org/apache/druid/query/filter/InDimFilter.java
@@ -66,7 +66,6 @@ import org.apache.druid.segment.vector.VectorColumnSelectorFactory;
 import javax.annotation.Nullable;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Collection;
 import java.util.Comparator;
 import java.util.HashSet;
@@ -78,10 +77,6 @@ import java.util.Set;
 
 public class InDimFilter extends AbstractOptimizableDimFilter implements Filter
 {
-  // determined through benchmark that binary search on long[] is faster than HashSet until ~16 elements
-  // Hashing threshold is not applied to String for now, String still uses ImmutableSortedSet
-  public static final int NUMERIC_HASHING_THRESHOLD = 16;
-
   // Values can contain `null` object
   private final Set<String> values;
   private final String dimension;
@@ -114,6 +109,25 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter
   }
 
   /**
+   *
+   * @param dimension
+   * @param values This collection instance can be reused if possible to avoid copying a big collection.
+   *               Callers should <b>not</b> modify the collection after it is passed to this constructor.
+   */
+  public InDimFilter(
+      String dimension,
+      Set<String> values
+  )
+  {
+    this(
+        dimension,
+        values,
+        null,
+        null
+    );
+  }
+
+  /**
    * This constructor should be called only in unit tests since accepting a Collection makes copying more likely.
    */
   @VisibleForTesting
@@ -483,14 +497,9 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter
       }
     }
 
-    if (longs.size() > NUMERIC_HASHING_THRESHOLD) {
-      final LongOpenHashSet longHashSet = new LongOpenHashSet(longs);
-      return longHashSet::contains;
-    } else {
-      final long[] longArray = longs.toLongArray();
-      Arrays.sort(longArray);
-      return input -> Arrays.binarySearch(longArray, input) >= 0;
-    }
+
+    final LongOpenHashSet longHashSet = new LongOpenHashSet(longs);
+    return longHashSet::contains;
   }
 
   private static DruidFloatPredicate createFloatPredicate(final Set<String> values)
@@ -503,16 +512,8 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter
       }
     }
 
-    if (floatBits.size() > NUMERIC_HASHING_THRESHOLD) {
-      final IntOpenHashSet floatBitsHashSet = new IntOpenHashSet(floatBits);
-
-      return input -> floatBitsHashSet.contains(Float.floatToIntBits(input));
-    } else {
-      final int[] floatBitsArray = floatBits.toIntArray();
-      Arrays.sort(floatBitsArray);
-
-      return input -> Arrays.binarySearch(floatBitsArray, Float.floatToIntBits(input)) >= 0;
-    }
+    final IntOpenHashSet floatBitsHashSet = new IntOpenHashSet(floatBits);
+    return input -> floatBitsHashSet.contains(Float.floatToIntBits(input));
   }
 
   private static DruidDoublePredicate createDoublePredicate(final Set<String> values)
@@ -525,16 +526,8 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter
       }
     }
 
-    if (doubleBits.size() > NUMERIC_HASHING_THRESHOLD) {
-      final LongOpenHashSet doubleBitsHashSet = new LongOpenHashSet(doubleBits);
-
-      return input -> doubleBitsHashSet.contains(Double.doubleToLongBits(input));
-    } else {
-      final long[] doubleBitsArray = doubleBits.toLongArray();
-      Arrays.sort(doubleBitsArray);
-
-      return input -> Arrays.binarySearch(doubleBitsArray, Double.doubleToLongBits(input)) >= 0;
-    }
+    final LongOpenHashSet doubleBitsHashSet = new LongOpenHashSet(doubleBits);
+    return input -> doubleBitsHashSet.contains(Double.doubleToLongBits(input));
   }
 
   @VisibleForTesting
diff --git a/processing/src/main/java/org/apache/druid/query/topn/TopNQueryBuilder.java b/processing/src/main/java/org/apache/druid/query/topn/TopNQueryBuilder.java
index 700ab58..6699085 100644
--- a/processing/src/main/java/org/apache/druid/query/topn/TopNQueryBuilder.java
+++ b/processing/src/main/java/org/apache/druid/query/topn/TopNQueryBuilder.java
@@ -236,7 +236,7 @@ public class TopNQueryBuilder
   {
     final Set<String> filterValues = Sets.newHashSet(values);
     filterValues.add(value);
-    dimFilter = new InDimFilter(dimensionName, filterValues, null, null);
+    dimFilter = new InDimFilter(dimensionName, filterValues);
     return this;
   }
 
diff --git a/processing/src/main/java/org/apache/druid/segment/join/filter/JoinFilterAnalyzer.java b/processing/src/main/java/org/apache/druid/segment/join/filter/JoinFilterAnalyzer.java
index 67b1ee0..d3dd5cf 100644
--- a/processing/src/main/java/org/apache/druid/segment/join/filter/JoinFilterAnalyzer.java
+++ b/processing/src/main/java/org/apache/druid/segment/join/filter/JoinFilterAnalyzer.java
@@ -438,9 +438,7 @@ public class JoinFilterAnalyzer
         for (String correlatedBaseColumn : correlationAnalysis.getBaseColumns()) {
           Filter rewrittenFilter = new InDimFilter(
               correlatedBaseColumn,
-              newFilterValues,
-              null,
-              null
+              newFilterValues
           ).toFilter();
           newFilters.add(rewrittenFilter);
         }
@@ -461,9 +459,7 @@ public class JoinFilterAnalyzer
 
           Filter rewrittenFilter = new InDimFilter(
               pushDownVirtualColumn.getOutputName(),
-              newFilterValues,
-              null,
-              null
+              newFilterValues
           ).toFilter();
           newFilters.add(rewrittenFilter);
         }
diff --git a/processing/src/test/java/org/apache/druid/segment/filter/FloatAndDoubleFilteringTest.java b/processing/src/test/java/org/apache/druid/segment/filter/FloatAndDoubleFilteringTest.java
index 1ef9315..4c597e7 100644
--- a/processing/src/test/java/org/apache/druid/segment/filter/FloatAndDoubleFilteringTest.java
+++ b/processing/src/test/java/org/apache/druid/segment/filter/FloatAndDoubleFilteringTest.java
@@ -76,6 +76,7 @@ public class FloatAndDoubleFilteringTest extends BaseFilterTest
   private static final String TIMESTAMP_COLUMN = "ts";
   private static int EXECUTOR_NUM_THREADS = 16;
   private static int EXECUTOR_NUM_TASKS = 2000;
+  private static final int NUM_FILTER_VALUES = 32;
 
   private static final InputRowParser<Map<String, Object>> PARSER = new MapInputRowParser(
       new TimeAndDimsParseSpec(
@@ -200,8 +201,8 @@ public class FloatAndDoubleFilteringTest extends BaseFilterTest
     );
 
     // cross the hashing threshold to test hashset implementation, filter on even values
-    List<String> infilterValues = new ArrayList<>(InDimFilter.NUMERIC_HASHING_THRESHOLD * 2);
-    for (int i = 0; i < InDimFilter.NUMERIC_HASHING_THRESHOLD * 2; i++) {
+    List<String> infilterValues = new ArrayList<>(NUM_FILTER_VALUES);
+    for (int i = 0; i < NUM_FILTER_VALUES; i++) {
       infilterValues.add(String.valueOf(i * 2));
     }
     assertFilterMatches(
@@ -377,8 +378,8 @@ public class FloatAndDoubleFilteringTest extends BaseFilterTest
     );
 
     // cross the hashing threshold to test hashset implementation, filter on even values
-    List<String> infilterValues = new ArrayList<>(InDimFilter.NUMERIC_HASHING_THRESHOLD * 2);
-    for (int i = 0; i < InDimFilter.NUMERIC_HASHING_THRESHOLD * 2; i++) {
+    List<String> infilterValues = new ArrayList<>(NUM_FILTER_VALUES);
+    for (int i = 0; i < NUM_FILTER_VALUES; i++) {
       infilterValues.add(String.valueOf(i * 2));
     }
     assertFilterMatchesMultithreaded(
diff --git a/processing/src/test/java/org/apache/druid/segment/filter/LongFilteringTest.java b/processing/src/test/java/org/apache/druid/segment/filter/LongFilteringTest.java
index 1cee93b..48975d2 100644
--- a/processing/src/test/java/org/apache/druid/segment/filter/LongFilteringTest.java
+++ b/processing/src/test/java/org/apache/druid/segment/filter/LongFilteringTest.java
@@ -73,6 +73,7 @@ public class LongFilteringTest extends BaseFilterTest
   private static final String TIMESTAMP_COLUMN = "ts";
   private static int EXECUTOR_NUM_THREADS = 16;
   private static int EXECUTOR_NUM_TASKS = 2000;
+  private static final int NUM_FILTER_VALUES = 32;
 
   private static final InputRowParser<Map<String, Object>> PARSER = new MapInputRowParser(
       new TimeAndDimsParseSpec(
@@ -245,8 +246,8 @@ public class LongFilteringTest extends BaseFilterTest
     );
 
     // cross the hashing threshold to test hashset implementation, filter on even values
-    List<String> infilterValues = new ArrayList<>(InDimFilter.NUMERIC_HASHING_THRESHOLD * 2);
-    for (int i = 0; i < InDimFilter.NUMERIC_HASHING_THRESHOLD * 2; i++) {
+    List<String> infilterValues = new ArrayList<>(NUM_FILTER_VALUES);
+    for (int i = 0; i < NUM_FILTER_VALUES; i++) {
       infilterValues.add(String.valueOf(i * 2));
     }
     assertFilterMatches(
@@ -393,8 +394,8 @@ public class LongFilteringTest extends BaseFilterTest
     );
 
     // cross the hashing threshold to test hashset implementation, filter on even values
-    List<String> infilterValues = new ArrayList<>(InDimFilter.NUMERIC_HASHING_THRESHOLD * 2);
-    for (int i = 0; i < InDimFilter.NUMERIC_HASHING_THRESHOLD * 2; i++) {
+    List<String> infilterValues = new ArrayList<>(NUM_FILTER_VALUES);
+    for (int i = 0; i < NUM_FILTER_VALUES; i++) {
       infilterValues.add(String.valueOf(i * 2));
     }
     assertFilterMatchesMultithreaded(
diff --git a/processing/src/test/java/org/apache/druid/segment/filter/TimeFilteringTest.java b/processing/src/test/java/org/apache/druid/segment/filter/TimeFilteringTest.java
index 0cb15cc..574fa01 100644
--- a/processing/src/test/java/org/apache/druid/segment/filter/TimeFilteringTest.java
+++ b/processing/src/test/java/org/apache/druid/segment/filter/TimeFilteringTest.java
@@ -67,6 +67,7 @@ import java.util.Map;
 public class TimeFilteringTest extends BaseFilterTest
 {
   private static final String TIMESTAMP_COLUMN = "ts";
+  private static final int NUM_FILTER_VALUES = 32;
 
   private static final InputRowParser<Map<String, Object>> PARSER = new MapInputRowParser(
       new TimeAndDimsParseSpec(
@@ -132,8 +133,8 @@ public class TimeFilteringTest extends BaseFilterTest
     );
 
     // cross the hashing threshold to test hashset implementation, filter on even values
-    List<String> infilterValues = new ArrayList<>(InDimFilter.NUMERIC_HASHING_THRESHOLD * 2);
-    for (int i = 0; i < InDimFilter.NUMERIC_HASHING_THRESHOLD * 2; i++) {
+    List<String> infilterValues = new ArrayList<>(NUM_FILTER_VALUES);
+    for (int i = 0; i < NUM_FILTER_VALUES; i++) {
       infilterValues.add(String.valueOf(i * 2));
     }
     assertFilterMatches(


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@druid.apache.org
For additional commands, e-mail: commits-help@druid.apache.org