Posted to commits@drill.apache.org by am...@apache.org on 2019/04/09 00:33:56 UTC

[drill] branch master updated: DRILL-7119: Compute range predicate selectivity using histograms.

This is an automated email from the ASF dual-hosted git repository.

amansinha pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git


The following commit(s) were added to refs/heads/master by this push:
     new 849f896  DRILL-7119: Compute range predicate selectivity using histograms.
849f896 is described below

commit 849f896c491118571303a1313f75bece37e80b7e
Author: Aman Sinha <as...@maprtech.com>
AuthorDate: Mon Mar 25 06:55:57 2019 -0700

    DRILL-7119: Compute range predicate selectivity using histograms.
    
    Address code review comments.  Add unit test for histogram usage.
    
    close apache/drill#1733
---
 .../drill/exec/planner/common/DrillStatsTable.java |   2 +-
 .../drill/exec/planner/common/Histogram.java       |   2 +-
 .../planner/common/NumericEquiDepthHistogram.java  | 188 ++++++++++++++++++++-
 .../exec/planner/cost/DrillRelMdSelectivity.java   |  28 ++-
 .../test/java/org/apache/drill/PlanTestBase.java   |   4 +-
 .../org/apache/drill/exec/sql/TestAnalyze.java     |  15 ++
 6 files changed, 225 insertions(+), 14 deletions(-)

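A quick orientation before the diffs: an equi-depth histogram stores bucket boundaries such that every bucket holds (approximately) the same number of rows, so the selectivity of a range predicate is the count of fully covered buckets plus a linearly interpolated fraction of the bucket containing the constant, divided by the total row count. A minimal self-contained sketch of that arithmetic with toy numbers (illustrative names only, not Drill code):

    class EquiDepthSelectivitySketch {
      public static void main(String[] args) {
        // Toy equi-depth histogram: boundaries {0, 10, 20, 30, 40} define
        // 4 buckets, each holding ~250 rows (1000 rows total).
        double[] buckets = {0, 10, 20, 30, 40};
        long numRowsPerBucket = 250;
        int numBuckets = buckets.length - 1;
        long totalRows = numBuckets * numRowsPerBucket;

        // Estimate "col < 25": buckets [0,10) and [10,20) are fully covered;
        // the containing bucket [20,30) contributes (25 - 20) / (30 - 20) of its rows.
        double value = 25.0;
        int n = 2;                                 // index of the containing bucket
        long coveredRows = n * numRowsPerBucket;   // 500
        long partialRows = (long) ((value - buckets[n]) / (buckets[n + 1] - buckets[n])
            * numRowsPerBucket);                   // 125
        System.out.println((double) (coveredRows + partialRows) / totalRows);  // 0.625
      }
    }
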
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java
index 65b0b64..d34a5e2 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java
@@ -215,7 +215,7 @@ public class DrillStatsTable {
 
             // get the histogram for this column
             Histogram hist = cs.getHistogram();
-            histogram.put(cs.getName(), hist);
+            histogram.put(cs.getName().toUpperCase(), hist);
           }
         }
       }
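
This one-line change stores each histogram under the upper-cased column name so that later lookups are effectively case-insensitive; presumably (an assumption, since the read side is not in this hunk) the lookup mirrors the write:

    // hypothetical read path mirroring the write side above
    Histogram hist = histogram.get(cs.getName().toUpperCase());
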
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/Histogram.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/Histogram.java
index 1980ee5..c6444a3 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/Histogram.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/Histogram.java
@@ -37,5 +37,5 @@ public interface Histogram {
    * @param filter
    * @return estimated selectivity or NULL if it could not be estimated for any reason
    */
-  Double estimatedSelectivity(RexNode filter);
+  Double estimatedSelectivity(final RexNode filter);
 }
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/NumericEquiDepthHistogram.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/NumericEquiDepthHistogram.java
index 386141e..9d5bf6f 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/NumericEquiDepthHistogram.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/NumericEquiDepthHistogram.java
@@ -21,8 +21,15 @@ package org.apache.drill.exec.planner.common;
 import com.fasterxml.jackson.annotation.JsonProperty;
 import com.fasterxml.jackson.annotation.JsonTypeName;
 
+import java.util.List;
+
+import org.apache.calcite.rex.RexCall;
 import org.apache.calcite.rex.RexNode;
+import org.apache.calcite.rex.RexLiteral;
 import com.clearspring.analytics.stream.quantile.TDigest;
+import org.apache.calcite.sql.SqlKind;
+import org.apache.calcite.sql.SqlOperator;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
 
 /**
  * A column specific equi-depth histogram which is meant for numeric data types
@@ -30,6 +37,19 @@ import com.clearspring.analytics.stream.quantile.TDigest;
 @JsonTypeName("numeric-equi-depth")
 public class NumericEquiDepthHistogram implements Histogram {
 
+  /**
+   * Use a small non-zero selectivity rather than 0 to account for the fact that
+   * histogram boundaries are approximate and even if some values lie outside the
+   * range, we cannot be absolutely sure
+   */
+  static final double SMALL_SELECTIVITY = 0.0001;
+
+  /**
+   * Use a large selectivity of 1.0 whenever we are reasonably confident that all rows
+   * qualify. Even if this is off by a small fraction, it is acceptable.
+   */
+  static final double LARGE_SELECTIVITY = 1.0;
+
   // For equi-depth, all buckets will have same (approx) number of rows
   @JsonProperty("numRowsPerBucket")
   private long numRowsPerBucket;
@@ -69,27 +89,177 @@ public class NumericEquiDepthHistogram implements Histogram {
   }
 
   @Override
-  public Double estimatedSelectivity(RexNode filter) {
+  public Double estimatedSelectivity(final RexNode filter) {
     if (numRowsPerBucket >= 0) {
-      return 1.0;
-    } else {
-      return null;
+      // at a minimum, the histogram should have a start and end point of 1 bucket, so at least 2 entries
+      Preconditions.checkArgument(buckets.length >= 2, "Histogram has invalid number of entries");
+      final int first = 0;
+      final int last = buckets.length - 1;
+
+      // number of buckets is 1 less than the total # entries in the buckets array since last
+      // entry is the end point of the last bucket
+      final int numBuckets = buckets.length - 1;
+      final long totalRows = numBuckets * numRowsPerBucket;
+      if (filter instanceof RexCall) {
+        // get the operator
+        SqlOperator op = ((RexCall) filter).getOperator();
+        if (op.getKind() == SqlKind.GREATER_THAN ||
+                op.getKind() == SqlKind.GREATER_THAN_OR_EQUAL) {
+          Double value = getLiteralValue(filter);
+          if (value != null) {
+
+            // *** Handle the boundary conditions first ***
+
+            // if value is less than or equal to the first bucket's start point then all rows qualify
+            int result = value.compareTo(buckets[first]);
+            if (result <= 0) {
+              return LARGE_SELECTIVITY;
+            }
+            // if value is greater than the end point of the last bucket, then none of the rows qualify
+            result = value.compareTo(buckets[last]);
+            if (result > 0) {
+              return SMALL_SELECTIVITY;
+            } else if (result == 0) {
+              if (op.getKind() == SqlKind.GREATER_THAN_OR_EQUAL) {
+                // value is exactly equal to the last bucket's end point so we take the ratio 1/bucket_width
+                long totalFilterRows = (long) (1 / (buckets[last] - buckets[last - 1]) * numRowsPerBucket);
+                double selectivity = (double) totalFilterRows / totalRows;
+                return selectivity;
+              } else {
+                // predicate is 'column > value' and value is equal to last bucket's endpoint, so none of
+                // the rows qualify
+                return SMALL_SELECTIVITY;
+              }
+            }
+
+            // *** End of boundary conditions ***
+
+            int n = getContainingBucket(value, numBuckets);
+            if (n >= 0) {
+              // all buckets to the right of containing bucket will be fully covered
+              int coveredBuckets = (last) - (n + 1);
+              long coveredRows = numRowsPerBucket * coveredBuckets;
+              // num matching rows in the current bucket is a function of (end_point_of_bucket - value)
+              long partialRows = (long) ((buckets[n + 1] - value) / (buckets[n + 1] - buckets[n]) * numRowsPerBucket);
+              long totalFilterRows = partialRows + coveredRows;
+              double selectivity = (double) totalFilterRows / totalRows;
+              return selectivity;
+            } else {
+              // value does not exist in any of the buckets
+              return SMALL_SELECTIVITY;
+            }
+          }
+        } else if (op.getKind() == SqlKind.LESS_THAN ||
+                op.getKind() == SqlKind.LESS_THAN_OR_EQUAL) {
+          Double value = getLiteralValue(filter);
+          if (value != null) {
+
+            // *** Handle the boundary conditions first ***
+
+            // if value is greater than the last bucket's end point then all rows qualify
+            int result = value.compareTo(buckets[last]);
+            if (result >= 0) {
+              return LARGE_SELECTIVITY;
+            }
+            // if value is less than the first bucket's start point then none of the rows qualify
+            result = value.compareTo(buckets[first]);
+            if (result < 0) {
+              return SMALL_SELECTIVITY;
+            } else if (result == 0) {
+              if (op.getKind() == SqlKind.LESS_THAN_OR_EQUAL) {
+                // value is exactly equal to the first bucket's start point so we take the ratio 1/bucket_width
+                long totalFilterRows = (long) (1 / (buckets[first + 1] - buckets[first]) * numRowsPerBucket);
+                double selectivity = (double) totalFilterRows / totalRows;
+                return selectivity;
+              } else {
+                // predicate is 'column < value' and value is equal to first bucket's start point, so none of
+                // the rows qualify
+                return SMALL_SELECTIVITY;
+              }
+            }
+
+            // *** End of boundary conditions ***
+
+            int n = getContainingBucket(value, numBuckets);
+            if (n >= 0) {
+              // all buckets to the left will be fully covered
+              int coveredBuckets = n;
+              long coveredRows = numRowsPerBucket * coveredBuckets;
+              // num matching rows in the current bucket is a function of (value - start_point_of_bucket)
+              long partialRows = (long) ((value - buckets[n]) / (buckets[n + 1] - buckets[n]) * numRowsPerBucket);
+              long totalFilterRows = partialRows + coveredRows;
+              double selectivity = (double) totalFilterRows / totalRows;
+              return selectivity;
+            } else {
+              // value does not exist in any of the buckets
+              return SMALL_SELECTIVITY;
+            }
+          }
+        }
+      }
+    }
+    return null;
+  }
+
+  private int getContainingBucket(final Double value, final int numBuckets) {
+    int i = 0;
+    int containingBucket = -1;
+    // check which bucket this value falls in
+    for (; i <= numBuckets; i++) {
+      int result = buckets[i].compareTo(value);
+      if (result > 0) {
+        containingBucket = i - 1;
+        break;
+      } else if (result == 0) {
+        containingBucket = i;
+        break;
+      }
+    }
+    return containingBucket;
+  }
+
+  private Double getLiteralValue(final RexNode filter) {
+    Double value = null;
+    List<RexNode> operands = ((RexCall) filter).getOperands();
+    if (operands.size() == 2 && operands.get(1) instanceof RexLiteral) {
+      RexLiteral l = ((RexLiteral) operands.get(1));
+
+      switch (l.getTypeName()) {
+        case DATE:
+        case TIMESTAMP:
+        case TIME:
+          value = (double) ((java.util.Calendar) l.getValue()).getTimeInMillis();
+          break;
+        case INTEGER:
+        case BIGINT:
+        case FLOAT:
+        case DOUBLE:
+        case DECIMAL:
+        case BOOLEAN:
+          value = l.getValueAs(Double.class);
+          break;
+        default:
+          break;
+      }
     }
+    return value;
   }
 
   /**
-   * Utility method to build a Numeric Equi-Depth Histogram from a t-digest byte array
+   * Build a Numeric Equi-Depth Histogram from a t-digest byte array
    * @param tdigest_array
+   * @param numBuckets
+   * @param nonNullCount
    * @return An instance of NumericEquiDepthHistogram
    */
-  public static NumericEquiDepthHistogram buildFromTDigest(byte[] tdigest_array,
-                                                           int numBuckets,
-                                                           long nonNullCount) {
+  public static NumericEquiDepthHistogram buildFromTDigest(final byte[] tdigest_array,
+                                                           final int numBuckets,
+                                                           final long nonNullCount) {
     TDigest tdigest = TDigest.fromBytes(java.nio.ByteBuffer.wrap(tdigest_array));
 
     NumericEquiDepthHistogram histogram = new NumericEquiDepthHistogram(numBuckets);
 
-    double q = 1.0/numBuckets;
+    final double q = 1.0/numBuckets;
     int i = 0;
     for (; i < numBuckets; i++) {
       // get the starting point of the i-th quantile
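
For a sense of how buildFromTDigest is used, here is a hedged sketch that round-trips a t-digest over synthetic data into a 10-bucket histogram; it assumes the clearspring TDigest byte API (compression constructor, add, byteSize, asBytes) alongside the names in this file:

    import java.nio.ByteBuffer;
    import com.clearspring.analytics.stream.quantile.TDigest;
    import org.apache.drill.exec.planner.common.NumericEquiDepthHistogram;

    class TDigestRoundTripSketch {
      static NumericEquiDepthHistogram build() {
        TDigest digest = new TDigest(100.0);       // compression factor
        for (int v = 1; v <= 1000; v++) {
          digest.add(v);                           // uniform toy data
        }
        ByteBuffer buf = ByteBuffer.allocate(digest.byteSize());
        digest.asBytes(buf);
        // 10 buckets over 1000 non-null values -> ~100 rows per bucket
        return NumericEquiDepthHistogram.buildFromTDigest(buf.array(), 10, 1000L);
      }
    }
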
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java
index aae8b1d..4a6646e 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java
@@ -19,6 +19,8 @@ package org.apache.drill.exec.planner.cost;
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.EnumSet;
+import java.util.Set;
 import org.apache.calcite.plan.RelOptUtil;
 import org.apache.calcite.plan.volcano.RelSubset;
 import org.apache.calcite.rel.RelNode;
@@ -45,6 +47,7 @@ import org.apache.drill.exec.physical.base.GroupScan;
 import org.apache.drill.exec.planner.common.DrillJoinRelBase;
 import org.apache.drill.exec.planner.common.DrillRelOptUtil;
 import org.apache.drill.exec.planner.common.DrillScanRelBase;
+import org.apache.drill.exec.planner.common.Histogram;
 import org.apache.drill.exec.planner.logical.DrillScanRel;
 import org.apache.drill.exec.planner.logical.DrillTable;
 import org.apache.drill.exec.planner.physical.PlannerSettings;
@@ -64,6 +67,11 @@ public class DrillRelMdSelectivity extends RelMdSelectivity {
    */
   private static final double LIKE_PREDICATE_SELECTIVITY = 0.05;
 
+  public static final Set<SqlKind> RANGE_PREDICATE =
+    EnumSet.of(
+      SqlKind.LESS_THAN, SqlKind.GREATER_THAN,
+      SqlKind.LESS_THAN_OR_EQUAL, SqlKind.GREATER_THAN_OR_EQUAL);
+
   @Override
   public Double getSelectivity(RelNode rel, RelMetadataQuery mq, RexNode predicate) {
     if (rel instanceof RelSubset && !DrillRelOptUtil.guessRows(rel)) {
@@ -145,6 +153,8 @@ public class DrillRelMdSelectivity extends RelMdSelectivity {
           orSel += RelMdUtil.guessSelectivity(orPred);  //CALCITE guess
         } else if (orPred.isA(SqlKind.EQUALS)) {
           orSel += computeEqualsSelectivity(table, orPred, fieldNames);
+        } else if (orPred.isA(RANGE_PREDICATE)) {
+          orSel += computeRangeSelectivity(table, orPred, fieldNames);
         } else if (orPred.isA(SqlKind.NOT_EQUALS)) {
           orSel += 1.0 - computeEqualsSelectivity(table, orPred, fieldNames);
         } else if (orPred.isA(SqlKind.LIKE)) {
@@ -167,7 +177,7 @@ public class DrillRelMdSelectivity extends RelMdSelectivity {
         } else if (orPred.isA(SqlKind.IS_NOT_NULL)) {
           orSel += computeIsNotNullSelectivity(table, orPred, fieldNames);
         } else {
-          //Use the CALCITE guess. TODO: Use histograms for COMPARISON operator
+          // Use the CALCITE guess.
           orSel += guessSelectivity(orPred);
         }
       }
@@ -188,6 +198,22 @@ public class DrillRelMdSelectivity extends RelMdSelectivity {
     return guessSelectivity(orPred);
   }
 
+  // Use the histogram, if available, to estimate range predicate selectivity
+  private double computeRangeSelectivity(DrillTable table, RexNode orPred, List<String> fieldNames) {
+    String col = getColumn(orPred, fieldNames);
+    if (col != null) {
+      if (table.getStatsTable() != null
+        && table.getStatsTable().getHistogram(col) != null) {
+        Histogram histogram = table.getStatsTable().getHistogram(col);
+        Double sel = histogram.estimatedSelectivity(orPred);
+        if (sel != null) {
+          return sel;
+        }
+      }
+    }
+    return guessSelectivity(orPred);
+  }
+
   private double computeIsNotNullSelectivity(DrillTable table, RexNode orPred, List<String> fieldNames) {
     String col = getColumn(orPred, fieldNames);
     if (col != null) {
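
Worth noting: Histogram.estimatedSelectivity returns null whenever it cannot produce an estimate (no numeric literal, unsupported operator kind, uninitialized buckets), so computeRangeSelectivity quietly degrades to the Calcite guess. The method above condenses to roughly this (a sketch of the same behavior, not a replacement):

    Histogram histogram = (table.getStatsTable() == null || col == null)
        ? null : table.getStatsTable().getHistogram(col);
    Double sel = (histogram == null) ? null : histogram.estimatedSelectivity(orPred);
    return (sel != null) ? sel : guessSelectivity(orPred);
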
diff --git a/exec/java-exec/src/test/java/org/apache/drill/PlanTestBase.java b/exec/java-exec/src/test/java/org/apache/drill/PlanTestBase.java
index da257c0..34126cf 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/PlanTestBase.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/PlanTestBase.java
@@ -157,7 +157,7 @@ public class PlanTestBase extends BaseTestQuery {
       for (final String s : expectedPatterns) {
         final Pattern p = Pattern.compile(s);
         final Matcher m = p.matcher(plan);
-        assertTrue(EXPECTED_NOT_FOUND + s, m.find());
+        assertTrue(EXPECTED_NOT_FOUND + s + "\n" + plan, m.find());
       }
     }
 
@@ -166,7 +166,7 @@ public class PlanTestBase extends BaseTestQuery {
       for (final String s : excludedPatterns) {
         final Pattern p = Pattern.compile(s);
         final Matcher m = p.matcher(plan);
-        assertFalse(UNEXPECTED_FOUND + s, m.find());
+        assertFalse(UNEXPECTED_FOUND + s + "\n" + plan, m.find());
       }
     }
   }
diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/sql/TestAnalyze.java b/exec/java-exec/src/test/java/org/apache/drill/exec/sql/TestAnalyze.java
index 30d23b3..8270352 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/exec/sql/TestAnalyze.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/exec/sql/TestAnalyze.java
@@ -400,6 +400,21 @@ public class TestAnalyze extends BaseTestQuery {
               .baselineValues("`hire_date_and_time`", 7)
               .baselineValues("`salary`", 11)
               .go();
+
+      // test the use of the just created histogram
+      test("alter session set `planner.statistics.use` = true");
+
+      // check boundary conditions: last bucket
+      String query = "select 1 from dfs.tmp.employee1 where store_id > 21";
+      String[] expectedPlan1 = {"Filter\\(condition.*\\).*rowcount = 112.*,.*",
+              "Scan.*columns=\\[`store_id`\\].*rowcount = 1128.0.*"};
+      PlanTestBase.testPlanWithAttributesMatchingPatterns(query, expectedPlan1, new String[]{});
+
+      query = "select 1 from dfs.tmp.employee1 where store_id < 15";
+      String[] expectedPlan2 = {"Filter\\(condition.*\\).*rowcount = 676.*,.*",
+              "Scan.*columns=\\[`store_id`\\].*rowcount = 1128.0.*"};
+      PlanTestBase.testPlanWithAttributesMatchingPatterns(query, expectedPlan2, new String[]{});
+
     } finally {
       test("ALTER SESSION SET `planner.slice_target` = " + ExecConstants.SLICE_TARGET_DEFAULT);
     }
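
As a sanity check on those expected patterns: the scan reports 1128 rows, so the asserted filter rowcounts correspond to histogram-derived selectivities of roughly 112 / 1128 ≈ 0.099 for store_id > 21 and 676 / 1128 ≈ 0.599 for store_id < 15. Without `planner.statistics.use` enabled, both range predicates would get the same fixed default guess regardless of the constant, so these patterns only match when the histogram is actually consulted.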