You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by am...@apache.org on 2019/04/09 00:33:56 UTC
[drill] branch master updated: DRILL-7119: Compute range predicate
selectivity using histograms.
This is an automated email from the ASF dual-hosted git repository.
amansinha pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git
The following commit(s) were added to refs/heads/master by this push:
new 849f896 DRILL-7119: Compute range predicate selectivity using histograms.
849f896 is described below
commit 849f896c491118571303a1313f75bece37e80b7e
Author: Aman Sinha <as...@maprtech.com>
AuthorDate: Mon Mar 25 06:55:57 2019 -0700
DRILL-7119: Compute range predicate selectivity using histograms.
Address code review comments. Add unit test for histogram usage.
close apache/drill#1733
---
.../drill/exec/planner/common/DrillStatsTable.java | 2 +-
.../drill/exec/planner/common/Histogram.java | 2 +-
.../planner/common/NumericEquiDepthHistogram.java | 188 ++++++++++++++++++++-
.../exec/planner/cost/DrillRelMdSelectivity.java | 28 ++-
.../test/java/org/apache/drill/PlanTestBase.java | 4 +-
.../org/apache/drill/exec/sql/TestAnalyze.java | 15 ++
6 files changed, 225 insertions(+), 14 deletions(-)
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java
index 65b0b64..d34a5e2 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java
@@ -215,7 +215,7 @@ public class DrillStatsTable {
// get the histogram for this column
Histogram hist = cs.getHistogram();
- histogram.put(cs.getName(), hist);
+ histogram.put(cs.getName().toUpperCase(), hist);
}
}
}
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/Histogram.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/Histogram.java
index 1980ee5..c6444a3 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/Histogram.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/Histogram.java
@@ -37,5 +37,5 @@ public interface Histogram {
* @param filter
* @return estimated selectivity or NULL if it could not be estimated for any reason
*/
- Double estimatedSelectivity(RexNode filter);
+ Double estimatedSelectivity(final RexNode filter);
}
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/NumericEquiDepthHistogram.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/NumericEquiDepthHistogram.java
index 386141e..9d5bf6f 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/NumericEquiDepthHistogram.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/NumericEquiDepthHistogram.java
@@ -21,8 +21,15 @@ package org.apache.drill.exec.planner.common;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonTypeName;
+import java.util.List;
+
+import org.apache.calcite.rex.RexCall;
import org.apache.calcite.rex.RexNode;
+import org.apache.calcite.rex.RexLiteral;
import com.clearspring.analytics.stream.quantile.TDigest;
+import org.apache.calcite.sql.SqlKind;
+import org.apache.calcite.sql.SqlOperator;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
/**
* A column specific equi-depth histogram which is meant for numeric data types
@@ -30,6 +37,19 @@ import com.clearspring.analytics.stream.quantile.TDigest;
@JsonTypeName("numeric-equi-depth")
public class NumericEquiDepthHistogram implements Histogram {
+ /**
+ * Use a small non-zero selectivity rather than 0 to account for the fact that
+ * histogram boundaries are approximate and even if some values lie outside the
+ * range, we cannot be absolutely sure
+ */
+ static final double SMALL_SELECTIVITY = 0.0001;
+
+ /**
+ * Use a large selectivity of 1.0 whenever we are reasonably confident that all rows
+ * qualify. Even if this is off by a small fraction, it is acceptable.
+ */
+ static final double LARGE_SELECTIVITY = 1.0;
+
// For equi-depth, all buckets will have same (approx) number of rows
@JsonProperty("numRowsPerBucket")
private long numRowsPerBucket;
@@ -69,27 +89,177 @@ public class NumericEquiDepthHistogram implements Histogram {
}
@Override
- public Double estimatedSelectivity(RexNode filter) {
+ public Double estimatedSelectivity(final RexNode filter) {
if (numRowsPerBucket >= 0) {
- return 1.0;
- } else {
- return null;
+ // at a minimum, the histogram should have a start and end point of 1 bucket, so at least 2 entries
+ Preconditions.checkArgument(buckets.length >= 2, "Histogram has invalid number of entries");
+ final int first = 0;
+ final int last = buckets.length - 1;
+
+ // number of buckets is 1 less than the total # entries in the buckets array since last
+ // entry is the end point of the last bucket
+ final int numBuckets = buckets.length - 1;
+ final long totalRows = numBuckets * numRowsPerBucket;
+ if (filter instanceof RexCall) {
+ // get the operator
+ SqlOperator op = ((RexCall) filter).getOperator();
+ if (op.getKind() == SqlKind.GREATER_THAN ||
+ op.getKind() == SqlKind.GREATER_THAN_OR_EQUAL) {
+ Double value = getLiteralValue(filter);
+ if (value != null) {
+
+ // *** Handle the boundary conditions first ***
+
+ // if value is less than or equal to the first bucket's start point then all rows qualify
+ int result = value.compareTo(buckets[first]);
+ if (result <= 0) {
+ return LARGE_SELECTIVITY;
+ }
+ // if value is greater than the end point of the last bucket, then none of the rows qualify
+ result = value.compareTo(buckets[last]);
+ if (result > 0) {
+ return SMALL_SELECTIVITY;
+ } else if (result == 0) {
+ if (op.getKind() == SqlKind.GREATER_THAN_OR_EQUAL) {
+ // value is exactly equal to the last bucket's end point so we take the ratio 1/bucket_width
+ long totalFilterRows = (long) (1 / (buckets[last] - buckets[last - 1]) * numRowsPerBucket);
+ double selectivity = (double) totalFilterRows / totalRows;
+ return selectivity;
+ } else {
+ // predicate is 'column > value' and value is equal to last bucket's endpoint, so none of
+ // the rows qualify
+ return SMALL_SELECTIVITY;
+ }
+ }
+
+ // *** End of boundary conditions ****
+
+ int n = getContainingBucket(value, numBuckets);
+ if (n >= 0) {
+ // all buckets to the right of containing bucket will be fully covered
+ int coveredBuckets = (last) - (n + 1);
+ long coveredRows = numRowsPerBucket * coveredBuckets;
+ // num matching rows in the current bucket is a function of (end_point_of_bucket - value)
+ long partialRows = (long) ((buckets[n + 1] - value) / (buckets[n + 1] - buckets[n]) * numRowsPerBucket);
+ long totalFilterRows = partialRows + coveredRows;
+ double selectivity = (double)totalFilterRows/totalRows;
+ return selectivity;
+ } else {
+ // value does not exist in any of the buckets
+ return SMALL_SELECTIVITY;
+ }
+ }
+ } else if (op.getKind() == SqlKind.LESS_THAN ||
+ op.getKind() == SqlKind.LESS_THAN_OR_EQUAL) {
+ Double value = getLiteralValue(filter);
+ if (value != null) {
+
+ // *** Handle the boundary conditions first ***
+
+ // if value is greater than the last bucket's end point then all rows qualify
+ int result = value.compareTo(buckets[last]);
+ if (result >= 0) {
+ return LARGE_SELECTIVITY;
+ }
+ // if value is less than the first bucket's start point then none of the rows qualify
+ result = value.compareTo(buckets[first]);
+ if (result < 0) {
+ return SMALL_SELECTIVITY;
+ } else if (result == 0) {
+ if (op.getKind() == SqlKind.LESS_THAN_OR_EQUAL) {
+ // value is exactly equal to the first bucket's start point so we take the ratio 1/bucket_width
+ long totalFilterRows = (long) (1 / (buckets[first + 1] - buckets[first]) * numRowsPerBucket);
+ double selectivity = (double) totalFilterRows / totalRows;
+ return selectivity;
+ } else {
+ // predicate is 'column < value' and value is equal to first bucket's start point, so none of
+ // the rows qualify
+ return SMALL_SELECTIVITY;
+ }
+ }
+
+ // *** End of boundary conditions ****
+
+ int n = getContainingBucket(value, numBuckets);
+ if (n >= 0) {
+ // all buckets to the left will be fully covered
+ int coveredBuckets = n;
+ long coveredRows = numRowsPerBucket * coveredBuckets;
+ // num matching rows in the current bucket is a function of (value - start_point_of_bucket)
+ long partialRows = (long) ((value - buckets[n]) / (buckets[n + 1] - buckets[n]) * numRowsPerBucket);
+ long totalFilterRows = partialRows + coveredRows;
+ double selectivity = (double)totalFilterRows / totalRows;
+ return selectivity;
+ } else {
+ // value does not exist in any of the buckets
+ return SMALL_SELECTIVITY;
+ }
+ }
+ }
+ }
+ }
+ return null;
+ }
+
+ private int getContainingBucket(final Double value, final int numBuckets) {
+ int i = 0;
+ int containing_bucket = -1;
+ // check which bucket this value falls in
+ for (; i <= numBuckets; i++) {
+ int result = buckets[i].compareTo(value);
+ if (result > 0) {
+ containing_bucket = i - 1;
+ break;
+ } else if (result == 0) {
+ containing_bucket = i;
+ break;
+ }
+ }
+ return containing_bucket;
+ }
+
+ private Double getLiteralValue(final RexNode filter) {
+ Double value = null;
+ List<RexNode> operands = ((RexCall) filter).getOperands();
+ if (operands.size() == 2 && operands.get(1) instanceof RexLiteral) {
+ RexLiteral l = ((RexLiteral) operands.get(1));
+
+ switch (l.getTypeName()) {
+ case DATE:
+ case TIMESTAMP:
+ case TIME:
+ value = (double) ((java.util.Calendar) l.getValue()).getTimeInMillis();
+ break;
+ case INTEGER:
+ case BIGINT:
+ case FLOAT:
+ case DOUBLE:
+ case DECIMAL:
+ case BOOLEAN:
+ value = l.getValueAs(Double.class);
+ break;
+ default:
+ break;
+ }
}
+ return value;
}
/**
- * Utility method to build a Numeric Equi-Depth Histogram from a t-digest byte array
+ * Build a Numeric Equi-Depth Histogram from a t-digest byte array
* @param tdigest_array
+ * @param numBuckets
+ * @param nonNullCount
* @return An instance of NumericEquiDepthHistogram
*/
- public static NumericEquiDepthHistogram buildFromTDigest(byte[] tdigest_array,
- int numBuckets,
- long nonNullCount) {
+ public static NumericEquiDepthHistogram buildFromTDigest(final byte[] tdigest_array,
+ final int numBuckets,
+ final long nonNullCount) {
TDigest tdigest = TDigest.fromBytes(java.nio.ByteBuffer.wrap(tdigest_array));
NumericEquiDepthHistogram histogram = new NumericEquiDepthHistogram(numBuckets);
- double q = 1.0/numBuckets;
+ final double q = 1.0/numBuckets;
int i = 0;
for (; i < numBuckets; i++) {
// get the starting point of the i-th quantile
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java
index aae8b1d..4a6646e 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java
@@ -19,6 +19,8 @@ package org.apache.drill.exec.planner.cost;
import java.util.ArrayList;
import java.util.List;
+import java.util.EnumSet;
+import java.util.Set;
import org.apache.calcite.plan.RelOptUtil;
import org.apache.calcite.plan.volcano.RelSubset;
import org.apache.calcite.rel.RelNode;
@@ -45,6 +47,7 @@ import org.apache.drill.exec.physical.base.GroupScan;
import org.apache.drill.exec.planner.common.DrillJoinRelBase;
import org.apache.drill.exec.planner.common.DrillRelOptUtil;
import org.apache.drill.exec.planner.common.DrillScanRelBase;
+import org.apache.drill.exec.planner.common.Histogram;
import org.apache.drill.exec.planner.logical.DrillScanRel;
import org.apache.drill.exec.planner.logical.DrillTable;
import org.apache.drill.exec.planner.physical.PlannerSettings;
@@ -64,6 +67,11 @@ public class DrillRelMdSelectivity extends RelMdSelectivity {
*/
private static final double LIKE_PREDICATE_SELECTIVITY = 0.05;
+ public static final Set<SqlKind> RANGE_PREDICATE =
+ EnumSet.of(
+ SqlKind.LESS_THAN, SqlKind.GREATER_THAN,
+ SqlKind.LESS_THAN_OR_EQUAL, SqlKind.GREATER_THAN_OR_EQUAL);
+
@Override
public Double getSelectivity(RelNode rel, RelMetadataQuery mq, RexNode predicate) {
if (rel instanceof RelSubset && !DrillRelOptUtil.guessRows(rel)) {
@@ -145,6 +153,8 @@ public class DrillRelMdSelectivity extends RelMdSelectivity {
orSel += RelMdUtil.guessSelectivity(orPred); //CALCITE guess
} else if (orPred.isA(SqlKind.EQUALS)) {
orSel += computeEqualsSelectivity(table, orPred, fieldNames);
+ } else if (orPred.isA(RANGE_PREDICATE)) {
+ orSel += computeRangeSelectivity(table, orPred, fieldNames);
} else if (orPred.isA(SqlKind.NOT_EQUALS)) {
orSel += 1.0 - computeEqualsSelectivity(table, orPred, fieldNames);
} else if (orPred.isA(SqlKind.LIKE)) {
@@ -167,7 +177,7 @@ public class DrillRelMdSelectivity extends RelMdSelectivity {
} else if (orPred.isA(SqlKind.IS_NOT_NULL)) {
orSel += computeIsNotNullSelectivity(table, orPred, fieldNames);
} else {
- //Use the CALCITE guess. TODO: Use histograms for COMPARISON operator
+ // Use the CALCITE guess.
orSel += guessSelectivity(orPred);
}
}
@@ -188,6 +198,22 @@ public class DrillRelMdSelectivity extends RelMdSelectivity {
return guessSelectivity(orPred);
}
+ // Use histogram if available for the range predicate selectivity
+ private double computeRangeSelectivity(DrillTable table, RexNode orPred, List<String> fieldNames) {
+ String col = getColumn(orPred, fieldNames);
+ if (col != null) {
+ if (table.getStatsTable() != null
+ && table.getStatsTable().getHistogram(col) != null) {
+ Histogram histogram = table.getStatsTable().getHistogram(col);
+ Double sel = ((Histogram) histogram).estimatedSelectivity(orPred);
+ if (sel != null) {
+ return sel;
+ }
+ }
+ }
+ return guessSelectivity(orPred);
+ }
+
private double computeIsNotNullSelectivity(DrillTable table, RexNode orPred, List<String> fieldNames) {
String col = getColumn(orPred, fieldNames);
if (col != null) {
diff --git a/exec/java-exec/src/test/java/org/apache/drill/PlanTestBase.java b/exec/java-exec/src/test/java/org/apache/drill/PlanTestBase.java
index da257c0..34126cf 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/PlanTestBase.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/PlanTestBase.java
@@ -157,7 +157,7 @@ public class PlanTestBase extends BaseTestQuery {
for (final String s : expectedPatterns) {
final Pattern p = Pattern.compile(s);
final Matcher m = p.matcher(plan);
- assertTrue(EXPECTED_NOT_FOUND + s, m.find());
+ assertTrue(EXPECTED_NOT_FOUND + s + "\n" + plan, m.find());
}
}
@@ -166,7 +166,7 @@ public class PlanTestBase extends BaseTestQuery {
for (final String s : excludedPatterns) {
final Pattern p = Pattern.compile(s);
final Matcher m = p.matcher(plan);
- assertFalse(UNEXPECTED_FOUND + s, m.find());
+ assertFalse(UNEXPECTED_FOUND + s + "\n" + plan, m.find());
}
}
}
diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/sql/TestAnalyze.java b/exec/java-exec/src/test/java/org/apache/drill/exec/sql/TestAnalyze.java
index 30d23b3..8270352 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/exec/sql/TestAnalyze.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/exec/sql/TestAnalyze.java
@@ -400,6 +400,21 @@ public class TestAnalyze extends BaseTestQuery {
.baselineValues("`hire_date_and_time`", 7)
.baselineValues("`salary`", 11)
.go();
+
+ // test the use of the just created histogram
+ test("alter session set `planner.statistics.use` = true");
+
+ // check boundary conditions: last bucket
+ String query = "select 1 from dfs.tmp.employee1 where store_id > 21";
+ String[] expectedPlan1 = {"Filter\\(condition.*\\).*rowcount = 112.*,.*",
+ "Scan.*columns=\\[`store_id`\\].*rowcount = 1128.0.*"};
+ PlanTestBase.testPlanWithAttributesMatchingPatterns(query, expectedPlan1, new String[]{});
+
+ query = "select 1 from dfs.tmp.employee1 where store_id < 15";
+ String[] expectedPlan2 = {"Filter\\(condition.*\\).*rowcount = 676.*,.*",
+ "Scan.*columns=\\[`store_id`\\].*rowcount = 1128.0.*"};
+ PlanTestBase.testPlanWithAttributesMatchingPatterns(query, expectedPlan2, new String[]{});
+
} finally {
test("ALTER SESSION SET `planner.slice_target` = " + ExecConstants.SLICE_TARGET_DEFAULT);
}