You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by gp...@apache.org on 2019/03/21 01:54:40 UTC
[drill] branch master updated: DRILL-7108: Improve selectivity
estimates for (NOT)LIKE, NOT_EQUALS, IS NOT NULL predicates
This is an automated email from the ASF dual-hosted git repository.
gparai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git
The following commit(s) were added to refs/heads/master by this push:
new 7c2661d DRILL-7108: Improve selectivity estimates for (NOT)LIKE, NOT_EQUALS, IS NOT NULL predicates
7c2661d is described below
commit 7c2661d5dba80d2bebb100174ecefd5a3b194310
Author: Gautam Parai <gp...@maprtech.com>
AuthorDate: Tue Mar 12 18:29:16 2019 -0700
DRILL-7108: Improve selectivity estimates for (NOT)LIKE, NOT_EQUALS, IS NOT NULL predicates
closes #1699
---
.../drill/exec/planner/common/DrillStatsTable.java | 30 ++++++-
.../exec/planner/cost/DrillRelMdSelectivity.java | 98 +++++++++++++++++-----
2 files changed, 106 insertions(+), 22 deletions(-)
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java
index 668f1d2..e934dfc 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java
@@ -66,8 +66,9 @@ public class DrillStatsTable {
private final Path tablePath;
private final String schemaName;
private final String tableName;
- private final Map<String, Long> ndv = Maps.newHashMap();
private double rowCount = -1;
+ private final Map<String, Long> ndv = Maps.newHashMap();
+ private final Map<String, Long> nnRowCount = Maps.newHashMap();
private boolean materialized = false;
private TableStatistics statistics = null;
@@ -135,6 +136,32 @@ public class DrillStatsTable {
}
/**
+ * Get non-null rowcount for the column If stats are not present for the given column, a null is returned.
+ *
+ * Note: returned data may not be accurate. Accuracy depends on whether the table data has changed
+ * after the stats are computed.
+ *
+ * @param col - column for which non-null rowcount is desired
+ * @return non-null rowcount of the column, if available. NULL otherwise.
+ */
+ public Double getNNRowCount(String col) {
+ // Stats might not have materialized because of errors.
+ if (!materialized) {
+ return null;
+ }
+ final String upperCol = col.toUpperCase();
+ Long nnRowCntCol = nnRowCount.get(upperCol);
+ if (nnRowCntCol == null) {
+ nnRowCntCol = nnRowCount.get(SchemaPath.getSimplePath(upperCol).toString());
+ }
+ // Cap it at row count (just in case)
+ if (nnRowCntCol != null) {
+ return Math.min(nnRowCntCol, rowCount);
+ }
+ return null;
+ }
+
+ /**
* Read the stats from storage and keep them in memory.
* @param table - Drill table for which we require stats
* @param context - Query context
@@ -154,6 +181,7 @@ public class DrillStatsTable {
for (DirectoryStatistics_v1 ds : ((Statistics_v1) statistics).getDirectoryStatistics()) {
for (ColumnStatistics_v1 cs : ds.getColumnStatistics()) {
ndv.put(cs.getName().toUpperCase(), cs.getNdv());
+ nnRowCount.put(cs.getName().toUpperCase(), (long)cs.getNonNullCount());
rowCount = Math.max(rowCount, cs.getCount());
}
}
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java
index e074be4..e87fae2 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java
@@ -56,6 +56,13 @@ public class DrillRelMdSelectivity extends RelMdSelectivity {
private static final DrillRelMdSelectivity INSTANCE = new DrillRelMdSelectivity();
static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(DrillRelMdSelectivity.class);
public static final RelMetadataProvider SOURCE = ReflectiveRelMetadataProvider.reflectiveSource(BuiltInMethod.SELECTIVITY.method, INSTANCE);
+ /*
+ * For now, we are treating all LIKE predicates to have the same selectivity irrespective of the number or position
+ * of wildcard characters (%). This is no different than the present Drill/Calcite behaviour w.r.t to LIKE predicates.
+ * The difference being Calcite keeps the selectivity 25% whereas we keep it at 5%
+ * TODO: Differentiate leading/trailing wildcard characters(%) or explore different estimation techniques e.g. LSH-based
+ */
+ private static final double LIKE_PREDICATE_SELECTIVITY = 0.05;
@Override
public Double getSelectivity(RelNode rel, RelMetadataQuery mq, RexNode predicate) {
@@ -137,34 +144,32 @@ public class DrillRelMdSelectivity extends RelMdSelectivity {
for (RexNode pred : RelOptUtil.conjunctions(predicate)) {
double orSel = 0;
for (RexNode orPred : RelOptUtil.disjunctions(pred)) {
- //CALCITE guess
- Double guess = RelMdUtil.guessSelectivity(pred);
if (orPred.isA(SqlKind.EQUALS)) {
+ orSel += computeEqualsSelectivity(table, orPred, fieldNames);
+ } else if (orPred.isA(SqlKind.NOT_EQUALS)) {
+ orSel += 1.0 - computeEqualsSelectivity(table, orPred, fieldNames);
+ } else if (orPred.isA(SqlKind.LIKE)) {
+ // LIKE selectivity is 5% more than a similar equality predicate, capped at CALCITE guess
+ orSel += Math.min(computeEqualsSelectivity(table, orPred, fieldNames) + LIKE_PREDICATE_SELECTIVITY,
+ guessSelectivity(orPred));
+ } else if (orPred.isA(SqlKind.NOT)) {
if (orPred instanceof RexCall) {
- int colIdx = -1;
- RexInputRef op = findRexInputRef(orPred);
- if (op != null) {
- colIdx = op.hashCode();
- }
- if (colIdx != -1 && colIdx < fieldNames.size()) {
- String col = fieldNames.get(colIdx);
- if (table.getStatsTable() != null
- && table.getStatsTable().getNdv(col) != null) {
- orSel += 1.00 / table.getStatsTable().getNdv(col);
- } else {
- orSel += guess;
- }
+ // LIKE selectivity is 5% more than a similar equality predicate, capped at CALCITE guess
+ RexNode childOp = ((RexCall) orPred).getOperands().get(0);
+ if (childOp.isA(SqlKind.LIKE)) {
+ orSel += 1.0 - Math.min(computeEqualsSelectivity(table, childOp, fieldNames) + LIKE_PREDICATE_SELECTIVITY,
+ guessSelectivity(childOp));
} else {
- orSel += guess;
- if (logger.isDebugEnabled()) {
- logger.warn(String.format("No input reference $[%s] found for predicate [%s]",
- Integer.toString(colIdx), orPred.toString()));
- }
+ orSel += 1.0 - guessSelectivity(orPred);
}
}
+ } else if (orPred.isA(SqlKind.IS_NULL)) {
+ orSel += 1.0 - computeIsNotNullSelectivity(table, orPred, fieldNames);
+ } else if (orPred.isA(SqlKind.IS_NOT_NULL)) {
+ orSel += computeIsNotNullSelectivity(table, orPred, fieldNames);
} else {
//Use the CALCITE guess. TODO: Use histograms for COMPARISON operator
- orSel += guess;
+ orSel += guessSelectivity(orPred);
}
}
sel *= orSel;
@@ -173,6 +178,57 @@ public class DrillRelMdSelectivity extends RelMdSelectivity {
return (sel > 1.0) ? 1.0 : sel;
}
+ private double computeEqualsSelectivity(DrillTable table, RexNode orPred, List<String> fieldNames) {
+ String col = getColumn(orPred, fieldNames);
+ if (col != null) {
+ if (table.getStatsTable() != null
+ && table.getStatsTable().getNdv(col) != null) {
+ return 1.00 / table.getStatsTable().getNdv(col);
+ }
+ }
+ return guessSelectivity(orPred);
+ }
+
+ private double computeIsNotNullSelectivity(DrillTable table, RexNode orPred, List<String> fieldNames) {
+ String col = getColumn(orPred, fieldNames);
+ if (col != null) {
+ if (table.getStatsTable() != null
+ && table.getStatsTable().getNNRowCount(col) != null) {
+ // Cap selectivity below Calcite Guess
+ return Math.min((table.getStatsTable().getNNRowCount(col) / table.getStatsTable().getRowCount()),
+ RelMdUtil.guessSelectivity(orPred));
+ }
+ }
+ return guessSelectivity(orPred);
+ }
+
+ private String getColumn(RexNode orPred, List<String> fieldNames) {
+ if (orPred instanceof RexCall) {
+ int colIdx = -1;
+ RexInputRef op = findRexInputRef(orPred);
+ if (op != null) {
+ colIdx = op.hashCode();
+ }
+ if (colIdx != -1 && colIdx < fieldNames.size()) {
+ return fieldNames.get(colIdx);
+ } else {
+ if (logger.isDebugEnabled()) {
+ logger.warn(String.format("No input reference $[%s] found for predicate [%s]",
+ Integer.toString(colIdx), orPred.toString()));
+ }
+ }
+ }
+ return null;
+ }
+
+ private double guessSelectivity(RexNode orPred) {
+ if (logger.isDebugEnabled()) {
+ logger.warn(String.format("Using guess for predicate [%s]", orPred.toString()));
+ }
+ //CALCITE guess
+ return RelMdUtil.guessSelectivity(orPred);
+ }
+
private Double getJoinSelectivity(DrillJoinRelBase rel, RelMetadataQuery mq, RexNode predicate) {
double sel = 1.0;
// determine which filters apply to the left vs right