You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by gp...@apache.org on 2019/03/21 01:54:40 UTC

[drill] branch master updated: DRILL-7108: Improve selectivity estimates for (NOT)LIKE, NOT_EQUALS, IS NOT NULL predicates

This is an automated email from the ASF dual-hosted git repository.

gparai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git


The following commit(s) were added to refs/heads/master by this push:
     new 7c2661d  DRILL-7108: Improve selectivity estimates for (NOT)LIKE, NOT_EQUALS, IS NOT NULL predicates
7c2661d is described below

commit 7c2661d5dba80d2bebb100174ecefd5a3b194310
Author: Gautam Parai <gp...@maprtech.com>
AuthorDate: Tue Mar 12 18:29:16 2019 -0700

    DRILL-7108: Improve selectivity estimates for (NOT)LIKE, NOT_EQUALS, IS NOT NULL predicates
    
    closes #1699
---
 .../drill/exec/planner/common/DrillStatsTable.java | 30 ++++++-
 .../exec/planner/cost/DrillRelMdSelectivity.java   | 98 +++++++++++++++++-----
 2 files changed, 106 insertions(+), 22 deletions(-)

diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java
index 668f1d2..e934dfc 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/DrillStatsTable.java
@@ -66,8 +66,9 @@ public class DrillStatsTable {
   private final Path tablePath;
   private final String schemaName;
   private final String tableName;
-  private final Map<String, Long> ndv = Maps.newHashMap();
   private double rowCount = -1;
+  private final Map<String, Long> ndv = Maps.newHashMap();
+  private final Map<String, Long> nnRowCount = Maps.newHashMap();
   private boolean materialized = false;
   private TableStatistics statistics = null;
 
@@ -135,6 +136,32 @@ public class DrillStatsTable {
   }
 
   /**
+   * Get non-null rowcount for the column If stats are not present for the given column, a null is returned.
+   *
+   * Note: returned data may not be accurate. Accuracy depends on whether the table data has changed
+   * after the stats are computed.
+   *
+   * @param col - column for which non-null rowcount is desired
+   * @return non-null rowcount of the column, if available. NULL otherwise.
+   */
+  public Double getNNRowCount(String col) {
+    // Stats might not have materialized because of errors.
+    if (!materialized) {
+      return null;
+    }
+    final String upperCol = col.toUpperCase();
+    Long nnRowCntCol = nnRowCount.get(upperCol);
+    if (nnRowCntCol == null) {
+      nnRowCntCol = nnRowCount.get(SchemaPath.getSimplePath(upperCol).toString());
+    }
+    // Cap it at row count (just in case)
+    if (nnRowCntCol != null) {
+      return Math.min(nnRowCntCol, rowCount);
+    }
+    return null;
+  }
+
+  /**
    * Read the stats from storage and keep them in memory.
    * @param table - Drill table for which we require stats
    * @param context - Query context
@@ -154,6 +181,7 @@ public class DrillStatsTable {
         for (DirectoryStatistics_v1 ds : ((Statistics_v1) statistics).getDirectoryStatistics()) {
           for (ColumnStatistics_v1 cs : ds.getColumnStatistics()) {
             ndv.put(cs.getName().toUpperCase(), cs.getNdv());
+            nnRowCount.put(cs.getName().toUpperCase(), (long)cs.getNonNullCount());
             rowCount = Math.max(rowCount, cs.getCount());
           }
         }
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java
index e074be4..e87fae2 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/cost/DrillRelMdSelectivity.java
@@ -56,6 +56,13 @@ public class DrillRelMdSelectivity extends RelMdSelectivity {
   private static final DrillRelMdSelectivity INSTANCE = new DrillRelMdSelectivity();
   static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(DrillRelMdSelectivity.class);
   public static final RelMetadataProvider SOURCE = ReflectiveRelMetadataProvider.reflectiveSource(BuiltInMethod.SELECTIVITY.method, INSTANCE);
+  /*
+   * For now, we are treating all LIKE predicates to have the same selectivity irrespective of the number or position
+   * of wildcard characters (%). This is no different than the present Drill/Calcite behaviour w.r.t to LIKE predicates.
+   * The difference being Calcite keeps the selectivity 25% whereas we keep it at 5%
+   * TODO: Differentiate leading/trailing wildcard characters(%) or explore different estimation techniques e.g. LSH-based
+   */
+  private static final double LIKE_PREDICATE_SELECTIVITY = 0.05;
 
   @Override
   public Double getSelectivity(RelNode rel, RelMetadataQuery mq, RexNode predicate) {
@@ -137,34 +144,32 @@ public class DrillRelMdSelectivity extends RelMdSelectivity {
     for (RexNode pred : RelOptUtil.conjunctions(predicate)) {
       double orSel = 0;
       for (RexNode orPred : RelOptUtil.disjunctions(pred)) {
-        //CALCITE guess
-        Double guess = RelMdUtil.guessSelectivity(pred);
         if (orPred.isA(SqlKind.EQUALS)) {
+          orSel += computeEqualsSelectivity(table, orPred, fieldNames);
+        } else if (orPred.isA(SqlKind.NOT_EQUALS)) {
+          orSel += 1.0 - computeEqualsSelectivity(table, orPred, fieldNames);
+        } else if (orPred.isA(SqlKind.LIKE)) {
+          // LIKE selectivity is 5% more than a similar equality predicate, capped at CALCITE guess
+          orSel +=  Math.min(computeEqualsSelectivity(table, orPred, fieldNames) + LIKE_PREDICATE_SELECTIVITY,
+              guessSelectivity(orPred));
+        } else if (orPred.isA(SqlKind.NOT)) {
           if (orPred instanceof RexCall) {
-            int colIdx = -1;
-            RexInputRef op = findRexInputRef(orPred);
-            if (op != null) {
-              colIdx = op.hashCode();
-            }
-            if (colIdx != -1 && colIdx < fieldNames.size()) {
-              String col = fieldNames.get(colIdx);
-              if (table.getStatsTable() != null
-                      && table.getStatsTable().getNdv(col) != null) {
-                orSel += 1.00 / table.getStatsTable().getNdv(col);
-              } else {
-                orSel += guess;
-              }
+            // LIKE selectivity is 5% more than a similar equality predicate, capped at CALCITE guess
+            RexNode childOp = ((RexCall) orPred).getOperands().get(0);
+            if (childOp.isA(SqlKind.LIKE)) {
+              orSel += 1.0 - Math.min(computeEqualsSelectivity(table, childOp, fieldNames) + LIKE_PREDICATE_SELECTIVITY,
+                      guessSelectivity(childOp));
             } else {
-              orSel += guess;
-              if (logger.isDebugEnabled()) {
-                logger.warn(String.format("No input reference $[%s] found for predicate [%s]",
-                    Integer.toString(colIdx), orPred.toString()));
-              }
+              orSel += 1.0 - guessSelectivity(orPred);
             }
           }
+        } else if (orPred.isA(SqlKind.IS_NULL)) {
+          orSel += 1.0 - computeIsNotNullSelectivity(table, orPred, fieldNames);
+        } else if (orPred.isA(SqlKind.IS_NOT_NULL)) {
+          orSel += computeIsNotNullSelectivity(table, orPred, fieldNames);
         } else {
           //Use the CALCITE guess. TODO: Use histograms for COMPARISON operator
-          orSel += guess;
+          orSel += guessSelectivity(orPred);
         }
       }
       sel *= orSel;
@@ -173,6 +178,57 @@ public class DrillRelMdSelectivity extends RelMdSelectivity {
     return (sel > 1.0) ? 1.0 : sel;
   }
 
+  private double computeEqualsSelectivity(DrillTable table, RexNode orPred, List<String> fieldNames) {
+    String col = getColumn(orPred, fieldNames);
+    if (col != null) {
+      if (table.getStatsTable() != null
+              && table.getStatsTable().getNdv(col) != null) {
+        return 1.00 / table.getStatsTable().getNdv(col);
+      }
+    }
+    return guessSelectivity(orPred);
+  }
+
+  private double computeIsNotNullSelectivity(DrillTable table, RexNode orPred, List<String> fieldNames) {
+    String col = getColumn(orPred, fieldNames);
+    if (col != null) {
+      if (table.getStatsTable() != null
+          && table.getStatsTable().getNNRowCount(col) != null) {
+        // Cap selectivity below Calcite Guess
+        return Math.min((table.getStatsTable().getNNRowCount(col) / table.getStatsTable().getRowCount()),
+            RelMdUtil.guessSelectivity(orPred));
+      }
+    }
+    return guessSelectivity(orPred);
+  }
+
+  private String getColumn(RexNode orPred, List<String> fieldNames) {
+    if (orPred instanceof RexCall) {
+      int colIdx = -1;
+      RexInputRef op = findRexInputRef(orPred);
+      if (op != null) {
+        colIdx = op.hashCode();
+      }
+      if (colIdx != -1 && colIdx < fieldNames.size()) {
+        return fieldNames.get(colIdx);
+      } else {
+        if (logger.isDebugEnabled()) {
+          logger.warn(String.format("No input reference $[%s] found for predicate [%s]",
+                  Integer.toString(colIdx), orPred.toString()));
+        }
+      }
+    }
+    return null;
+  }
+
+  private double guessSelectivity(RexNode orPred) {
+    if (logger.isDebugEnabled()) {
+      logger.warn(String.format("Using guess for predicate [%s]", orPred.toString()));
+    }
+    //CALCITE guess
+    return RelMdUtil.guessSelectivity(orPred);
+  }
+
   private Double getJoinSelectivity(DrillJoinRelBase rel, RelMetadataQuery mq, RexNode predicate) {
     double sel = 1.0;
     // determine which filters apply to the left vs right