You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by kw...@apache.org on 2017/02/03 23:57:40 UTC

[3/7] incubator-impala git commit: IMPALA-4792: Fix number of distinct values for a CASE with constant outputs

IMPALA-4792: Fix number of distinct values for a CASE with constant outputs

If all the return values of a Case expression have a known number of
distinct values (i.e. they are constant or statistics exist), then
the number of distinct values for the Case can be computed using this
information.

In order for the value from Case to be used at higher levels in the tree,
the implementation of computeNumDistinctValues for Expr needed to change.
Previously, Expr calculated the number of distinct values by finding any
SlotRefs in its tree and taking the maximum of the distinct values from
those SlotRefs. This would ignore the value from CaseExpr. To fix this,
Expr now takes the maximum number of distinct values across all of its
children.

-- explaining this statement shows cardinality = 2
explain select distinct case when id = 1 then 'yes' else 'no' end
from functional.alltypes;

-- explaining this statement shows cardinality = 2
explain select distinct char_length(case when id = 1 then 'yes' else 'no' end)
from functional.alltypes;

-- explaining this statement shows cardinality = 7300
explain select distinct case when id = 1 then 0 else id end
from functional.alltypes;

-- explaining this statement shows cardinality = 737 (date_string_col has lower
-- cardinality than id)
explain select distinct case when id = 1 then 'yes' else date_string_col end
from functional.alltypes;

For cases when the number of distinct values is not known for all the outputs,
this will return -1, indicating that the number of distinct values is not
known. The inputs (whens) are not used for calculating the number of distinct
values.

Change-Id: I21dbdaad8452b7e58c477612b47847dccd9d98d2
Reviewed-on: http://gerrit.cloudera.org:8080/5768
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/59cdf6b8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/59cdf6b8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/59cdf6b8

Branch: refs/heads/master
Commit: 59cdf6b8f2a6180b727bcb9ee336a65381377ace
Parents: 5b8b618
Author: Joe McDonnell <jo...@cloudera.com>
Authored: Mon Jan 23 11:11:25 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Fri Feb 3 00:30:35 2017 +0000

----------------------------------------------------------------------
 .../org/apache/impala/analysis/CaseExpr.java    |  62 ++++++++++
 .../java/org/apache/impala/analysis/Expr.java   |  18 +--
 .../org/apache/impala/analysis/ExprNdvTest.java | 113 +++++++++++++++++++
 3 files changed, 185 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/59cdf6b8/fe/src/main/java/org/apache/impala/analysis/CaseExpr.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/CaseExpr.java b/fe/src/main/java/org/apache/impala/analysis/CaseExpr.java
index dae8bcb..9d6ea69 100644
--- a/fe/src/main/java/org/apache/impala/analysis/CaseExpr.java
+++ b/fe/src/main/java/org/apache/impala/analysis/CaseExpr.java
@@ -17,6 +17,7 @@
 
 package org.apache.impala.analysis;
 
+import java.util.HashSet;
 import java.util.List;
 
 import org.apache.impala.catalog.Db;
@@ -31,7 +32,9 @@ import org.apache.impala.thrift.TExprNode;
 import org.apache.impala.thrift.TExprNodeType;
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
+import com.google.common.base.Predicates;
 import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
 
 /**
  * CASE and DECODE are represented using this class. The backend implementation is
@@ -373,6 +376,65 @@ public class CaseExpr extends Expr {
     }
   }
 
+  @Override
+  protected void computeNumDistinctValues() {
+    // Skip the first child if case expression
+    int loopStart = (hasCaseExpr_ ? 1 : 0);
+
+    // If all the outputs have a known number of distinct values (i.e. not -1), then
+    // sum the number of distinct constants with the maximum NDV for the non-constants.
+    //
+    // Otherwise, the number of distinct values is undetermined. The input cardinality
+    // (i.e. the when's) are not used.
+    boolean allOutputsKnown = true;
+    int numOutputConstants = 0;
+    long maxOutputNonConstNdv = -1;
+    HashSet<LiteralExpr> constLiteralSet = Sets.newHashSetWithExpectedSize(children_.size());
+
+    for (int i = loopStart; i < children_.size(); ++i) {
+      // The children follow this ordering:
+      // [optional first child] when1 then1 when2 then2 ... else
+      // After skipping optional first child, even indices are when expressions, except
+      // for the last child, which can be an else expression
+      if ((i - loopStart) % 2 == 0 && !(i == children_.size() - 1 && hasElseExpr_)) {
+        // This is a when expression
+        continue;
+      }
+
+      // This is an output expression (either then or else)
+      Expr outputExpr = children_.get(i);
+
+      if (outputExpr.isConstant()) {
+        if (outputExpr.isLiteral()) {
+          LiteralExpr outputLiteral = (LiteralExpr) outputExpr;
+          if (constLiteralSet.add(outputLiteral)) ++numOutputConstants;
+        } else {
+          ++numOutputConstants;
+        }
+      } else {
+        long outputNdv = outputExpr.getNumDistinctValues();
+        if (outputNdv == -1) allOutputsKnown = false;
+        maxOutputNonConstNdv = Math.max(maxOutputNonConstNdv, outputNdv);
+      }
+    }
+
+    // Else unspecified => NULL constant, which is not caught above
+    if (!hasElseExpr_) ++numOutputConstants;
+
+    if (allOutputsKnown) {
+      if (maxOutputNonConstNdv == -1) {
+        // All must be constant, because if we hit any SlotRef, this would be set
+        numDistinctValues_ = numOutputConstants;
+      } else {
+        numDistinctValues_ = numOutputConstants + maxOutputNonConstNdv;
+      }
+    } else {
+      // There is no correct answer when statistics are missing. Neither the
+      // known outputs nor the inputs provide information
+      numDistinctValues_ = -1;
+    }
+  }
+
   private boolean isCase() { return !isDecode(); }
   private boolean isDecode() { return decodeExpr_ != null; }
   public boolean hasCaseExpr() { return hasCaseExpr_; }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/59cdf6b8/fe/src/main/java/org/apache/impala/analysis/Expr.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/Expr.java b/fe/src/main/java/org/apache/impala/analysis/Expr.java
index 69e7790..d9c46bf 100644
--- a/fe/src/main/java/org/apache/impala/analysis/Expr.java
+++ b/fe/src/main/java/org/apache/impala/analysis/Expr.java
@@ -316,15 +316,17 @@ abstract public class Expr extends TreeNode<Expr> implements ParseNode, Cloneabl
     if (isConstant()) {
       numDistinctValues_ = 1;
     } else {
-      // if this Expr contains slotrefs, we estimate the # of distinct values
-      // to be the maximum such number for any of the slotrefs;
-      // the subclass analyze() function may well want to override this, if it
-      // knows better
-      List<SlotRef> slotRefs = Lists.newArrayList();
-      this.collect(Predicates.instanceOf(SlotRef.class), slotRefs);
       numDistinctValues_ = -1;
-      for (SlotRef slotRef: slotRefs) {
-        numDistinctValues_ = Math.max(numDistinctValues_, slotRef.numDistinctValues_);
+
+      // get the max number of distinct values over all children of this node
+      for (Expr child: children_) {
+        // A constant should not override a -1 from a SlotRef, so we only consider
+        // non-constant expressions. This is functionally similar to considering
+        // only the SlotRefs, except that it allows an Expr to override the values
+        // that come out of its children.
+        if (!child.isConstant()) {
+          numDistinctValues_ = Math.max(numDistinctValues_, child.getNumDistinctValues());
+        }
       }
     }
   }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/59cdf6b8/fe/src/test/java/org/apache/impala/analysis/ExprNdvTest.java
----------------------------------------------------------------------
diff --git a/fe/src/test/java/org/apache/impala/analysis/ExprNdvTest.java b/fe/src/test/java/org/apache/impala/analysis/ExprNdvTest.java
new file mode 100644
index 0000000..e49347e
--- /dev/null
+++ b/fe/src/test/java/org/apache/impala/analysis/ExprNdvTest.java
@@ -0,0 +1,113 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.impala.analysis;
+
+import org.apache.impala.catalog.Catalog;
+import org.apache.impala.common.AnalysisException;
+import org.apache.impala.common.FrontendTestBase;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Tests computeNumDistinctValues() estimates for Exprs
+ */
+public class ExprNdvTest extends FrontendTestBase {
+
+  public void verifyNdv(String expr, long expectedNdv)
+      throws AnalysisException {
+    String stmtStr = "select " + expr + " from functional.alltypes";
+    verifyNdvStmt(stmtStr, expectedNdv);
+  }
+
+  /**
+   * This test queries two tables to allow testing missing statistics.
+   * functional.alltypes (a) has statistics
+   * functional.tinytable (tiny) does not
+   */
+  public void verifyNdvTwoTable(String expr, long expectedNdv)
+      throws AnalysisException {
+    String stmtStr = "select " + expr + " from functional.alltypes a, " +
+                     "functional.tinytable tiny";
+    verifyNdvStmt(stmtStr, expectedNdv);
+  }
+
+  public void verifyNdvStmt(String stmtStr, long expectedNdv)
+      throws AnalysisException {
+    SelectStmt stmt = (SelectStmt) ParsesOk(stmtStr);
+    Analyzer analyzer = createAnalyzer(Catalog.DEFAULT_DB);
+    stmt.analyze(analyzer);
+    Expr analyzedExpr = stmt.getSelectList().getItems().get(0).getExpr();
+    long calculatedNdv = analyzedExpr.getNumDistinctValues();
+    assertEquals(expectedNdv, calculatedNdv);
+  }
+
+  /**
+   * Helper for prettier error messages than what JUnit.Assert provides.
+   */
+  private void assertEquals(long expected, long actual) {
+    if (actual != expected) {
+      Assert.fail(String.format("\nActual: %d\nExpected: %d\n", actual, expected));
+    }
+  }
+
+  @Test
+  public void TestCaseExprBasic() throws AnalysisException {
+    // All constants tests
+    verifyNdv("case when id = 1 then 'yes' else 'no' end", 2);
+    verifyNdv("case when id = 1 then 'yes' " +
+              "when id = 2 then 'maybe' else 'no' end", 3);
+    verifyNdv("decode(id, 1, 'yes', 'no')", 2);
+    // Duplicate constants are counted once
+    verifyNdv("case when id = 1 then 'yes' " +
+              "when id = 2 then 'yes' else 'yes' end", 1);
+    // When else not specified, it is NULL, verify it is counted
+    verifyNdv("case when id = 1 then 'yes' end", 2);
+
+    // Basic cases where the output includes a SlotRef
+    // Expect number of constants + max over output SlotRefs
+    verifyNdv("case when id = 1 then 0 else id end", 7301);
+    verifyNdv("case when id = 1 then 0 else int_col end", 11);
+    verifyNdv("case when id = 1 then 'foo' else date_string_col end", 737);
+
+    // Verify max
+    verifyNdv("case when id = 1 then int_col else id end", 7300);
+    verifyNdv("case when id = 1 then date_string_col " +
+              "when id = 2 then date_string_col " +
+              "else date_string_col end", 736);
+  }
+
+  @Test
+  public void TestCaseExprMissingStats() throws AnalysisException {
+
+    // Consts still work
+    verifyNdvTwoTable("case when a.id = 1 then 'yes' " +
+                      "when tiny.a = 'whatever' then 'maybe' " +
+                      "else 'no' end", 3);
+
+    // Input has stats, output does not
+    verifyNdvTwoTable("case when a.id = 1 then tiny.a else tiny.b end", -1);
+
+    // Input has stats, some outputs do not (tiny)
+    verifyNdvTwoTable("case when a.id = 1 then tiny.a " +
+                      "else date_string_col end", -1);
+
+    // Outputs has stats, input does not
+    verifyNdvTwoTable("case when tiny.a = 'whatever' then a.id " +
+                      "else 0 end", 7301);
+  }
+}