You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2019/05/23 18:17:56 UTC
[spark] 01/02: Revert "[SPARK-27539][SQL] Fix inaccurate aggregate outputRows estimation with column containing null values"

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

commit 855399bbad7706bfd75cc640e3d289392dfd648a
Author: HyukjinKwon <gu...@apache.org>
AuthorDate: Fri May 24 03:16:24 2019 +0900

    Revert "[SPARK-27539][SQL] Fix inaccurate aggregate outputRows estimation with column containing null values"
    
    This reverts commit 42cb4a2ccdb5ca6216677dc4285c3e74cfb7e707.
---
 .../plans/logical/statsEstimation/AggregateEstimation.scala |  4 ++--
 .../catalyst/statsEstimation/AggregateEstimationSuite.scala | 13 +------------
 2 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala
index ffe071e..1198d3f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala
@@ -42,8 +42,8 @@ object AggregateEstimation {
         (res, expr) => {
           val columnStat = childStats.attributeStats(expr.asInstanceOf[Attribute])
           val distinctCount = columnStat.distinctCount.get
-          val distinctValue: BigInt = if (columnStat.nullCount.get > 0) {
-            distinctCount + 1
+          val distinctValue: BigInt = if (distinctCount == 0 && columnStat.nullCount.get > 0) {
+            1
           } else {
             distinctCount
           }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala
index 32bf20b..c247050 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala
@@ -40,9 +40,7 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest {
     attr("key31") -> ColumnStat(distinctCount = Some(0), min = None, max = None,
       nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)),
     attr("key32") -> ColumnStat(distinctCount = Some(0), min = None, max = None,
-      nullCount = Some(4), avgLen = Some(4), maxLen = Some(4)),
-    attr("key33") -> ColumnStat(distinctCount = Some(2), min = None, max = None,
-      nullCount = Some(2), avgLen = Some(4), maxLen = Some(4))
+      nullCount = Some(4), avgLen = Some(4), maxLen = Some(4))
   ))
 
   private val nameToAttr: Map[String, Attribute] = columnInfo.map(kv => kv._1.name -> kv._1)
@@ -128,15 +126,6 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest {
       expectedOutputRowCount = nameToColInfo("key22")._2.distinctCount.get)
   }
 
-  test("group-by column with null value") {
-    checkAggStats(
-      tableColumns = Seq("key21", "key33"),
-      tableRowCount = 6,
-      groupByColumns = Seq("key21", "key33"),
-      expectedOutputRowCount = nameToColInfo("key21")._2.distinctCount.get *
-        (nameToColInfo("key33")._2.distinctCount.get + 1))
-  }
-
   test("non-cbo estimation") {
     val attributes = Seq("key12").map(nameToAttr)
     val child = StatsTestPlan(


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org