You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@spark.apache.org by gu...@apache.org on 2019/05/23 18:20:19 UTC

[spark] branch branch-2.4 updated (fa7c319 -> e69ad46)

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git.


    from fa7c319  [SPARK-27800][SQL][HOTFIX][FOLLOWUP] Fix wrong answer on BitwiseXor test cases
     new e0e8a6d  Revert "[SPARK-27539][SQL] Fix inaccurate aggregate outputRows estimation with column containing null values"
     new e69ad46  Revert "[SPARK-27351][SQL] Wrong outputRows estimation after AggregateEstimation wit…"

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../statsEstimation/AggregateEstimation.scala      | 12 ++---------
 .../statsEstimation/AggregateEstimationSuite.scala | 23 +---------------------
 2 files changed, 3 insertions(+), 32 deletions(-)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org

[spark] 02/02: Revert "[SPARK-27351][SQL] Wrong outputRows estimation after AggregateEstimation wit…"

Posted by gu...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git

commit e69ad46c72ed26c8293da95dc19b6f31445c0df5
Author: HyukjinKwon <gu...@apache.org>
AuthorDate: Fri May 24 03:19:48 2019 +0900

    Revert "[SPARK-27351][SQL] Wrong outputRows estimation after AggregateEstimation wit…"
    
    This reverts commit 40668c53ed799881db1f316ceaf2f978b294d8ed.
---
 .../plans/logical/statsEstimation/AggregateEstimation.scala  | 12 ++----------
 .../catalyst/statsEstimation/AggregateEstimationSuite.scala  | 12 +-----------
 2 files changed, 3 insertions(+), 21 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala
index 7ef22fa..111c594 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala
@@ -39,16 +39,8 @@ object AggregateEstimation {
       // Multiply distinct counts of group-by columns. This is an upper bound, which assumes
       // the data contains all combinations of distinct values of group-by columns.
       var outputRows: BigInt = agg.groupingExpressions.foldLeft(BigInt(1))(
-        (res, expr) => {
-          val columnStat = childStats.attributeStats(expr.asInstanceOf[Attribute])
-          val distinctCount = columnStat.distinctCount.get
-          val distinctValue: BigInt = if (distinctCount == 0 && columnStat.nullCount.get > 0) {
-            1
-          } else {
-            distinctCount
-          }
-          res * distinctValue
-        })
+        (res, expr) => res *
+          childStats.attributeStats(expr.asInstanceOf[Attribute]).distinctCount.get)
 
       outputRows = if (agg.groupingExpressions.isEmpty) {
         // If there's no group-by columns, the output is a single row containing values of aggregate
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala
index 6bdf8cd..8213d56 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala
@@ -38,9 +38,7 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest {
     attr("key22") -> ColumnStat(distinctCount = Some(2), min = Some(10), max = Some(20),
       nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)),
     attr("key31") -> ColumnStat(distinctCount = Some(0), min = None, max = None,
-      nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)),
-    attr("key32") -> ColumnStat(distinctCount = Some(0), min = None, max = None,
-      nullCount = Some(4), avgLen = Some(4), maxLen = Some(4))
+      nullCount = Some(0), avgLen = Some(4), maxLen = Some(4))
   ))
 
   private val nameToAttr: Map[String, Attribute] = columnInfo.map(kv => kv._1.name -> kv._1)
@@ -94,14 +92,6 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest {
       expectedOutputRowCount = 0)
   }
 
-  test("group-by column with only null value") {
-    checkAggStats(
-      tableColumns = Seq("key22", "key32"),
-      tableRowCount = 6,
-      groupByColumns = Seq("key22", "key32"),
-      expectedOutputRowCount = nameToColInfo("key22")._2.distinctCount.get)
-  }
-
   test("non-cbo estimation") {
     val attributes = Seq("key12").map(nameToAttr)
     val child = StatsTestPlan(


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org

[spark] 01/02: Revert "[SPARK-27539][SQL] Fix inaccurate aggregate outputRows estimation with column containing null values"

Posted by gu...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git

commit e0e8a6de1345e6e716bb8c6e35a98e981feb3bab
Author: HyukjinKwon <gu...@apache.org>
AuthorDate: Fri May 24 03:19:40 2019 +0900

    Revert "[SPARK-27539][SQL] Fix inaccurate aggregate outputRows estimation with column containing null values"
    
    This reverts commit 42cb4a2ccdb5ca6216677dc4285c3e74cfb7e707.
---
 .../plans/logical/statsEstimation/AggregateEstimation.scala |  4 ++--
 .../catalyst/statsEstimation/AggregateEstimationSuite.scala | 13 +------------
 2 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala
index b9e72c2..7ef22fa 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala
@@ -42,8 +42,8 @@ object AggregateEstimation {
         (res, expr) => {
           val columnStat = childStats.attributeStats(expr.asInstanceOf[Attribute])
           val distinctCount = columnStat.distinctCount.get
-          val distinctValue: BigInt = if (columnStat.nullCount.get > 0) {
-            distinctCount + 1
+          val distinctValue: BigInt = if (distinctCount == 0 && columnStat.nullCount.get > 0) {
+            1
           } else {
             distinctCount
           }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala
index d89b9df..6bdf8cd 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala
@@ -40,9 +40,7 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest {
     attr("key31") -> ColumnStat(distinctCount = Some(0), min = None, max = None,
       nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)),
     attr("key32") -> ColumnStat(distinctCount = Some(0), min = None, max = None,
-      nullCount = Some(4), avgLen = Some(4), maxLen = Some(4)),
-    attr("key33") -> ColumnStat(distinctCount = Some(2), min = None, max = None,
-      nullCount = Some(2), avgLen = Some(4), maxLen = Some(4))
+      nullCount = Some(4), avgLen = Some(4), maxLen = Some(4))
   ))
 
   private val nameToAttr: Map[String, Attribute] = columnInfo.map(kv => kv._1.name -> kv._1)
@@ -104,15 +102,6 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest {
       expectedOutputRowCount = nameToColInfo("key22")._2.distinctCount.get)
   }
 
-  test("group-by column with null value") {
-    checkAggStats(
-      tableColumns = Seq("key21", "key33"),
-      tableRowCount = 6,
-      groupByColumns = Seq("key21", "key33"),
-      expectedOutputRowCount = nameToColInfo("key21")._2.distinctCount.get *
-        (nameToColInfo("key33")._2.distinctCount.get + 1))
-  }
-
   test("non-cbo estimation") {
     val attributes = Seq("key12").map(nameToAttr)
     val child = StatsTestPlan(


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org