You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2017/11/27 17:13:50 UTC

spark git commit: [SPARK-22529][SQL] Relation stats should be consistent with other plans based on cbo config

Repository: spark
Updated Branches:
  refs/heads/master 2dbe275b2 -> 1ff4a77be


[SPARK-22529][SQL] Relation stats should be consistent with other plans based on cbo config

## What changes were proposed in this pull request?

Currently, relation stats is the same whether cbo is enabled or not. While relation (`LogicalRelation` or `HiveTableRelation`) is a `LogicalPlan`, its behavior is inconsistent with other plans. This can cause confusion when user runs EXPLAIN COST commands. Besides, when CBO is disabled, we apply the size-only estimation strategy, so there's no need to propagate other catalog statistics to relation.

## How was this patch tested?

Enhanced existing tests case and added a test case.

Author: Zhenhua Wang <wa...@huawei.com>

Closes #19757 from wzhfy/catalog_stats_conversion.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1ff4a77b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1ff4a77b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1ff4a77b

Branch: refs/heads/master
Commit: 1ff4a77be498615ee7216fd9cc2d510ecbd43b27
Parents: 2dbe275
Author: Zhenhua Wang <wa...@huawei.com>
Authored: Tue Nov 28 01:13:44 2017 +0800
Committer: Wenchen Fan <we...@databricks.com>
Committed: Tue Nov 28 01:13:44 2017 +0800

----------------------------------------------------------------------
 .../spark/sql/catalyst/catalog/interface.scala  | 16 ++++++----
 .../execution/datasources/LogicalRelation.scala |  2 +-
 .../sql/StatisticsCollectionTestBase.scala      | 22 +++++++++-----
 .../sql/hive/execution/HiveExplainSuite.scala   | 31 ++++++++++++++------
 4 files changed, 49 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/1ff4a77b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index b87bbb4..b10ce05 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -366,10 +366,16 @@ case class CatalogStatistics(
    * Convert [[CatalogStatistics]] to [[Statistics]], and match column stats to attributes based
    * on column names.
    */
-  def toPlanStats(planOutput: Seq[Attribute]): Statistics = {
-    val matched = planOutput.flatMap(a => colStats.get(a.name).map(a -> _))
-    Statistics(sizeInBytes = sizeInBytes, rowCount = rowCount,
-      attributeStats = AttributeMap(matched))
+  def toPlanStats(planOutput: Seq[Attribute], cboEnabled: Boolean): Statistics = {
+    if (cboEnabled) {
+      val attrStats = planOutput.flatMap(a => colStats.get(a.name).map(a -> _))
+      Statistics(sizeInBytes = sizeInBytes, rowCount = rowCount,
+        attributeStats = AttributeMap(attrStats))
+    } else {
+      // When CBO is disabled, we apply the size-only estimation strategy, so there's no need to
+      // propagate other statistics from catalog to the plan.
+      Statistics(sizeInBytes = sizeInBytes)
+    }
   }
 
   /** Readable string representation for the CatalogStatistics. */
@@ -452,7 +458,7 @@ case class HiveTableRelation(
   )
 
   override def computeStats(): Statistics = {
-    tableMeta.stats.map(_.toPlanStats(output)).getOrElse {
+    tableMeta.stats.map(_.toPlanStats(output, conf.cboEnabled)).getOrElse {
       throw new IllegalStateException("table stats must be specified.")
     }
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/1ff4a77b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
index 2369957..8d715f6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
@@ -41,7 +41,7 @@ case class LogicalRelation(
 
   override def computeStats(): Statistics = {
     catalogTable
-      .flatMap(_.stats.map(_.toPlanStats(output)))
+      .flatMap(_.stats.map(_.toPlanStats(output, conf.cboEnabled)))
       .getOrElse(Statistics(sizeInBytes = relation.sizeInBytes))
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/1ff4a77b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala
index f6df077..0a0407d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala
@@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.catalog.{CatalogStatistics, CatalogTable, H
 import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Histogram, HistogramBin, LogicalPlan}
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.execution.datasources.LogicalRelation
-import org.apache.spark.sql.internal.StaticSQLConf
+import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types.Decimal
 
@@ -223,11 +223,19 @@ abstract class StatisticsCollectionTestBase extends QueryTest with SQLTestUtils
     assert(catalogTable.stats.get.colStats == Map("c1" -> emptyColStat))
 
     // Check relation statistics
-    assert(relation.stats.sizeInBytes == 0)
-    assert(relation.stats.rowCount == Some(0))
-    assert(relation.stats.attributeStats.size == 1)
-    val (attribute, colStat) = relation.stats.attributeStats.head
-    assert(attribute.name == "c1")
-    assert(colStat == emptyColStat)
+    withSQLConf(SQLConf.CBO_ENABLED.key -> "true") {
+      assert(relation.stats.sizeInBytes == 0)
+      assert(relation.stats.rowCount == Some(0))
+      assert(relation.stats.attributeStats.size == 1)
+      val (attribute, colStat) = relation.stats.attributeStats.head
+      assert(attribute.name == "c1")
+      assert(colStat == emptyColStat)
+    }
+    relation.invalidateStatsCache()
+    withSQLConf(SQLConf.CBO_ENABLED.key -> "false") {
+      assert(relation.stats.sizeInBytes == 0)
+      assert(relation.stats.rowCount.isEmpty)
+      assert(relation.stats.attributeStats.isEmpty)
+    }
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/1ff4a77b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala
index 3066a4f..dfabf1e 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala
@@ -18,8 +18,10 @@
 package org.apache.spark.sql.hive.execution
 
 import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.parser.ParseException
 import org.apache.spark.sql.hive.test.TestHiveSingleton
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SQLTestUtils
 
 /**
@@ -29,21 +31,32 @@ class HiveExplainSuite extends QueryTest with SQLTestUtils with TestHiveSingleto
   import testImplicits._
 
   test("show cost in explain command") {
+    val explainCostCommand = "EXPLAIN COST  SELECT * FROM src"
     // For readability, we only show optimized plan and physical plan in explain cost command
-    checkKeywordsExist(sql("EXPLAIN COST  SELECT * FROM src "),
+    checkKeywordsExist(sql(explainCostCommand),
       "Optimized Logical Plan", "Physical Plan")
-    checkKeywordsNotExist(sql("EXPLAIN COST  SELECT * FROM src "),
+    checkKeywordsNotExist(sql(explainCostCommand),
       "Parsed Logical Plan", "Analyzed Logical Plan")
 
-    // Only has sizeInBytes before ANALYZE command
-    checkKeywordsExist(sql("EXPLAIN COST  SELECT * FROM src "), "sizeInBytes")
-    checkKeywordsNotExist(sql("EXPLAIN COST  SELECT * FROM src "), "rowCount")
+    withSQLConf(SQLConf.CBO_ENABLED.key -> "true") {
+      // Only has sizeInBytes before ANALYZE command
+      checkKeywordsExist(sql(explainCostCommand), "sizeInBytes")
+      checkKeywordsNotExist(sql(explainCostCommand), "rowCount")
 
-    // Has both sizeInBytes and rowCount after ANALYZE command
-    sql("ANALYZE TABLE src COMPUTE STATISTICS")
-    checkKeywordsExist(sql("EXPLAIN COST  SELECT * FROM src "), "sizeInBytes", "rowCount")
+      // Has both sizeInBytes and rowCount after ANALYZE command
+      sql("ANALYZE TABLE src COMPUTE STATISTICS")
+      checkKeywordsExist(sql(explainCostCommand), "sizeInBytes", "rowCount")
+    }
+
+    spark.sessionState.catalog.refreshTable(TableIdentifier("src"))
+
+    withSQLConf(SQLConf.CBO_ENABLED.key -> "false") {
+      // Don't show rowCount if cbo is disabled
+      checkKeywordsExist(sql(explainCostCommand), "sizeInBytes")
+      checkKeywordsNotExist(sql(explainCostCommand), "rowCount")
+    }
 
-    // No cost information
+    // No statistics information if "cost" is not specified
     checkKeywordsNotExist(sql("EXPLAIN  SELECT * FROM src "), "sizeInBytes", "rowCount")
   }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org