You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2017/11/27 17:13:50 UTC
spark git commit: [SPARK-22529][SQL] Relation stats should be
consistent with other plans based on cbo config
Repository: spark
Updated Branches:
refs/heads/master 2dbe275b2 -> 1ff4a77be
[SPARK-22529][SQL] Relation stats should be consistent with other plans based on cbo config
## What changes were proposed in this pull request?
Currently, relation stats is the same whether cbo is enabled or not. While relation (`LogicalRelation` or `HiveTableRelation`) is a `LogicalPlan`, its behavior is inconsistent with other plans. This can cause confusion when user runs EXPLAIN COST commands. Besides, when CBO is disabled, we apply the size-only estimation strategy, so there's no need to propagate other catalog statistics to relation.
## How was this patch tested?
Enhanced existing tests case and added a test case.
Author: Zhenhua Wang <wa...@huawei.com>
Closes #19757 from wzhfy/catalog_stats_conversion.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1ff4a77b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1ff4a77b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1ff4a77b
Branch: refs/heads/master
Commit: 1ff4a77be498615ee7216fd9cc2d510ecbd43b27
Parents: 2dbe275
Author: Zhenhua Wang <wa...@huawei.com>
Authored: Tue Nov 28 01:13:44 2017 +0800
Committer: Wenchen Fan <we...@databricks.com>
Committed: Tue Nov 28 01:13:44 2017 +0800
----------------------------------------------------------------------
.../spark/sql/catalyst/catalog/interface.scala | 16 ++++++----
.../execution/datasources/LogicalRelation.scala | 2 +-
.../sql/StatisticsCollectionTestBase.scala | 22 +++++++++-----
.../sql/hive/execution/HiveExplainSuite.scala | 31 ++++++++++++++------
4 files changed, 49 insertions(+), 22 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/1ff4a77b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index b87bbb4..b10ce05 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -366,10 +366,16 @@ case class CatalogStatistics(
* Convert [[CatalogStatistics]] to [[Statistics]], and match column stats to attributes based
* on column names.
*/
- def toPlanStats(planOutput: Seq[Attribute]): Statistics = {
- val matched = planOutput.flatMap(a => colStats.get(a.name).map(a -> _))
- Statistics(sizeInBytes = sizeInBytes, rowCount = rowCount,
- attributeStats = AttributeMap(matched))
+ def toPlanStats(planOutput: Seq[Attribute], cboEnabled: Boolean): Statistics = {
+ if (cboEnabled) {
+ val attrStats = planOutput.flatMap(a => colStats.get(a.name).map(a -> _))
+ Statistics(sizeInBytes = sizeInBytes, rowCount = rowCount,
+ attributeStats = AttributeMap(attrStats))
+ } else {
+ // When CBO is disabled, we apply the size-only estimation strategy, so there's no need to
+ // propagate other statistics from catalog to the plan.
+ Statistics(sizeInBytes = sizeInBytes)
+ }
}
/** Readable string representation for the CatalogStatistics. */
@@ -452,7 +458,7 @@ case class HiveTableRelation(
)
override def computeStats(): Statistics = {
- tableMeta.stats.map(_.toPlanStats(output)).getOrElse {
+ tableMeta.stats.map(_.toPlanStats(output, conf.cboEnabled)).getOrElse {
throw new IllegalStateException("table stats must be specified.")
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/1ff4a77b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
index 2369957..8d715f6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
@@ -41,7 +41,7 @@ case class LogicalRelation(
override def computeStats(): Statistics = {
catalogTable
- .flatMap(_.stats.map(_.toPlanStats(output)))
+ .flatMap(_.stats.map(_.toPlanStats(output, conf.cboEnabled)))
.getOrElse(Statistics(sizeInBytes = relation.sizeInBytes))
}
http://git-wip-us.apache.org/repos/asf/spark/blob/1ff4a77b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala
index f6df077..0a0407d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala
@@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.catalog.{CatalogStatistics, CatalogTable, H
import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Histogram, HistogramBin, LogicalPlan}
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.execution.datasources.LogicalRelation
-import org.apache.spark.sql.internal.StaticSQLConf
+import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.Decimal
@@ -223,11 +223,19 @@ abstract class StatisticsCollectionTestBase extends QueryTest with SQLTestUtils
assert(catalogTable.stats.get.colStats == Map("c1" -> emptyColStat))
// Check relation statistics
- assert(relation.stats.sizeInBytes == 0)
- assert(relation.stats.rowCount == Some(0))
- assert(relation.stats.attributeStats.size == 1)
- val (attribute, colStat) = relation.stats.attributeStats.head
- assert(attribute.name == "c1")
- assert(colStat == emptyColStat)
+ withSQLConf(SQLConf.CBO_ENABLED.key -> "true") {
+ assert(relation.stats.sizeInBytes == 0)
+ assert(relation.stats.rowCount == Some(0))
+ assert(relation.stats.attributeStats.size == 1)
+ val (attribute, colStat) = relation.stats.attributeStats.head
+ assert(attribute.name == "c1")
+ assert(colStat == emptyColStat)
+ }
+ relation.invalidateStatsCache()
+ withSQLConf(SQLConf.CBO_ENABLED.key -> "false") {
+ assert(relation.stats.sizeInBytes == 0)
+ assert(relation.stats.rowCount.isEmpty)
+ assert(relation.stats.attributeStats.isEmpty)
+ }
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/1ff4a77b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala
index 3066a4f..dfabf1e 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala
@@ -18,8 +18,10 @@
package org.apache.spark.sql.hive.execution
import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.parser.ParseException
import org.apache.spark.sql.hive.test.TestHiveSingleton
+import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SQLTestUtils
/**
@@ -29,21 +31,32 @@ class HiveExplainSuite extends QueryTest with SQLTestUtils with TestHiveSingleto
import testImplicits._
test("show cost in explain command") {
+ val explainCostCommand = "EXPLAIN COST SELECT * FROM src"
// For readability, we only show optimized plan and physical plan in explain cost command
- checkKeywordsExist(sql("EXPLAIN COST SELECT * FROM src "),
+ checkKeywordsExist(sql(explainCostCommand),
"Optimized Logical Plan", "Physical Plan")
- checkKeywordsNotExist(sql("EXPLAIN COST SELECT * FROM src "),
+ checkKeywordsNotExist(sql(explainCostCommand),
"Parsed Logical Plan", "Analyzed Logical Plan")
- // Only has sizeInBytes before ANALYZE command
- checkKeywordsExist(sql("EXPLAIN COST SELECT * FROM src "), "sizeInBytes")
- checkKeywordsNotExist(sql("EXPLAIN COST SELECT * FROM src "), "rowCount")
+ withSQLConf(SQLConf.CBO_ENABLED.key -> "true") {
+ // Only has sizeInBytes before ANALYZE command
+ checkKeywordsExist(sql(explainCostCommand), "sizeInBytes")
+ checkKeywordsNotExist(sql(explainCostCommand), "rowCount")
- // Has both sizeInBytes and rowCount after ANALYZE command
- sql("ANALYZE TABLE src COMPUTE STATISTICS")
- checkKeywordsExist(sql("EXPLAIN COST SELECT * FROM src "), "sizeInBytes", "rowCount")
+ // Has both sizeInBytes and rowCount after ANALYZE command
+ sql("ANALYZE TABLE src COMPUTE STATISTICS")
+ checkKeywordsExist(sql(explainCostCommand), "sizeInBytes", "rowCount")
+ }
+
+ spark.sessionState.catalog.refreshTable(TableIdentifier("src"))
+
+ withSQLConf(SQLConf.CBO_ENABLED.key -> "false") {
+ // Don't show rowCount if cbo is disabled
+ checkKeywordsExist(sql(explainCostCommand), "sizeInBytes")
+ checkKeywordsNotExist(sql(explainCostCommand), "rowCount")
+ }
- // No cost information
+ // No statistics information if "cost" is not specified
checkKeywordsNotExist(sql("EXPLAIN SELECT * FROM src "), "sizeInBytes", "rowCount")
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org