You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2017/07/09 10:51:12 UTC
spark git commit: [SPARK-21083][SQL][BRANCH-2.2] Store zero size and
row count when analyzing empty table
Repository: spark
Updated Branches:
refs/heads/branch-2.2 964332b28 -> 3bfad9d42
[SPARK-21083][SQL][BRANCH-2.2] Store zero size and row count when analyzing empty table
## What changes were proposed in this pull request?
We should be able to store a zero size and a zero row count after analyzing an empty table.
This is a backport of https://github.com/apache/spark/commit/9fccc3627fa41d32fbae6dbbb9bd1521e43eb4f0.
## How was this patch tested?
Added a new test.
Author: Zhenhua Wang <wa...@huawei.com>
Closes #18575 from wzhfy/analyzeEmptyTable-2.2.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3bfad9d4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3bfad9d4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3bfad9d4
Branch: refs/heads/branch-2.2
Commit: 3bfad9d4210f96dcd2270599257c3a5272cad77b
Parents: 964332b
Author: Zhenhua Wang <wa...@huawei.com>
Authored: Sun Jul 9 18:51:06 2017 +0800
Committer: Wenchen Fan <we...@databricks.com>
Committed: Sun Jul 9 18:51:06 2017 +0800
----------------------------------------------------------------------
.../execution/command/AnalyzeTableCommand.scala | 4 +-
.../spark/sql/StatisticsCollectionSuite.scala | 45 ++++++++++++++------
.../apache/spark/sql/hive/StatisticsSuite.scala | 19 +--------
3 files changed, 35 insertions(+), 33 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/3bfad9d4/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala
index 0f3c69c..bf7c227 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala
@@ -47,10 +47,10 @@ case class AnalyzeTableCommand(
}
val newTotalSize = AnalyzeTableCommand.calculateTotalSize(sessionState, tableMeta)
- val oldTotalSize = tableMeta.stats.map(_.sizeInBytes.toLong).getOrElse(0L)
+ val oldTotalSize = tableMeta.stats.map(_.sizeInBytes.toLong).getOrElse(-1L)
val oldRowCount = tableMeta.stats.flatMap(_.rowCount.map(_.toLong)).getOrElse(-1L)
var newStats: Option[CatalogStatistics] = None
- if (newTotalSize > 0 && newTotalSize != oldTotalSize) {
+ if (newTotalSize >= 0 && newTotalSize != oldTotalSize) {
newStats = Some(CatalogStatistics(sizeInBytes = newTotalSize))
}
// We only set rowCount when noscan is false, because otherwise:
http://git-wip-us.apache.org/repos/asf/spark/blob/3bfad9d4/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
index 601324f..ae0f219 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
@@ -40,17 +40,6 @@ import org.apache.spark.sql.types._
class StatisticsCollectionSuite extends StatisticsCollectionTestBase with SharedSQLContext {
import testImplicits._
- private def checkTableStats(tableName: String, expectedRowCount: Option[Int])
- : Option[CatalogStatistics] = {
- val df = spark.table(tableName)
- val stats = df.queryExecution.analyzed.collect { case rel: LogicalRelation =>
- assert(rel.catalogTable.get.stats.flatMap(_.rowCount) === expectedRowCount)
- rel.catalogTable.get.stats
- }
- assert(stats.size == 1)
- stats.head
- }
-
test("estimates the size of a limit 0 on outer join") {
withTempView("test") {
Seq(("one", 1), ("two", 2), ("three", 3), ("four", 4)).toDF("k", "v")
@@ -88,6 +77,19 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared
}
}
+ test("analyze empty table") {
+ val table = "emptyTable"
+ withTable(table) {
+ sql(s"CREATE TABLE $table (key STRING, value STRING) USING PARQUET")
+ sql(s"ANALYZE TABLE $table COMPUTE STATISTICS noscan")
+ val fetchedStats1 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = None)
+ assert(fetchedStats1.get.sizeInBytes == 0)
+ sql(s"ANALYZE TABLE $table COMPUTE STATISTICS")
+ val fetchedStats2 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(0))
+ assert(fetchedStats2.get.sizeInBytes == 0)
+ }
+ }
+
test("test table-level statistics for data source table") {
val tableName = "tbl"
withTable(tableName) {
@@ -96,11 +98,11 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared
// noscan won't count the number of rows
sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS noscan")
- checkTableStats(tableName, expectedRowCount = None)
+ checkTableStats(tableName, hasSizeInBytes = true, expectedRowCounts = None)
// without noscan, we count the number of rows
sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
- checkTableStats(tableName, expectedRowCount = Some(2))
+ checkTableStats(tableName, hasSizeInBytes = true, expectedRowCounts = Some(2))
}
}
@@ -219,6 +221,23 @@ abstract class StatisticsCollectionTestBase extends QueryTest with SQLTestUtils
private val randomName = new Random(31)
+ def checkTableStats(
+ tableName: String,
+ hasSizeInBytes: Boolean,
+ expectedRowCounts: Option[Int]): Option[CatalogStatistics] = {
+ val stats = spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName)).stats
+
+ if (hasSizeInBytes || expectedRowCounts.nonEmpty) {
+ assert(stats.isDefined)
+ assert(stats.get.sizeInBytes >= 0)
+ assert(stats.get.rowCount === expectedRowCounts)
+ } else {
+ assert(stats.isEmpty)
+ }
+
+ stats
+ }
+
/**
* Compute column stats for the given DataFrame and compare it with colStats.
*/
http://git-wip-us.apache.org/repos/asf/spark/blob/3bfad9d4/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index b03d69e..819180d 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -30,7 +30,7 @@ import org.apache.spark.sql.execution.joins._
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
-import org.apache.spark.util.Utils
+
class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleton {
@@ -217,23 +217,6 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto
}
}
- private def checkTableStats(
- tableName: String,
- hasSizeInBytes: Boolean,
- expectedRowCounts: Option[Int]): Option[CatalogStatistics] = {
- val stats = spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName)).stats
-
- if (hasSizeInBytes || expectedRowCounts.nonEmpty) {
- assert(stats.isDefined)
- assert(stats.get.sizeInBytes > 0)
- assert(stats.get.rowCount === expectedRowCounts)
- } else {
- assert(stats.isEmpty)
- }
-
- stats
- }
-
test("test table-level statistics for hive tables created in HiveExternalCatalog") {
val textTable = "textTable"
withTable(textTable) {
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org