You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2017/07/10 03:31:59 UTC
spark git commit: [SPARK-21083][SQL][BRANCH-2.1] Store zero size and
row count when analyzing empty table
Repository: spark
Updated Branches:
refs/heads/branch-2.1 5e2bfd5bc -> 2c2846241
[SPARK-21083][SQL][BRANCH-2.1] Store zero size and row count when analyzing empty table
## What changes were proposed in this pull request?
We should be able to store a zero size and a zero row count after analyzing an empty table.
This is a backport for https://github.com/apache/spark/commit/9fccc3627fa41d32fbae6dbbb9bd1521e43eb4f0.
## How was this patch tested?
Added a new test.
Author: Zhenhua Wang <wz...@163.com>
Closes #18577 from wzhfy/analyzeEmptyTable-2.1.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2c284624
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2c284624
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2c284624
Branch: refs/heads/branch-2.1
Commit: 2c28462411f21f71c0e048cb1f7e05efe19da6b7
Parents: 5e2bfd5
Author: Zhenhua Wang <wz...@163.com>
Authored: Mon Jul 10 11:31:55 2017 +0800
Committer: Wenchen Fan <we...@databricks.com>
Committed: Mon Jul 10 11:31:55 2017 +0800
----------------------------------------------------------------------
.../execution/command/AnalyzeTableCommand.scala | 4 +-
.../spark/sql/StatisticsCollectionSuite.scala | 39 ++++++++++++++------
2 files changed, 30 insertions(+), 13 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/2c284624/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala
index 52a8fc8..e6606b4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala
@@ -60,10 +60,10 @@ case class AnalyzeTableCommand(
}
def updateTableStats(catalogTable: CatalogTable, newTotalSize: Long): Unit = {
- val oldTotalSize = catalogTable.stats.map(_.sizeInBytes.toLong).getOrElse(0L)
+ val oldTotalSize = catalogTable.stats.map(_.sizeInBytes.toLong).getOrElse(-1L)
val oldRowCount = catalogTable.stats.flatMap(_.rowCount.map(_.toLong)).getOrElse(-1L)
var newStats: Option[Statistics] = None
- if (newTotalSize > 0 && newTotalSize != oldTotalSize) {
+ if (newTotalSize >= 0 && newTotalSize != oldTotalSize) {
newStats = Some(Statistics(sizeInBytes = newTotalSize))
}
// We only set rowCount when noscan is false, because otherwise:
http://git-wip-us.apache.org/repos/asf/spark/blob/2c284624/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
index c663b31..a08edbe 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
@@ -25,7 +25,6 @@ import scala.util.Random
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.plans.logical._
-import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.internal.StaticSQLConf
import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils}
import org.apache.spark.sql.test.SQLTestData.ArrayData
@@ -38,15 +37,20 @@ import org.apache.spark.sql.types._
class StatisticsCollectionSuite extends StatisticsCollectionTestBase with SharedSQLContext {
import testImplicits._
- private def checkTableStats(tableName: String, expectedRowCount: Option[Int])
- : Option[Statistics] = {
- val df = spark.table(tableName)
- val stats = df.queryExecution.analyzed.collect { case rel: LogicalRelation =>
- assert(rel.catalogTable.get.stats.flatMap(_.rowCount) === expectedRowCount)
- rel.catalogTable.get.stats
+ def checkTableStats(
+ tableName: String,
+ hasSizeInBytes: Boolean,
+ expectedRowCounts: Option[Int]): Option[Statistics] = {
+ val stats = spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName)).stats
+ if (hasSizeInBytes || expectedRowCounts.nonEmpty) {
+ assert(stats.isDefined)
+ assert(stats.get.sizeInBytes >= 0)
+ assert(stats.get.rowCount === expectedRowCounts)
+ } else {
+ assert(stats.isEmpty)
}
- assert(stats.size == 1)
- stats.head
+
+ stats
}
test("estimates the size of a limit 0 on outer join") {
@@ -86,6 +90,19 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared
}
}
+ test("analyze empty table") {
+ val table = "emptyTable"
+ withTable(table) {
+ sql(s"CREATE TABLE $table (key STRING, value STRING) USING PARQUET")
+ sql(s"ANALYZE TABLE $table COMPUTE STATISTICS noscan")
+ val fetchedStats1 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = None)
+ assert(fetchedStats1.get.sizeInBytes == 0)
+ sql(s"ANALYZE TABLE $table COMPUTE STATISTICS")
+ val fetchedStats2 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(0))
+ assert(fetchedStats2.get.sizeInBytes == 0)
+ }
+ }
+
test("test table-level statistics for data source table") {
val tableName = "tbl"
withTable(tableName) {
@@ -94,11 +111,11 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared
// noscan won't count the number of rows
sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS noscan")
- checkTableStats(tableName, expectedRowCount = None)
+ checkTableStats(tableName, hasSizeInBytes = true, expectedRowCounts = None)
// without noscan, we count the number of rows
sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
- checkTableStats(tableName, expectedRowCount = Some(2))
+ checkTableStats(tableName, hasSizeInBytes = true, expectedRowCounts = Some(2))
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org