You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2017/07/10 03:31:59 UTC
spark git commit: [SPARK-21083][SQL][BRANCH-2.1] Store zero size and row count when analyzing empty table

Repository: spark
Updated Branches:
  refs/heads/branch-2.1 5e2bfd5bc -> 2c2846241


[SPARK-21083][SQL][BRANCH-2.1] Store zero size and row count when analyzing empty table

## What changes were proposed in this pull request?

We should be able to store a zero size and a zero row count after analyzing an empty table.
This is a backport for https://github.com/apache/spark/commit/9fccc3627fa41d32fbae6dbbb9bd1521e43eb4f0.

## How was this patch tested?

Added a new test.

Author: Zhenhua Wang <wz...@163.com>

Closes #18577 from wzhfy/analyzeEmptyTable-2.1.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2c284624
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2c284624
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2c284624

Branch: refs/heads/branch-2.1
Commit: 2c28462411f21f71c0e048cb1f7e05efe19da6b7
Parents: 5e2bfd5
Author: Zhenhua Wang <wz...@163.com>
Authored: Mon Jul 10 11:31:55 2017 +0800
Committer: Wenchen Fan <we...@databricks.com>
Committed: Mon Jul 10 11:31:55 2017 +0800

----------------------------------------------------------------------
 .../execution/command/AnalyzeTableCommand.scala |  4 +-
 .../spark/sql/StatisticsCollectionSuite.scala   | 39 ++++++++++++++------
 2 files changed, 30 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/2c284624/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala
index 52a8fc8..e6606b4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala
@@ -60,10 +60,10 @@ case class AnalyzeTableCommand(
     }
 
     def updateTableStats(catalogTable: CatalogTable, newTotalSize: Long): Unit = {
-      val oldTotalSize = catalogTable.stats.map(_.sizeInBytes.toLong).getOrElse(0L)
+      val oldTotalSize = catalogTable.stats.map(_.sizeInBytes.toLong).getOrElse(-1L)
       val oldRowCount = catalogTable.stats.flatMap(_.rowCount.map(_.toLong)).getOrElse(-1L)
       var newStats: Option[Statistics] = None
-      if (newTotalSize > 0 && newTotalSize != oldTotalSize) {
+      if (newTotalSize >= 0 && newTotalSize != oldTotalSize) {
         newStats = Some(Statistics(sizeInBytes = newTotalSize))
       }
       // We only set rowCount when noscan is false, because otherwise:

http://git-wip-us.apache.org/repos/asf/spark/blob/2c284624/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
index c663b31..a08edbe 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
@@ -25,7 +25,6 @@ import scala.util.Random
 
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.plans.logical._
-import org.apache.spark.sql.execution.datasources.LogicalRelation
 import org.apache.spark.sql.internal.StaticSQLConf
 import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils}
 import org.apache.spark.sql.test.SQLTestData.ArrayData
@@ -38,15 +37,20 @@ import org.apache.spark.sql.types._
 class StatisticsCollectionSuite extends StatisticsCollectionTestBase with SharedSQLContext {
   import testImplicits._
 
-  private def checkTableStats(tableName: String, expectedRowCount: Option[Int])
-    : Option[Statistics] = {
-    val df = spark.table(tableName)
-    val stats = df.queryExecution.analyzed.collect { case rel: LogicalRelation =>
-      assert(rel.catalogTable.get.stats.flatMap(_.rowCount) === expectedRowCount)
-      rel.catalogTable.get.stats
+  def checkTableStats(
+      tableName: String,
+      hasSizeInBytes: Boolean,
+      expectedRowCounts: Option[Int]): Option[Statistics] = {
+    val stats = spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName)).stats
+    if (hasSizeInBytes || expectedRowCounts.nonEmpty) {
+      assert(stats.isDefined)
+      assert(stats.get.sizeInBytes >= 0)
+      assert(stats.get.rowCount === expectedRowCounts)
+    } else {
+      assert(stats.isEmpty)
     }
-    assert(stats.size == 1)
-    stats.head
+
+    stats
   }
 
   test("estimates the size of a limit 0 on outer join") {
@@ -86,6 +90,19 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared
     }
   }
 
+  test("analyze empty table") {
+    val table = "emptyTable"
+    withTable(table) {
+      sql(s"CREATE TABLE $table (key STRING, value STRING) USING PARQUET")
+      sql(s"ANALYZE TABLE $table COMPUTE STATISTICS noscan")
+      val fetchedStats1 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = None)
+      assert(fetchedStats1.get.sizeInBytes == 0)
+      sql(s"ANALYZE TABLE $table COMPUTE STATISTICS")
+      val fetchedStats2 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(0))
+      assert(fetchedStats2.get.sizeInBytes == 0)
+    }
+  }
+
   test("test table-level statistics for data source table") {
     val tableName = "tbl"
     withTable(tableName) {
@@ -94,11 +111,11 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared
 
       // noscan won't count the number of rows
       sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS noscan")
-      checkTableStats(tableName, expectedRowCount = None)
+      checkTableStats(tableName, hasSizeInBytes = true, expectedRowCounts = None)
 
       // without noscan, we count the number of rows
       sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
-      checkTableStats(tableName, expectedRowCount = Some(2))
+      checkTableStats(tableName, hasSizeInBytes = true, expectedRowCounts = Some(2))
     }
   }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org