You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2021/01/13 05:22:46 UTC

[spark] branch branch-3.1 updated: [SPARK-34084][SQL][3.1] Fix auto updating of table stats in `ALTER TABLE .. ADD PARTITION`

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.1 by this push:
     new 6f95815  [SPARK-34084][SQL][3.1] Fix auto updating of table stats in `ALTER TABLE .. ADD PARTITION`
6f95815 is described below

commit 6f95815c070038436fe12011cb5280df1b75ebeb
Author: Max Gekk <ma...@gmail.com>
AuthorDate: Wed Jan 13 05:22:15 2021 +0000

    [SPARK-34084][SQL][3.1] Fix auto updating of table stats in `ALTER TABLE .. ADD PARTITION`
    
    ### What changes were proposed in this pull request?
    Fix an issue in `ALTER TABLE .. ADD PARTITION` which happens when:
    - A table doesn't have stats
    - `spark.sql.statistics.size.autoUpdate.enabled` is `true`
    
    In that case, `ALTER TABLE .. ADD PARTITION` does not update table stats automatically.
    
    ### Why are the changes needed?
    The changes fix the issue demonstrated by the example:
    ```sql
    spark-sql> create table tbl (col0 int, part int) partitioned by (part);
    spark-sql> insert into tbl partition (part = 0) select 0;
    spark-sql> set spark.sql.statistics.size.autoUpdate.enabled=true;
    spark-sql> alter table tbl add partition (part = 1);
    ```
    The `add partition` command should update the table stats, but it does not. There are no stats in the output of:
    ```
    spark-sql> describe table extended tbl;
    ```
    
    ### Does this PR introduce _any_ user-facing change?
    Yes. After the changes, `ALTER TABLE .. ADD PARTITION` updates the table stats even when the table had no stats before the command:
    ```sql
    spark-sql> alter table tbl add partition (part = 1);
    spark-sql> describe table extended tbl;
    col0	int	NULL
    part	int	NULL
    # Partition Information
    # col_name	data_type	comment
    part	int	NULL
    
    # Detailed Table Information
    ...
    Statistics	2 bytes
    ```
    
    ### How was this patch tested?
    By running the new unit test and the existing test suites:
    ```
    $ build/sbt -Phive-2.3 -Phive-thriftserver "test:testOnly *.StatisticsSuite"
    ```
    
    Authored-by: Max Gekk <max.gekk@gmail.com>
    Signed-off-by: Wenchen Fan <wenchen@databricks.com>
    (cherry picked from commit 6c047958f9fcf4cac848695915deea289c65ddc1)
    Signed-off-by: Max Gekk <max.gekk@gmail.com>
    
    Closes #31157 from MaxGekk/fix-stats-in-add-partition-3.1.
    
    Authored-by: Max Gekk <ma...@gmail.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../org/apache/spark/sql/execution/command/ddl.scala | 20 ++++++++++----------
 .../org/apache/spark/sql/hive/StatisticsSuite.scala  | 16 ++++++++++++++++
 2 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
index 4545d73..f657f42 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
@@ -486,17 +486,17 @@ case class AlterTableAddPartitionCommand(
     }
 
     sparkSession.catalog.refreshTable(table.identifier.quotedString)
-    if (table.stats.nonEmpty) {
-      if (sparkSession.sessionState.conf.autoSizeUpdateEnabled) {
-        val addedSize = CommandUtils.calculateMultipleLocationSizes(sparkSession, table.identifier,
-          parts.map(_.storage.locationUri)).sum
-        if (addedSize > 0) {
-          val newStats = CatalogStatistics(sizeInBytes = table.stats.get.sizeInBytes + addedSize)
-          catalog.alterTableStats(table.identifier, Some(newStats))
-        }
-      } else {
-        catalog.alterTableStats(table.identifier, None)
+    if (table.stats.nonEmpty && sparkSession.sessionState.conf.autoSizeUpdateEnabled) {
+      // Updating table stats only if new partition is not empty
+      val addedSize = CommandUtils.calculateMultipleLocationSizes(sparkSession, table.identifier,
+        parts.map(_.storage.locationUri)).sum
+      if (addedSize > 0) {
+        val newStats = CatalogStatistics(sizeInBytes = table.stats.get.sizeInBytes + addedSize)
+        catalog.alterTableStats(table.identifier, Some(newStats))
       }
+    } else {
+      // Re-calculating of table size including all partitions
+      CommandUtils.updateTableStats(sparkSession, table)
     }
     Seq.empty[Row]
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index b819d3c..bfe69a2 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -1554,4 +1554,20 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto
       }
     }
   }
+
+  test("SPARK-34084: auto update table stats") {
+    Seq("parquet", "hive").foreach { format =>
+      withTable("t") {
+        withSQLConf(SQLConf.AUTO_SIZE_UPDATE_ENABLED.key -> "false") {
+          sql(s"CREATE TABLE t (col0 int, part int) USING $format PARTITIONED BY (part)")
+          sql("INSERT INTO t PARTITION (part=0) SELECT 0")
+          assert(getCatalogTable("t").stats.isEmpty)
+        }
+        withSQLConf(SQLConf.AUTO_SIZE_UPDATE_ENABLED.key -> "true") {
+          sql("ALTER TABLE t ADD PARTITION (part=1)")
+          assert(getTableStats("t").sizeInBytes > 0)
+        }
+      }
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org