You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2021/01/13 05:22:46 UTC
[spark] branch branch-3.1 updated: [SPARK-34084][SQL][3.1] Fix auto
updating of table stats in `ALTER TABLE .. ADD PARTITION`
This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new 6f95815 [SPARK-34084][SQL][3.1] Fix auto updating of table stats in `ALTER TABLE .. ADD PARTITION`
6f95815 is described below
commit 6f95815c070038436fe12011cb5280df1b75ebeb
Author: Max Gekk <ma...@gmail.com>
AuthorDate: Wed Jan 13 05:22:15 2021 +0000
[SPARK-34084][SQL][3.1] Fix auto updating of table stats in `ALTER TABLE .. ADD PARTITION`
### What changes were proposed in this pull request?
Fix an issue in `ALTER TABLE .. ADD PARTITION` which happens when:
- A table doesn't have stats
- `spark.sql.statistics.size.autoUpdate.enabled` is `true`
In that case, `ALTER TABLE .. ADD PARTITION` does not update table stats automatically.
### Why are the changes needed?
The changes fix the issue demonstrated by the example:
```sql
spark-sql> create table tbl (col0 int, part int) partitioned by (part);
spark-sql> insert into tbl partition (part = 0) select 0;
spark-sql> set spark.sql.statistics.size.autoUpdate.enabled=true;
spark-sql> alter table tbl add partition (part = 1);
```
The `add partition` command should update table stats, but it does not. There are no stats in the output of:
```
spark-sql> describe table extended tbl;
```
### Does this PR introduce _any_ user-facing change?
Yes. After the changes, `ALTER TABLE .. ADD PARTITION` updates stats even when a table doesn't have stats before the command:
```sql
spark-sql> alter table tbl add partition (part = 1);
spark-sql> describe table extended tbl;
col0 int NULL
part int NULL
# Partition Information
# col_name data_type comment
part int NULL
# Detailed Table Information
...
Statistics 2 bytes
```
### How was this patch tested?
By running the new unit test and the existing test suites:
```
$ build/sbt -Phive-2.3 -Phive-thriftserver "test:testOnly *.StatisticsSuite"
```
Authored-by: Max Gekk <max.gekk@gmail.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
(cherry picked from commit 6c047958f9fcf4cac848695915deea289c65ddc1)
Signed-off-by: Max Gekk <max.gekk@gmail.com>
Closes #31157 from MaxGekk/fix-stats-in-add-partition-3.1.
Authored-by: Max Gekk <ma...@gmail.com>
Signed-off-by: Wenchen Fan <we...@databricks.com>
---
.../org/apache/spark/sql/execution/command/ddl.scala | 20 ++++++++++----------
.../org/apache/spark/sql/hive/StatisticsSuite.scala | 16 ++++++++++++++++
2 files changed, 26 insertions(+), 10 deletions(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
index 4545d73..f657f42 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
@@ -486,17 +486,17 @@ case class AlterTableAddPartitionCommand(
}
sparkSession.catalog.refreshTable(table.identifier.quotedString)
- if (table.stats.nonEmpty) {
- if (sparkSession.sessionState.conf.autoSizeUpdateEnabled) {
- val addedSize = CommandUtils.calculateMultipleLocationSizes(sparkSession, table.identifier,
- parts.map(_.storage.locationUri)).sum
- if (addedSize > 0) {
- val newStats = CatalogStatistics(sizeInBytes = table.stats.get.sizeInBytes + addedSize)
- catalog.alterTableStats(table.identifier, Some(newStats))
- }
- } else {
- catalog.alterTableStats(table.identifier, None)
+ if (table.stats.nonEmpty && sparkSession.sessionState.conf.autoSizeUpdateEnabled) {
+ // Updating table stats only if new partition is not empty
+ val addedSize = CommandUtils.calculateMultipleLocationSizes(sparkSession, table.identifier,
+ parts.map(_.storage.locationUri)).sum
+ if (addedSize > 0) {
+ val newStats = CatalogStatistics(sizeInBytes = table.stats.get.sizeInBytes + addedSize)
+ catalog.alterTableStats(table.identifier, Some(newStats))
}
+ } else {
+ // Re-calculating of table size including all partitions
+ CommandUtils.updateTableStats(sparkSession, table)
}
Seq.empty[Row]
}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index b819d3c..bfe69a2 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -1554,4 +1554,20 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto
}
}
}
+
+ test("SPARK-34084: auto update table stats") {
+ Seq("parquet", "hive").foreach { format =>
+ withTable("t") {
+ withSQLConf(SQLConf.AUTO_SIZE_UPDATE_ENABLED.key -> "false") {
+ sql(s"CREATE TABLE t (col0 int, part int) USING $format PARTITIONED BY (part)")
+ sql("INSERT INTO t PARTITION (part=0) SELECT 0")
+ assert(getCatalogTable("t").stats.isEmpty)
+ }
+ withSQLConf(SQLConf.AUTO_SIZE_UPDATE_ENABLED.key -> "true") {
+ sql("ALTER TABLE t ADD PARTITION (part=1)")
+ assert(getTableStats("t").sizeInBytes > 0)
+ }
+ }
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org