You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2022/11/07 03:04:29 UTC
[GitHub] [spark] jackylee-ch commented on a diff in pull request #38496: [WIP][SPARK-40708][SQL] Auto update table statistics based on write metrics
jackylee-ch commented on code in PR #38496:
URL: https://github.com/apache/spark/pull/38496#discussion_r1014966247
##########
sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala:
##########
@@ -53,19 +53,41 @@ class PathFilterIgnoreNonData(stagingDir: String) extends PathFilter with Serial
object CommandUtils extends Logging {
/** Change statistics after changing data by commands. */
- def updateTableStats(sparkSession: SparkSession, table: CatalogTable): Unit = {
- val catalog = sparkSession.sessionState.catalog
- if (sparkSession.sessionState.conf.autoSizeUpdateEnabled) {
+ def updateTableStats(
+ sparkSession: SparkSession,
+ table: CatalogTable,
+ wroteStats: Option[WriteStats] = None): Unit = {
+ val sessionState = sparkSession.sessionState
+ val catalog = sessionState.catalog
+ if (sessionState.conf.autoSizeUpdateEnabled) {
val newTable = catalog.getTableMetadata(table.identifier)
val (newSize, newPartitions) = CommandUtils.calculateTotalSize(sparkSession, newTable)
- val isNewStats = newTable.stats.map(newSize != _.sizeInBytes).getOrElse(true)
+ val isNewStats = newTable.stats.forall(newSize != _.sizeInBytes)
if (isNewStats) {
val newStats = CatalogStatistics(sizeInBytes = newSize)
catalog.alterTableStats(table.identifier, Some(newStats))
}
if (newPartitions.nonEmpty) {
catalog.alterPartitions(table.identifier, newPartitions)
}
+ } else if (sessionState.conf.autoUpdateBasedOnMetricsEnabled && wroteStats.nonEmpty) {
+ val stat = wroteStats.get
+ stat.mode match {
+ case SaveMode.Overwrite | SaveMode.ErrorIfExists =>
+ catalog.alterTableStats(table.identifier,
+ Some(CatalogStatistics(stat.numBytes, stat.numRows)))
Review Comment:
Hm, we should consider about partition Statistics here. If we overwrite the part of the partitions, it would get wrong table statistcs.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org