You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2022/11/07 03:04:29 UTC

[GitHub] [spark] jackylee-ch commented on a diff in pull request #38496: [WIP][SPARK-40708][SQL] Auto update table statistics based on write metrics

jackylee-ch commented on code in PR #38496:
URL: https://github.com/apache/spark/pull/38496#discussion_r1014966247


##########
sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala:
##########
@@ -53,19 +53,41 @@ class PathFilterIgnoreNonData(stagingDir: String) extends PathFilter with Serial
 object CommandUtils extends Logging {
 
   /** Change statistics after changing data by commands. */
-  def updateTableStats(sparkSession: SparkSession, table: CatalogTable): Unit = {
-    val catalog = sparkSession.sessionState.catalog
-    if (sparkSession.sessionState.conf.autoSizeUpdateEnabled) {
+  def updateTableStats(
+      sparkSession: SparkSession,
+      table: CatalogTable,
+      wroteStats: Option[WriteStats] = None): Unit = {
+    val sessionState = sparkSession.sessionState
+    val catalog = sessionState.catalog
+    if (sessionState.conf.autoSizeUpdateEnabled) {
       val newTable = catalog.getTableMetadata(table.identifier)
       val (newSize, newPartitions) = CommandUtils.calculateTotalSize(sparkSession, newTable)
-      val isNewStats = newTable.stats.map(newSize != _.sizeInBytes).getOrElse(true)
+      val isNewStats = newTable.stats.forall(newSize != _.sizeInBytes)
       if (isNewStats) {
         val newStats = CatalogStatistics(sizeInBytes = newSize)
         catalog.alterTableStats(table.identifier, Some(newStats))
       }
       if (newPartitions.nonEmpty) {
         catalog.alterPartitions(table.identifier, newPartitions)
       }
+    } else if (sessionState.conf.autoUpdateBasedOnMetricsEnabled && wroteStats.nonEmpty) {
+      val stat = wroteStats.get
+      stat.mode match {
+        case SaveMode.Overwrite | SaveMode.ErrorIfExists =>
+          catalog.alterTableStats(table.identifier,
+            Some(CatalogStatistics(stat.numBytes, stat.numRows)))

Review Comment:
   Hm, we should consider about partition Statistics here. If we overwrite the part of the partitions, it would get wrong table statistcs.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org