You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2022/11/07 13:18:55 UTC

[GitHub] [spark] wankunde commented on a diff in pull request #38496: [WIP][SPARK-40708][SQL] Auto update table statistics based on write metrics

wankunde commented on code in PR #38496:
URL: https://github.com/apache/spark/pull/38496#discussion_r1015410975


##########
sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala:
##########
@@ -53,19 +53,41 @@ class PathFilterIgnoreNonData(stagingDir: String) extends PathFilter with Serial
 object CommandUtils extends Logging {
 
   /** Change statistics after changing data by commands. */
-  def updateTableStats(sparkSession: SparkSession, table: CatalogTable): Unit = {
-    val catalog = sparkSession.sessionState.catalog
-    if (sparkSession.sessionState.conf.autoSizeUpdateEnabled) {
+  def updateTableStats(
+      sparkSession: SparkSession,
+      table: CatalogTable,
+      wroteStats: Option[WriteStats] = None): Unit = {
+    val sessionState = sparkSession.sessionState
+    val catalog = sessionState.catalog
+    if (sessionState.conf.autoSizeUpdateEnabled) {
       val newTable = catalog.getTableMetadata(table.identifier)
       val (newSize, newPartitions) = CommandUtils.calculateTotalSize(sparkSession, newTable)
-      val isNewStats = newTable.stats.map(newSize != _.sizeInBytes).getOrElse(true)
+      val isNewStats = newTable.stats.forall(newSize != _.sizeInBytes)
       if (isNewStats) {
         val newStats = CatalogStatistics(sizeInBytes = newSize)
         catalog.alterTableStats(table.identifier, Some(newStats))
       }
       if (newPartitions.nonEmpty) {
         catalog.alterPartitions(table.identifier, newPartitions)
       }
+    } else if (sessionState.conf.autoUpdateBasedOnMetricsEnabled && wroteStats.nonEmpty) {
+      val stat = wroteStats.get
+      stat.mode match {
+        case SaveMode.Overwrite | SaveMode.ErrorIfExists =>
+          catalog.alterTableStats(table.identifier,
+            Some(CatalogStatistics(stat.numBytes, stat.numRows)))

Review Comment:
   Hi, @jackylee-ch  Thanks for your review.  It seems we can only update stats for overwriting non-partition table.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org