Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2022/02/09 15:36:53 UTC

[GitHub] [spark] wzhfy commented on a change in pull request #35440: [SPARK-38140][SQL] Desc column stats (min, max) for timestamp type is not consistent with the values due to time zone difference

wzhfy commented on a change in pull request #35440:
URL: https://github.com/apache/spark/pull/35440#discussion_r802796394



##########
File path: sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
##########
@@ -470,7 +470,98 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared
     }
   }
 
-  def getStatAttrNames(tableName: String): Set[String] = {
+  private def checkDescTimestampColStatsByZone(
+      tableName: String,
+      timestampColumn: String,
+      expectedMinTimestamp: String,
+      expectedMaxTimestamp: String): Unit = {
+
+    def extractColumnStatsFromDesc(statsName: String, rows: Array[Row]): String = {
+      rows.collect {
+        case r: Row if r.getString(0) == statsName =>
+          r.getString(1)
+      }.head
+    }
+
+    val descTsCol = sql(s"DESC FORMATTED $tableName $timestampColumn").collect()
+    assert(extractColumnStatsFromDesc("min", descTsCol) == expectedMinTimestamp)
+    assert(extractColumnStatsFromDesc("max", descTsCol) == expectedMaxTimestamp)
+  }
+
+  test("describe column stats (min, max) for timestamp column: desc results should be consistent " +
+    "with the written value if writing and desc happen in the same time zone") {
+
+    val original = TimeZone.getDefault
+    try {
+      Seq("UTC", "PST", "Asia/Hong_Kong").foreach { timeZoneId =>
+        val table = "insert_desc_same_time_zone"
+        val tsCol = "timestamp_typed_col"
+        withTable(table) {
+
+          TimeZone.setDefault(DateTimeUtils.getTimeZone(timeZoneId))
+
+          val minTimestamp = "make_timestamp(2022, 1, 1, 0, 0, 1.123456)"
+          val maxTimestamp = "make_timestamp(2022, 1, 3, 0, 0, 2.987654)"
+          sql(s"CREATE TABLE $table ($tsCol Timestamp) USING parquet")
+          sql(s"INSERT INTO $table VALUES $minTimestamp, $maxTimestamp")
+          sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR ALL COLUMNS")
+
+          checkDescTimestampColStatsByZone(
+            tableName = table,
+            timestampColumn = tsCol,
+            expectedMinTimestamp = "2022-01-01 00:00:01.123456",
+            expectedMaxTimestamp = "2022-01-03 00:00:02.987654")
+        }
+      }
+    } finally {
+      TimeZone.setDefault(original)
+    }
+  }
+
+  test("describe column stats (min, max) for timestamp column: desc should show different " +
+    "results if writing in UTC and desc in other time zones") {
+
+    val table = "insert_desc_diff_time_zones"
+    val tsCol = "timestamp_typed_col"
+
+    val original = TimeZone.getDefault

Review comment:
       Thanks! Didn't know we had this util method. Updated.
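
       The util method is not named in this thread; a plausible candidate is the test helper
       DateTimeTestUtils.withDefaultTimeZone in org.apache.spark.sql.catalyst.util, which runs a
       block under a given default time zone and restores the previous one when the block exits.
       Below is a minimal sketch of how the manual TimeZone.getDefault / setDefault bookkeeping
       above could look with such a helper. The helper name, its ZoneId-based signature, and the
       use of DateTimeUtils.getZoneId are assumptions here, not confirmed by this thread, and the
       sketch reuses the suite's sql/withTable helpers:

           import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.withDefaultTimeZone
           import org.apache.spark.sql.catalyst.util.DateTimeUtils

           Seq("UTC", "PST", "Asia/Hong_Kong").foreach { timeZoneId =>
             val table = "insert_desc_same_time_zone"
             val tsCol = "timestamp_typed_col"
             withTable(table) {
               // Assumed helper: swaps the JVM default time zone for the duration of the block
               // and restores the previous default afterwards, replacing the try/finally around
               // TimeZone.setDefault in the diff above.
               withDefaultTimeZone(DateTimeUtils.getZoneId(timeZoneId)) {
                 sql(s"CREATE TABLE $table ($tsCol Timestamp) USING parquet")
                 sql(s"INSERT INTO $table VALUES " +
                   "make_timestamp(2022, 1, 1, 0, 0, 1.123456), " +
                   "make_timestamp(2022, 1, 3, 0, 0, 2.987654)")
                 sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR ALL COLUMNS")
                 checkDescTimestampColStatsByZone(
                   tableName = table,
                   timestampColumn = tsCol,
                   expectedMinTimestamp = "2022-01-01 00:00:01.123456",
                   expectedMaxTimestamp = "2022-01-03 00:00:02.987654")
               }
             }
           }

       Scoping the time zone change this way avoids leaking a modified default time zone into
       other tests if an assertion fails partway through.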




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org