Posted to commits@spark.apache.org by ge...@apache.org on 2023/03/14 04:00:36 UTC

[spark] branch branch-3.4 updated: [SPARK-42777][SQL] Support converting TimestampNTZ catalog stats to plan stats

This is an automated email from the ASF dual-hosted git repository.

gengliang pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.4 by this push:
     new a352507a12c [SPARK-42777][SQL] Support converting TimestampNTZ catalog stats to plan stats
a352507a12c is described below

commit a352507a12c256b8d196c142f65291db21c96a5f
Author: Gengliang Wang <ge...@apache.org>
AuthorDate: Mon Mar 13 21:00:02 2023 -0700

    [SPARK-42777][SQL] Support converting TimestampNTZ catalog stats to plan stats
    
    ### What changes were proposed in this pull request?
    
    When `spark.sql.cbo.planStats.enabled` or `spark.sql.cbo.enabled` is enabled, the logical plan fetches row counts and column statistics from the catalog.
    This PR adds support for converting TimestampNTZ catalog stats to plan stats.
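    
    A minimal sketch of the flow described above, assuming an illustrative table `events` with a single TimestampNTZ column `ts` (neither name is part of this change):
    
    ```scala
    // Enable plan stats so the optimizer reads catalog statistics.
    spark.conf.set("spark.sql.cbo.planStats.enabled", "true")
    
    spark.sql("CREATE TABLE events (ts TIMESTAMP_NTZ) USING parquet")
    spark.sql("INSERT INTO events VALUES (make_timestamp_ntz(2022, 1, 1, 0, 0, 1.123456))")
    spark.sql("ANALYZE TABLE events COMPUTE STATISTICS FOR ALL COLUMNS")
    
    // The optimized logical plan now carries per-column stats converted
    // from the catalog, including min/max for the TimestampNTZ column.
    val planStats = spark.table("events").queryExecution.optimizedPlan.stats
    println(planStats.attributeStats)
    ```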
    
    ### Why are the changes needed?
    
    Implement a missing piece of TimestampNTZ type support.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No, TimestampNTZ is not released yet.
    
    ### How was this patch tested?
    
    New unit test in `StatisticsCollectionSuite`.
    
    Closes #40404 from gengliangwang/fromExternalString.
    
    Authored-by: Gengliang Wang <ge...@apache.org>
    Signed-off-by: Gengliang Wang <ge...@apache.org>
    (cherry picked from commit c3ac782450583e6073b88d940af60714eb4cdf44)
    Signed-off-by: Gengliang Wang <ge...@apache.org>
---
 .../spark/sql/catalyst/catalog/interface.scala     |  2 ++
 .../spark/sql/StatisticsCollectionSuite.scala      | 24 ++++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index 6f4c4f27efc..08dd2dfd5bc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -681,6 +681,8 @@ object CatalogColumnStat extends Logging {
       case TimestampType if version == 1 =>
         DateTimeUtils.fromJavaTimestamp(java.sql.Timestamp.valueOf(s))
       case TimestampType => getTimestampFormatter(isParsing = true).parse(s)
+      case TimestampNTZType =>
+        getTimestampFormatter(isParsing = true, forTimestampNTZ = true).parse(s)
       case ByteType => s.toByte
       case ShortType => s.toShort
       case IntegerType => s.toInt
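
The new `TimestampNTZType` branch above parses the stats string stored in the catalog as a wall-clock timestamp, with no time-zone adjustment. Roughly what that conversion computes, sketched here with plain java.time rather than Spark's internal `TimestampFormatter` (a simplification, not the patched code path):

```scala
import java.time.LocalDateTime
import java.time.temporal.ChronoUnit

// Parse an external stats string such as "2022-01-01 00:00:01.123456"
// into microseconds since 1970-01-01T00:00:00, ignoring time zones.
// Swapping the space separator for 'T' lets ISO parsing handle it.
def parseNtzExternalString(s: String): Long = {
  val ldt = LocalDateTime.parse(s.replace(' ', 'T'))
  ChronoUnit.MICROS.between(LocalDateTime.of(1970, 1, 1, 0, 0), ldt)
}
```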
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
index 2ab8bb25a8b..e6b74a328e5 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
@@ -571,6 +571,30 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared
     }
   }
 
+  test("SPARK-42777: describe column stats (min, max) for timestamp_ntz column") {
+    val table = "insert_desc_same_time_zone"
+    val tsCol = "timestamp_ntz_typed_col"
+    withTable(table) {
+      val minTimestamp = "make_timestamp_ntz(2022, 1, 1, 0, 0, 1.123456)"
+      val maxTimestamp = "make_timestamp_ntz(2022, 1, 3, 0, 0, 2.987654)"
+      sql(s"CREATE TABLE $table ($tsCol timestamp_ntz) USING parquet")
+      sql(s"INSERT INTO $table VALUES $minTimestamp, $maxTimestamp")
+      sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR ALL COLUMNS")
+
+      checkDescTimestampColStats(
+        tableName = table,
+        timestampColumn = tsCol,
+        expectedMinTimestamp = "2022-01-01 00:00:01.123456",
+        expectedMaxTimestamp = "2022-01-03 00:00:02.987654")
+
+      // Converting TimestampNTZ catalog stats to plan stats
+      val columnStat = getCatalogTable(table)
+        .stats.get.colStats(tsCol).toPlanStat(tsCol, TimestampNTZType)
+      assert(columnStat.min.contains(1640995201123456L))
+      assert(columnStat.max.contains(1641168002987654L))
+    }
+  }
+
   private def getStatAttrNames(tableName: String): Set[String] = {
     val queryStats = spark.table(tableName).queryExecution.optimizedPlan.stats.attributeStats
     queryStats.map(_._1.name).toSet
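
As a sanity check on the constants asserted in the new test: a TimestampNTZ plan stat is the microsecond count from the epoch, and the two inserted wall-clock values do land on the expected numbers. A quick verification with java.time (not part of the patch):

```scala
import java.time.LocalDateTime
import java.time.temporal.ChronoUnit

val epoch = LocalDateTime.of(1970, 1, 1, 0, 0)
val min = LocalDateTime.of(2022, 1, 1, 0, 0, 1, 123456000) // 00:00:01.123456
val max = LocalDateTime.of(2022, 1, 3, 0, 0, 2, 987654000) // 00:00:02.987654

assert(ChronoUnit.MICROS.between(epoch, min) == 1640995201123456L)
assert(ChronoUnit.MICROS.between(epoch, max) == 1641168002987654L)
```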


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org