You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2020/06/03 12:01:39 UTC

[spark] branch branch-3.0 updated: [SPARK-31878][SQL] Create date formatter only once in `HiveResult`

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 85a48910 [SPARK-31878][SQL] Create date formatter only once in `HiveResult`
85a48910 is described below

commit 85a489103465d6dc36d9565e71e38e73fb8e097f
Author: Max Gekk <ma...@gmail.com>
AuthorDate: Wed Jun 3 12:00:20 2020 +0000

    [SPARK-31878][SQL] Create date formatter only once in `HiveResult`
    
    ### What changes were proposed in this pull request?
    1. Replace `def dateFormatter` to `val dateFormatter`.
    2. Modify the `date formatting in hive result` test in `HiveResultSuite` to check modified code on various time zones.
    
    ### Why are the changes needed?
    To avoid creation of `DateFormatter` per every incoming date in `HiveResult.toHiveString`. This should eliminate unnecessary creation of `SimpleDateFormat` instances and compilation of the default pattern `yyyy-MM-dd`. The changes can speed up processing of legacy date values of the `java.sql.Date` type which is collected by default.
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    Modified a test in `HiveResultSuite`.
    
    Closes #28687 from MaxGekk/HiveResult-val-dateFormatter.
    
    Authored-by: Max Gekk <ma...@gmail.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
    (cherry picked from commit 125a89ce0880ab4c53dcb1b879f1d65ff0f589df)
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../apache/spark/sql/execution/HiveResult.scala    | 24 +++++++++++++++++-----
 .../spark/sql/execution/HiveResultSuite.scala      | 22 ++++++++++++--------
 2 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala
index 73484a2..c6b16fb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala
@@ -19,10 +19,10 @@ package org.apache.spark.sql.execution
 
 import java.nio.charset.StandardCharsets
 import java.sql.{Date, Timestamp}
-import java.time.{Instant, LocalDate}
+import java.time.{Instant, LocalDate, ZoneOffset}
 
 import org.apache.spark.sql.Row
-import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, TimestampFormatter}
+import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, LegacyDateFormats, TimestampFormatter}
 import org.apache.spark.sql.execution.command.{DescribeCommandBase, ExecutedCommandExec, ShowTablesCommand, ShowViewsCommand}
 import org.apache.spark.sql.execution.datasources.v2.{DescribeTableExec, ShowTablesExec}
 import org.apache.spark.sql.internal.SQLConf
@@ -72,9 +72,23 @@ object HiveResult {
     }
   }
 
-  private def zoneId = DateTimeUtils.getZoneId(SQLConf.get.sessionLocalTimeZone)
-  private def dateFormatter = DateFormatter(zoneId)
-  private def timestampFormatter = TimestampFormatter.getFractionFormatter(zoneId)
+  // We can create the date formatter only once because it does not depend on Spark's
+  // session time zone controlled by the SQL config `spark.sql.session.timeZone`.
+  // The `zoneId` parameter is used only in parsing of special date values like `now`,
+  // `yesterday`, etc., but not in date formatting. While formatting of:
+  // - `java.time.LocalDate`, zone id is not used by `DateTimeFormatter` at all.
+  // - `java.sql.Date`, the date formatter delegates formatting to the legacy formatter
+  //   which uses the default system time zone `TimeZone.getDefault`. This works correctly
+  //   due to `DateTimeUtils.toJavaDate` which is based on the system time zone too.
+  private val dateFormatter = DateFormatter(
+    format = DateFormatter.defaultPattern,
+    // We can set any time zone id. UTC was taken for simplicity.
+    zoneId = ZoneOffset.UTC,
+    locale = DateFormatter.defaultLocale,
+    // Use `FastDateFormat` as the legacy formatter because it is thread-safe.
+    legacyFormat = LegacyDateFormats.FAST_DATE_FORMAT)
+  private def timestampFormatter = TimestampFormatter.getFractionFormatter(
+    DateTimeUtils.getZoneId(SQLConf.get.sessionLocalTimeZone))
 
   /** Formats a datum (based on the given data type) and returns the string representation. */
   def toHiveString(a: (Any, DataType), nested: Boolean = false): String = a match {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala
index 5e81c74..a0b212d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala
@@ -17,21 +17,27 @@
 
 package org.apache.spark.sql.execution
 
+import org.apache.spark.sql.catalyst.util.DateTimeTestUtils
 import org.apache.spark.sql.connector.InMemoryTableCatalog
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT, SharedSparkSession}
 
 class HiveResultSuite extends SharedSparkSession {
   import testImplicits._
 
   test("date formatting in hive result") {
-    val dates = Seq("2018-12-28", "1582-10-03", "1582-10-04", "1582-10-15")
-    val df = dates.toDF("a").selectExpr("cast(a as date) as b")
-    val executedPlan1 = df.queryExecution.executedPlan
-    val result = HiveResult.hiveResultString(executedPlan1)
-    assert(result == dates)
-    val executedPlan2 = df.selectExpr("array(b)").queryExecution.executedPlan
-    val result2 = HiveResult.hiveResultString(executedPlan2)
-    assert(result2 == dates.map(x => s"[$x]"))
+    DateTimeTestUtils.outstandingTimezonesIds.foreach { zoneId =>
+      withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> zoneId) {
+        val dates = Seq("2018-12-28", "1582-10-03", "1582-10-04", "1582-10-15")
+        val df = dates.toDF("a").selectExpr("cast(a as date) as b")
+        val executedPlan1 = df.queryExecution.executedPlan
+        val result = HiveResult.hiveResultString(executedPlan1)
+        assert(result == dates)
+        val executedPlan2 = df.selectExpr("array(b)").queryExecution.executedPlan
+        val result2 = HiveResult.hiveResultString(executedPlan2)
+        assert(result2 == dates.map(x => s"[$x]"))
+      }
+    }
   }
 
   test("timestamp formatting in hive result") {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org