Posted to commits@spark.apache.org by sr...@apache.org on 2019/03/06 14:27:18 UTC

[spark] branch master updated: [SPARK-27031][SQL] Avoid double formatting in timestampToString

This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 9b55722  [SPARK-27031][SQL] Avoid double formatting in timestampToString
9b55722 is described below

commit 9b55722161e050fc104f5aac86cffd79bcd6905c
Author: Maxim Gekk <ma...@gmail.com>
AuthorDate: Wed Mar 6 08:26:59 2019 -0600

    [SPARK-27031][SQL] Avoid double formatting in timestampToString
    
    ## What changes were proposed in this pull request?
    
    Removed the unnecessary conversion of microseconds in `DateTimeUtils.timestampToString` to `java.sql.Timestamp`, which was used only to obtain the fraction of a second from the timestamp's string representation. It is replaced by a special `TimestampFormatter` that appends a fraction formatter to `DateTimeFormatterBuilder`: `appendFraction(ChronoField.NANO_OF_SECOND, 0, 9, true)`. The minimum width of 0 means that trailing zeros in the second's fraction are truncated while formatting.
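
    For illustration (not part of this patch), a minimal standalone sketch of how `appendFraction(ChronoField.NANO_OF_SECOND, 0, 9, true)` behaves; the object name `FractionDemo` and the sample values are invented for this example:

    ```scala
    import java.time.LocalTime
    import java.time.format.DateTimeFormatterBuilder
    import java.time.temporal.ChronoField

    object FractionDemo extends App {
      // A fraction of 0 to 9 digits after a decimal point; the minimum width
      // of 0 drops trailing zeros (and the point itself when the fraction is 0).
      val formatter = new DateTimeFormatterBuilder()
        .appendValue(ChronoField.HOUR_OF_DAY, 2).appendLiteral(':')
        .appendValue(ChronoField.MINUTE_OF_HOUR, 2).appendLiteral(':')
        .appendValue(ChronoField.SECOND_OF_MINUTE, 2)
        .appendFraction(ChronoField.NANO_OF_SECOND, 0, 9, true)
        .toFormatter()

      println(formatter.format(LocalTime.of(15, 0, 1, 123400000))) // 15:00:01.1234
      println(formatter.format(LocalTime.of(15, 0, 1, 0)))         // 15:00:01
    }
    ```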
    
    ## How was this patch tested?
    
    By existing test suites such as `CastSuite`, `DateTimeUtilsSuite`, and `JDBCSuite`, and by a new test in `TimestampFormatterSuite`.
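
    To see the new behavior interactively, a hypothetical `spark-shell` session (not part of the patch) might look like:

    ```scala
    import java.util.TimeZone
    import org.apache.spark.sql.catalyst.util.TimestampFormatter

    // getFractionFormatter is added by this patch; format takes microseconds since the epoch.
    val tf = TimestampFormatter.getFractionFormatter(TimeZone.getTimeZone("UTC"))
    tf.format(0L)        // "1970-01-01 00:00:00"
    tf.format(1123400L)  // "1970-01-01 00:00:01.1234"
    ```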
    
    Closes #23936 from MaxGekk/timestamp-to-string.
    
    Lead-authored-by: Maxim Gekk <ma...@gmail.com>
    Co-authored-by: Maxim Gekk <ma...@databricks.com>
    Signed-off-by: Sean Owen <se...@databricks.com>
---
 .../spark/sql/catalyst/expressions/Cast.scala      |  2 +-
 .../catalyst/util/DateTimeFormatterHelper.scala    | 26 ++++++++++++++++++----
 .../spark/sql/catalyst/util/DateTimeUtils.scala    | 10 +--------
 .../sql/catalyst/util/TimestampFormatter.scala     | 21 ++++++++++++++++-
 .../sql/catalyst/util/DateTimeUtilsSuite.scala     |  2 +-
 .../spark/sql/util/TimestampFormatterSuite.scala   | 11 ++++++++-
 .../apache/spark/sql/execution/HiveResult.scala    |  2 +-
 .../execution/datasources/jdbc/JDBCRelation.scala  |  3 ++-
 8 files changed, 58 insertions(+), 19 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 84087ae..c238ccb 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -233,7 +233,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String
   @inline private[this] def buildCast[T](a: Any, func: T => Any): Any = func(a.asInstanceOf[T])
 
   private lazy val dateFormatter = DateFormatter()
-  private lazy val timestampFormatter = TimestampFormatter(timeZone)
+  private lazy val timestampFormatter = TimestampFormatter.getFractionFormatter(timeZone)
 
   // UDFToString
   private[this] def castToString(from: DataType): Any => Any = from match {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala
index 81ad6ad..a0c0db3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala
@@ -62,10 +62,12 @@ private object DateTimeFormatterHelper {
     .maximumSize(128)
     .build[(String, Locale), DateTimeFormatter]()
 
-  def buildFormatter(pattern: String, locale: Locale): DateTimeFormatter = {
-    new DateTimeFormatterBuilder()
-      .parseCaseInsensitive()
-      .appendPattern(pattern)
+  def createBuilder(): DateTimeFormatterBuilder = {
+    new DateTimeFormatterBuilder().parseCaseInsensitive()
+  }
+
+  def toFormatter(builder: DateTimeFormatterBuilder, locale: Locale): DateTimeFormatter = {
+    builder
       .parseDefaulting(ChronoField.ERA, 1)
       .parseDefaulting(ChronoField.MONTH_OF_YEAR, 1)
       .parseDefaulting(ChronoField.DAY_OF_MONTH, 1)
@@ -75,4 +77,20 @@ private object DateTimeFormatterHelper {
       .withChronology(IsoChronology.INSTANCE)
       .withResolverStyle(ResolverStyle.STRICT)
   }
+
+  def buildFormatter(pattern: String, locale: Locale): DateTimeFormatter = {
+    val builder = createBuilder().appendPattern(pattern)
+    toFormatter(builder, locale)
+  }
+
+  lazy val fractionFormatter: DateTimeFormatter = {
+    val builder = createBuilder()
+      .append(DateTimeFormatter.ISO_LOCAL_DATE)
+      .appendLiteral(' ')
+      .appendValue(ChronoField.HOUR_OF_DAY, 2).appendLiteral(':')
+      .appendValue(ChronoField.MINUTE_OF_HOUR, 2).appendLiteral(':')
+      .appendValue(ChronoField.SECOND_OF_MINUTE, 2)
+      .appendFraction(ChronoField.NANO_OF_SECOND, 0, 9, true)
+    toFormatter(builder, TimestampFormatter.defaultLocale)
+  }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 5064220..627ee14 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -99,15 +99,7 @@ object DateTimeUtils {
 
   // Converts Timestamp to string according to Hive TimestampWritable convention.
   def timestampToString(tf: TimestampFormatter, us: SQLTimestamp): String = {
-    val ts = toJavaTimestamp(us)
-    val timestampString = ts.toString
-    val formatted = tf.format(us)
-
-    if (timestampString.length > 19 && timestampString.substring(19) != ".0") {
-      formatted + timestampString.substring(19)
-    } else {
-      formatted
-    }
+    tf.format(us)
   }
 
   /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
index c254815..e559b6a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
@@ -47,7 +47,7 @@ class Iso8601TimestampFormatter(
     timeZone: TimeZone,
     locale: Locale) extends TimestampFormatter with DateTimeFormatterHelper {
   @transient
-  private lazy val formatter = getOrCreateFormatter(pattern, locale)
+  protected lazy val formatter = getOrCreateFormatter(pattern, locale)
 
   private def toInstant(s: String): Instant = {
     val temporalAccessor = formatter.parse(s)
@@ -66,6 +66,21 @@ class Iso8601TimestampFormatter(
   }
 }
 
+/**
+ * The formatter parses/formats timestamps according to the pattern `yyyy-MM-dd HH:mm:ss.[..fff..]`,
+ * where `[..fff..]` is the fraction of a second, up to microsecond resolution. The formatter does
+ * not output trailing zeros in the fraction. For example, the timestamp `2019-03-05 15:00:01.123400`
+ * is formatted as the string `2019-03-05 15:00:01.1234`.
+ *
+ * @param timeZone the time zone in which the formatter parses or formats timestamps
+ */
+class FractionTimestampFormatter(timeZone: TimeZone)
+  extends Iso8601TimestampFormatter("", timeZone, TimestampFormatter.defaultLocale) {
+
+  @transient
+  override protected lazy val formatter = DateTimeFormatterHelper.fractionFormatter
+}
+
 object TimestampFormatter {
   val defaultPattern: String = "yyyy-MM-dd HH:mm:ss"
   val defaultLocale: Locale = Locale.US
@@ -81,4 +96,8 @@ object TimestampFormatter {
   def apply(timeZone: TimeZone): TimestampFormatter = {
     apply(defaultPattern, timeZone, defaultLocale)
   }
+
+  def getFractionFormatter(timeZone: TimeZone): TimestampFormatter = {
+    new FractionTimestampFormatter(timeZone)
+  }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index e270b91..39eb7d1 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -33,7 +33,7 @@ class DateTimeUtilsSuite extends SparkFunSuite {
   private def defaultTz = DateTimeUtils.defaultTimeZone()
 
   test("nanoseconds truncation") {
-    val tf = TimestampFormatter(DateTimeUtils.defaultTimeZone())
+    val tf = TimestampFormatter.getFractionFormatter(DateTimeUtils.defaultTimeZone())
     def checkStringToTimestamp(originalTime: String, expectedParsedTime: String) {
       val parsedTimestampOp = DateTimeUtils.stringToTimestamp(
         UTF8String.fromString(originalTime), defaultTz)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/util/TimestampFormatterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/util/TimestampFormatterSuite.scala
index d007adf..1675b61 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/util/TimestampFormatterSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/util/TimestampFormatterSuite.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.util
 
 import java.time.{LocalDateTime, ZoneOffset}
-import java.util.{Locale, TimeZone}
+import java.util.TimeZone
 import java.util.concurrent.TimeUnit
 
 import org.apache.spark.SparkFunSuite
@@ -117,4 +117,13 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper {
     assert(micros === TimeUnit.SECONDS.toMicros(
       LocalDateTime.of(2009, 3, 20, 11, 30, 1).toEpochSecond(ZoneOffset.UTC)))
   }
+
+  test("format fraction of second") {
+    val formatter = TimestampFormatter.getFractionFormatter(TimeZone.getTimeZone("UTC"))
+    assert(formatter.format(0) === "1970-01-01 00:00:00")
+    assert(formatter.format(1) === "1970-01-01 00:00:00.000001")
+    assert(formatter.format(1000) === "1970-01-01 00:00:00.001")
+    assert(formatter.format(900000) === "1970-01-01 00:00:00.9")
+    assert(formatter.format(1000000) === "1970-01-01 00:00:01")
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala
index 24fba79..38ef72e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala
@@ -78,7 +78,7 @@ object HiveResult {
     BinaryType)
 
   private lazy val dateFormatter = DateFormatter()
-  private lazy val timestampFormatter = TimestampFormatter(
+  private lazy val timestampFormatter = TimestampFormatter.getFractionFormatter(
     DateTimeUtils.getTimeZone(SQLConf.get.sessionLocalTimeZone))
 
   /** Hive outputs fields of structs slightly differently than top level attributes. */
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala
index c0f78b5..724a0f3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala
@@ -199,7 +199,8 @@ private[sql] object JDBCRelation extends Logging {
       val dateTimeStr = columnType match {
         case DateType => DateFormatter().format(value.toInt)
         case TimestampType =>
-          val timestampFormatter = TimestampFormatter(DateTimeUtils.getTimeZone(timeZoneId))
+          val timestampFormatter = TimestampFormatter.getFractionFormatter(
+            DateTimeUtils.getTimeZone(timeZoneId))
           DateTimeUtils.timestampToString(timestampFormatter, value)
       }
       s"'$dateTimeStr'"

