Posted to commits@spark.apache.org by do...@apache.org on 2020/11/11 22:59:47 UTC
[spark] branch branch-3.0 updated: [SPARK-33404][SQL][3.0] Fix incorrect results in `date_trunc` expression
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 00be83a [SPARK-33404][SQL][3.0] Fix incorrect results in `date_trunc` expression
00be83a is described below
commit 00be83a15807cb5b26b0cd7fe491060e27bf4c81
Author: Utkarsh <ut...@databricks.com>
AuthorDate: Wed Nov 11 14:45:49 2020 -0800
[SPARK-33404][SQL][3.0] Fix incorrect results in `date_trunc` expression
**Backport #30303 to 3.0**
### What changes were proposed in this pull request?
The following query produces incorrect results:
```
SELECT date_trunc('minute', '1769-10-17 17:10:02')
```
Spark currently returns the incorrect result
```
1769-10-17 17:10:02
```
against the expected return value of
```
1769-10-17 17:10:00
```
**Steps to repro**
Run the following commands in spark-shell:
```
spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
spark.sql("SELECT date_trunc('minute', '1769-10-17 17:10:02')").show()
```
This happens because `truncTimestamp` in `org.apache.spark.sql.catalyst.util.DateTimeUtils` incorrectly assumes that time zone offsets can never have second-level granularity, and therefore skips the time zone adjustment when truncating the given timestamp to `minute`.
The same assumption is currently applied when truncating timestamps to `microsecond`, `millisecond`, `second`, or `minute`.
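For illustration (not part of the patch), the snippet below checks with plain `java.time` that sub-minute offsets really occur; per the tz database, `America/Los_Angeles` used local mean time with an offset of `-07:52:58` before standardization in 1883:
```
// Illustrative check (not part of this patch): historical zone offsets can
// have second-level granularity.
import java.time.{Instant, ZoneId}

val zone = ZoneId.of("America/Los_Angeles")
val offset = zone.getRules.getOffset(Instant.parse("1769-10-18T00:00:00Z"))
println(offset) // -07:52:58 -- an offset with seconds granularity
```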
This PR fixes the issue by truncating to `minute` with time zone information, as is already done for the coarser units; truncation to `second` and finer units remains zone-independent, which is safe because zone offsets have at most second-level precision.
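A minimal standalone sketch of such zone-aware truncation using `java.time`, approximating what the patched `TRUNC_TO_MINUTE` branch does via `truncToUnit` (the function name and the microsecond conversion here are illustrative, not the exact Spark code):
```
import java.time.{Instant, ZoneId}
import java.time.temporal.ChronoUnit

// Truncate a microsecond-precision timestamp to the minute in local
// (zone-adjusted) time, so sub-minute offsets such as -07:52:58 are
// handled correctly, then convert back to epoch microseconds.
def truncToMinute(micros: Long, zoneId: ZoneId): Long = {
  val instant = Instant.ofEpochSecond(
    Math.floorDiv(micros, 1000000L),
    Math.floorMod(micros, 1000000L) * 1000L)
  val truncated = instant.atZone(zoneId).truncatedTo(ChronoUnit.MINUTES).toInstant
  truncated.getEpochSecond * 1000000L + truncated.getNano / 1000L
}
```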
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Added new tests to `DateTimeUtilsSuite` that failed before this fix and pass with it.
Closes #30339 from utkarsh39/date_trunc_fix_3.0.
Authored-by: Utkarsh <ut...@databricks.com>
Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
.../spark/sql/catalyst/util/DateTimeUtils.scala | 6 ++--
.../sql/catalyst/util/DateTimeUtilsSuite.scala | 34 +++++++++++++++-------
2 files changed, 28 insertions(+), 12 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 3f8417f..344a6ae 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -771,8 +771,12 @@ object DateTimeUtils {
* Trunc level should be generated using `parseTruncLevel()`, should be between 0 and 9.
*/
def truncTimestamp(t: SQLTimestamp, level: Int, zoneId: ZoneId): SQLTimestamp = {
+ // Time zone offsets have a maximum precision of seconds (see `java.time.ZoneOffset`). Hence
+ // truncation to microsecond, millisecond, and second can be done
+ // without using time zone information. This results in a performance improvement.
level match {
case TRUNC_TO_MICROSECOND => t
+ case TRUNC_TO_MINUTE => truncToUnit(t, zoneId, ChronoUnit.MINUTES)
case TRUNC_TO_HOUR => truncToUnit(t, zoneId, ChronoUnit.HOURS)
case TRUNC_TO_DAY => truncToUnit(t, zoneId, ChronoUnit.DAYS)
case _ =>
@@ -781,8 +785,6 @@ object DateTimeUtils {
case TRUNC_TO_MILLISECOND => millis
case TRUNC_TO_SECOND =>
millis - Math.floorMod(millis, MILLIS_PER_SECOND)
- case TRUNC_TO_MINUTE =>
- millis - Math.floorMod(millis, MILLIS_PER_MINUTE)
case _ => // Try to truncate date levels
val dDays = millisToDays(millis, zoneId)
daysToMillis(truncDate(dDays, level), zoneId)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index d526ae1..f92d65c 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -516,18 +516,32 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper {
assert(time == None)
}
- test("truncTimestamp") {
- def testTrunc(
- level: Int,
- expected: String,
- inputTS: SQLTimestamp,
- zoneId: ZoneId = defaultZoneId): Unit = {
- val truncated =
- DateTimeUtils.truncTimestamp(inputTS, level, zoneId)
- val expectedTS = toTimestamp(expected, defaultZoneId)
- assert(truncated === expectedTS.get)
+ def testTrunc(
+ level: Int,
+ expected: String,
+ inputTS: Long,
+ zoneId: ZoneId = defaultZoneId): Unit = {
+ val truncated = DateTimeUtils.truncTimestamp(inputTS, level, zoneId)
+ val expectedTS = toTimestamp(expected, defaultZoneId)
+ assert(truncated === expectedTS.get)
+ }
+
+ test("SPARK-33404: test truncTimestamp when time zone offset from UTC has a " +
+ "granularity of seconds") {
+ for (zid <- ALL_TIMEZONES) {
+ withDefaultTimeZone(zid) {
+ val inputTS = DateTimeUtils.stringToTimestamp(
+ UTF8String.fromString("1769-10-17T17:10:02.123456"), defaultZoneId)
+ testTrunc(DateTimeUtils.TRUNC_TO_MINUTE, "1769-10-17T17:10:00", inputTS.get, zid)
+ testTrunc(DateTimeUtils.TRUNC_TO_SECOND, "1769-10-17T17:10:02", inputTS.get, zid)
+ testTrunc(DateTimeUtils.TRUNC_TO_MILLISECOND, "1769-10-17T17:10:02.123", inputTS.get, zid)
+ testTrunc(DateTimeUtils.TRUNC_TO_MICROSECOND, "1769-10-17T17:10:02.123456",
+ inputTS.get, zid)
+ }
}
+ }
+ test("truncTimestamp") {
val defaultInputTS = DateTimeUtils.stringToTimestamp(
UTF8String.fromString("2015-03-05T09:32:05.359123"), defaultZoneId)
val defaultInputTS1 = DateTimeUtils.stringToTimestamp(