You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2021/02/19 13:27:20 UTC

[spark] branch branch-3.1 updated: [SPARK-34424][SQL][TESTS][3.1][3.0] Fix failures of HiveOrcHadoopFsRelationSuite

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.1 by this push:
     new 41b22c4  [SPARK-34424][SQL][TESTS][3.1][3.0] Fix failures of HiveOrcHadoopFsRelationSuite
41b22c4 is described below

commit 41b22c4d8b5f5082a602d3adf6d3e405484a442b
Author: Max Gekk <ma...@gmail.com>
AuthorDate: Fri Feb 19 13:26:37 2021 +0000

    [SPARK-34424][SQL][TESTS][3.1][3.0] Fix failures of HiveOrcHadoopFsRelationSuite
    
    ### What changes were proposed in this pull request?
    Modify `RandomDataGenerator.forType()` to allow generation of dates/timestamps that are valid in both Julian and Proleptic Gregorian calendars. Currently, the function can produce a date (for example `1582-10-06`) which is valid in the Proleptic Gregorian calendar. Though it cannot be saved to ORC files AS IS since ORC format (ORC libs in fact) assumes Julian calendar. So, Spark shifts `1582-10-06` to the next valid date `1582-10-15` while saving it to ORC files. And as a consequence  [...]
    
    In this PR, I propose to generate valid dates/timestamps in both calendars for ORC datasource till SPARK-34440 is resolved.
    
    ### Why are the changes needed?
    The changes fix failures of `HiveOrcHadoopFsRelationSuite`. For instance, the test "test all data types" fails with the seed **610710213676**:
    ```
    == Results ==
    !== Correct Answer - 20 ==    == Spark Answer - 20 ==
     struct<index:int,col:date>   struct<index:int,col:date>
    ...
    ![9,1582-10-06]               [9,1582-10-15]
    ```
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    By running the modified test suite:
    ```
    $ build/sbt -Phive -Phive-thriftserver "test:testOnly *HiveOrcHadoopFsRelationSuite"
    ```
    
    Authored-by: Max Gekk <max.gekkgmail.com>
    Signed-off-by: HyukjinKwon <gurwls223apache.org>
    (cherry picked from commit 03161055de0c132070354407160553363175c4d7)
    Signed-off-by: Max Gekk <max.gekkgmail.com>
    
    Closes #31589 from MaxGekk/fix-HiveOrcHadoopFsRelationSuite-3.1.
    
    Authored-by: Max Gekk <ma...@gmail.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../org/apache/spark/sql/RandomDataGenerator.scala | 73 ++++++++++++++--------
 .../spark/sql/sources/HadoopFsRelationTest.scala   | 13 +++-
 2 files changed, 58 insertions(+), 28 deletions(-)

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
index 9fa27c7..0fc040f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
@@ -149,12 +149,15 @@ object RandomDataGenerator {
    * @param dataType the type to generate values for
    * @param nullable whether null values should be generated
    * @param rand an optional random number generator
+   * @param validJulianDatetime whether to generate dates and timestamps that are valid
+   *                            in the Julian calendar.
    * @return a function which can be called to generate random values.
    */
   def forType(
       dataType: DataType,
       nullable: Boolean = true,
-      rand: Random = new Random): Option[() => Any] = {
+      rand: Random = new Random,
+      validJulianDatetime: Boolean = false): Option[() => Any] = {
     val valueGenerator: Option[() => Any] = dataType match {
       case StringType => Some(() => rand.nextString(rand.nextInt(MAX_STR_LEN)))
       case BinaryType => Some(() => {
@@ -182,29 +185,37 @@ object RandomDataGenerator {
           "1970-01-01", // the epoch date
           "9999-12-31" // the last supported date according to SQL standard
         )
+        def getRandomDate(rand: Random): java.sql.Date = {
+          val date = DateTimeUtils.toJavaDate(uniformDaysRand(rand))
+          // The generated `date` is based on the hybrid calendar Julian + Gregorian since
+          // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used
+          // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `date` to
+          // a local date in Proleptic Gregorian calendar to satisfy this requirement. Some
+          // years are leap years in Julian calendar but not in Proleptic Gregorian calendar.
+          // As the consequence of that, 29 February of such years might not exist in Proleptic
+          // Gregorian calendar. When this happens, we shift the date by one day.
+          Try { date.toLocalDate; date }.getOrElse(new Date(date.getTime + MILLIS_PER_DAY))
+        }
         if (SQLConf.get.getConf(SQLConf.DATETIME_JAVA8API_ENABLED)) {
           randomNumeric[LocalDate](
             rand,
-            (rand: Random) => LocalDate.ofEpochDay(uniformDaysRand(rand)),
+            (rand: Random) => {
+              val days = if (validJulianDatetime) {
+                DateTimeUtils.fromJavaDate(getRandomDate(rand))
+              } else {
+                uniformDaysRand(rand)
+              }
+              LocalDate.ofEpochDay(days)
+            },
             specialDates.map(LocalDate.parse))
         } else {
           randomNumeric[java.sql.Date](
             rand,
-            (rand: Random) => {
-              val date = DateTimeUtils.toJavaDate(uniformDaysRand(rand))
-              // The generated `date` is based on the hybrid calendar Julian + Gregorian since
-              // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used
-              // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `date` to
-              // a local date in Proleptic Gregorian calendar to satisfy this requirement. Some
-              // years are leap years in Julian calendar but not in Proleptic Gregorian calendar.
-              // As the consequence of that, 29 February of such years might not exist in Proleptic
-              // Gregorian calendar. When this happens, we shift the date by one day.
-              Try { date.toLocalDate; date }.getOrElse(new Date(date.getTime + MILLIS_PER_DAY))
-            },
+            getRandomDate,
             specialDates.map(java.sql.Date.valueOf))
         }
       case TimestampType =>
-        def uniformMicorsRand(rand: Random): Long = {
+        def uniformMicrosRand(rand: Random): Long = {
           var milliseconds = rand.nextLong() % 253402329599999L
           // -62135740800000L is the number of milliseconds before January 1, 1970, 00:00:00 GMT
           // for "0001-01-01 00:00:00.000000". We need to find a
@@ -222,10 +233,29 @@ object RandomDataGenerator {
           "1970-01-01 00:00:00", // the epoch timestamp
           "9999-12-31 23:59:59"  // the last supported timestamp according to SQL standard
         )
+        def getRandomTimestamp(rand: Random): java.sql.Timestamp = {
+          // DateTimeUtils.toJavaTimestamp takes microsecond.
+          val ts = DateTimeUtils.toJavaTimestamp(uniformMicrosRand(rand))
+          // The generated `ts` is based on the hybrid calendar Julian + Gregorian since
+          // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used
+          // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `ts` to
+          // a local timestamp in Proleptic Gregorian calendar to satisfy this requirement. Some
+          // years are leap years in Julian calendar but not in Proleptic Gregorian calendar.
+          // As the consequence of that, 29 February of such years might not exist in Proleptic
+          // Gregorian calendar. When this happens, we shift the timestamp `ts` by one day.
+          Try { ts.toLocalDateTime; ts }.getOrElse(new Timestamp(ts.getTime + MILLIS_PER_DAY))
+        }
         if (SQLConf.get.getConf(SQLConf.DATETIME_JAVA8API_ENABLED)) {
           randomNumeric[Instant](
             rand,
-            (rand: Random) => DateTimeUtils.microsToInstant(uniformMicorsRand(rand)),
+            (rand: Random) => {
+              val micros = if (validJulianDatetime) {
+                DateTimeUtils.fromJavaTimestamp(getRandomTimestamp(rand))
+              } else {
+                uniformMicrosRand(rand)
+              }
+              DateTimeUtils.microsToInstant(micros)
+            },
             specialTs.map { s =>
               val ldt = LocalDateTime.parse(s.replace(" ", "T"))
               ldt.atZone(ZoneId.systemDefault()).toInstant
@@ -233,18 +263,7 @@ object RandomDataGenerator {
         } else {
           randomNumeric[java.sql.Timestamp](
             rand,
-            (rand: Random) => {
-              // DateTimeUtils.toJavaTimestamp takes microsecond.
-              val ts = DateTimeUtils.toJavaTimestamp(uniformMicorsRand(rand))
-              // The generated `ts` is based on the hybrid calendar Julian + Gregorian since
-              // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used
-              // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `ts` to
-              // a local timestamp in Proleptic Gregorian calendar to satisfy this requirement. Some
-              // years are leap years in Julian calendar but not in Proleptic Gregorian calendar.
-              // As the consequence of that, 29 February of such years might not exist in Proleptic
-              // Gregorian calendar. When this happens, we shift the timestamp `ts` by one day.
-              Try { ts.toLocalDateTime; ts }.getOrElse(new Timestamp(ts.getTime + MILLIS_PER_DAY))
-            },
+            getRandomTimestamp,
             specialTs.map(java.sql.Timestamp.valueOf))
         }
       case CalendarIntervalType => Some(() => {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala
index b65a004..9befd91 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.sources
 
 import java.io.File
+import java.util.Locale
 
 import scala.util.Random
 
@@ -160,7 +161,17 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes
                 val dataGenerator = RandomDataGenerator.forType(
                   dataType = dataType,
                   nullable = true,
-                  new Random(seed)
+                  new Random(seed),
+                  // TODO(SPARK-34440): Allow saving/loading datetime in ORC w/o rebasing
+                  // The ORC datasource always performs datetime rebasing that can lead to
+                  // shifting of the original dates/timestamps. For instance, 1582-10-06 is valid
+                  // date in the Proleptic Gregorian calendar but it does not exist in the Julian
+                  // calendar. The ORC datasource shifts the date to the next valid date 1582-10-15
+                  // during rebasing of this date to the Julian calendar. Since the test compares
+                  // the original date before saving and the date loaded back from the ORC files,
+                  // we set `validJulianDatetime` to `true` to generate only Proleptic Gregorian
+                  // dates that exist in the Julian calendar and will be not changed during rebase.
+                  validJulianDatetime = dataSourceName.toLowerCase(Locale.ROOT).contains("orc")
                 ).getOrElse {
                   fail(s"Failed to create data generator for schema $dataType")
                 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org