Posted to commits@spark.apache.org by we...@apache.org on 2020/05/11 13:03:02 UTC

[spark] branch branch-3.0 updated: [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 7b567e4  [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps
7b567e4 is described below

commit 7b567e4f1414e6a3def73f0d3b2f8b058ae58d43
Author: Max Gekk <ma...@gmail.com>
AuthorDate: Mon May 11 12:59:41 2020 +0000

    [SPARK-31665][SQL][TESTS] Check parquet dictionary encoding of random dates/timestamps
    
    ### What changes were proposed in this pull request?
    Modified `RandomDataGenerator.forType` for DateType and TimestampType to generate special date/timestamp values with 0.5 probability (a sketch of the helper involved follows below). This triggers dictionary encoding in the Parquet datasource test HadoopFsRelationTest "test all data types". Currently, dictionary encoding is tested only for numeric types like ShortType.
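    
    The `randomNumeric` helper that the new code calls is not visible in the diff hunks below. Here is a minimal sketch of its presumed behavior, assuming only the 50/50 split described above; the name and signature are inferred from the call sites and are not necessarily the actual implementation:
    
    ```scala
    import scala.util.Random
    
    // Sketch only: with probability 0.5 return one of a few fixed "special"
    // values, otherwise a uniformly distributed one. Repeated special values
    // are what let Parquet choose dictionary encoding for the column.
    def randomNumeric[T](
        rand: Random,
        uniformRand: Random => T,
        specialCases: Seq[T]): Option[() => T] = {
      Some { () =>
        if (rand.nextBoolean()) specialCases(rand.nextInt(specialCases.length))
        else uniformRand(rand)
      }
    }
    ```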
    
    ### Why are the changes needed?
    To extend test coverage. Currently, the probability that the test HadoopFsRelationTest "test all data types" exercises dictionary encoding for DateType and TimestampType is close to zero: dates/timestamps are uniformly distributed over a wide range, so the chance of generating duplicate values is vanishingly small (a rough estimate is sketched below). As a result, the Parquet datasource cannot apply dictionary encoding to such column types.
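    
    For intuition, a back-of-envelope birthday-bound estimate (not part of the PR) of how likely 20 uniformly drawn timestamps contain a duplicate, using the millisecond range from the code in the diff:
    
    ```scala
    val n = 20L                                      // rows generated per column
    val range = 253402329599999L + 62135740800000L   // count of valid millisecond values
    // P(at least one duplicate) is approximately n * (n - 1) / (2 * range)
    val pDuplicate = n * (n - 1) / (2.0 * range)
    println(f"$pDuplicate%.2e")  // ~6.0e-13, i.e. effectively never
    ```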
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    By running `ParquetHadoopFsRelationSuite` and `JsonHadoopFsRelationSuite`.
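    
    To reproduce locally, an invocation along these lines should work (the `hive` sbt project name is assumed from the file paths in the diff):
    
    ```
    build/sbt "hive/testOnly org.apache.spark.sql.sources.ParquetHadoopFsRelationSuite"
    build/sbt "hive/testOnly org.apache.spark.sql.sources.JsonHadoopFsRelationSuite"
    ```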
    
    Closes #28481 from MaxGekk/test-random-parquet-dict-enc.
    
    Authored-by: Max Gekk <ma...@gmail.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
    (cherry picked from commit 32a5398b659695c338cd002d9094bdf19a89a716)
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../org/apache/spark/sql/RandomDataGenerator.scala | 100 ++++++++++++---------
 .../spark/sql/sources/HadoopFsRelationTest.scala   |   2 +-
 2 files changed, 58 insertions(+), 44 deletions(-)

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
index 5a4d23d..cf8d772 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
@@ -162,52 +162,66 @@ object RandomDataGenerator {
       })
       case BooleanType => Some(() => rand.nextBoolean())
       case DateType =>
-        val generator =
-          () => {
-            var milliseconds = rand.nextLong() % 253402329599999L
-            // -62135740800000L is the number of milliseconds before January 1, 1970, 00:00:00 GMT
-            // for "0001-01-01 00:00:00.000000". We need to find a
-            // number that is greater or equals to this number as a valid timestamp value.
-            while (milliseconds < -62135740800000L) {
-              // 253402329599999L is the number of milliseconds since
-              // January 1, 1970, 00:00:00 GMT for "9999-12-31 23:59:59.999999".
-              milliseconds = rand.nextLong() % 253402329599999L
-            }
-            val date = DateTimeUtils.toJavaDate((milliseconds / MILLIS_PER_DAY).toInt)
-            // The generated `date` is based on the hybrid calendar Julian + Gregorian since
-            // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used
-            // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `date` to
-            // a local date in Proleptic Gregorian calendar to satisfy this requirement.
-            // Some years are leap years in Julian calendar but not in Proleptic Gregorian calendar.
-            // As the consequence of that, 29 February of such years might not exist in Proleptic
-            // Gregorian calendar. When this happens, we shift the date by one day.
-            Try { date.toLocalDate; date }.getOrElse(new Date(date.getTime + MILLIS_PER_DAY))
+        def uniformDateRand(rand: Random): java.sql.Date = {
+          var milliseconds = rand.nextLong() % 253402329599999L
+          // -62135740800000L is the number of milliseconds before January 1, 1970, 00:00:00 GMT
+          // for "0001-01-01 00:00:00.000000". We need to find a
+          // number that is greater than or equal to this number as a valid timestamp value.
+          while (milliseconds < -62135740800000L) {
+            // 253402329599999L is the number of milliseconds since
+            // January 1, 1970, 00:00:00 GMT for "9999-12-31 23:59:59.999999".
+            milliseconds = rand.nextLong() % 253402329599999L
           }
-        Some(generator)
+          val date = DateTimeUtils.toJavaDate((milliseconds / MILLIS_PER_DAY).toInt)
+          // The generated `date` is based on the hybrid calendar Julian + Gregorian since
+          // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used
+          // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `date` to
+          // a local date in Proleptic Gregorian calendar to satisfy this requirement.
+          // Some years are leap years in Julian calendar but not in Proleptic Gregorian calendar.
+          // As the consequence of that, 29 February of such years might not exist in Proleptic
+          // Gregorian calendar. When this happens, we shift the date by one day.
+          Try { date.toLocalDate; date }.getOrElse(new Date(date.getTime + MILLIS_PER_DAY))
+        }
+        randomNumeric[java.sql.Date](
+          rand,
+          uniformDateRand,
+          Seq(
+            "0001-01-01", // the fist day of Common Era
+            "1582-10-15", // the cutover date from Julian to Gregorian calendar
+            "1970-01-01", // the epoch date
+            "9999-12-31"  // the last supported date according to SQL standard
+          ).map(java.sql.Date.valueOf))
       case TimestampType =>
-        val generator =
-          () => {
-            var milliseconds = rand.nextLong() % 253402329599999L
-            // -62135740800000L is the number of milliseconds before January 1, 1970, 00:00:00 GMT
-            // for "0001-01-01 00:00:00.000000". We need to find a
-            // number that is greater or equals to this number as a valid timestamp value.
-            while (milliseconds < -62135740800000L) {
-              // 253402329599999L is the number of milliseconds since
-              // January 1, 1970, 00:00:00 GMT for "9999-12-31 23:59:59.999999".
-              milliseconds = rand.nextLong() % 253402329599999L
-            }
-            // DateTimeUtils.toJavaTimestamp takes microsecond.
-            val ts = DateTimeUtils.toJavaTimestamp(milliseconds * 1000)
-            // The generated `ts` is based on the hybrid calendar Julian + Gregorian since
-            // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used
-            // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `ts` to
-            // a local timestamp in Proleptic Gregorian calendar to satisfy this requirement.
-            // Some years are leap years in Julian calendar but not in Proleptic Gregorian calendar.
-            // As the consequence of that, 29 February of such years might not exist in Proleptic
-            // Gregorian calendar. When this happens, we shift the timestamp `ts` by one day.
-            Try { ts.toLocalDateTime; ts }.getOrElse(new Timestamp(ts.getTime + MILLIS_PER_DAY))
+        def uniformTimestampRand(rand: Random): java.sql.Timestamp = {
+          var milliseconds = rand.nextLong() % 253402329599999L
+          // -62135740800000L is the number of milliseconds before January 1, 1970, 00:00:00 GMT
+          // for "0001-01-01 00:00:00.000000". We need to find a
+          // number that is greater than or equal to this number as a valid timestamp value.
+          while (milliseconds < -62135740800000L) {
+            // 253402329599999L is the number of milliseconds since
+            // January 1, 1970, 00:00:00 GMT for "9999-12-31 23:59:59.999999".
+            milliseconds = rand.nextLong() % 253402329599999L
           }
-        Some(generator)
+          // DateTimeUtils.toJavaTimestamp takes microsecond.
+          val ts = DateTimeUtils.toJavaTimestamp(milliseconds * 1000)
+          // The generated `ts` is based on the hybrid calendar Julian + Gregorian since
+          // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used
+          // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `ts` to
+          // a local timestamp in Proleptic Gregorian calendar to satisfy this requirement.
+          // Some years are leap years in Julian calendar but not in Proleptic Gregorian calendar.
+          // As the consequence of that, 29 February of such years might not exist in Proleptic
+          // Gregorian calendar. When this happens, we shift the timestamp `ts` by one day.
+          Try { ts.toLocalDateTime; ts }.getOrElse(new Timestamp(ts.getTime + MILLIS_PER_DAY))
+        }
+        randomNumeric[java.sql.Timestamp](
+          rand,
+          uniformTimestampRand,
+          Seq(
+            "0001-01-01 00:00:00", // the fist timestamp of Common Era
+            "1582-10-15 23:59:59", // the cutover date from Julian to Gregorian calendar
+            "1970-01-01 00:00:00", // the epoch timestamp
+            "9999-12-31 23:59:59.999" // the last supported timestamp according to SQL standard
+          ).map(java.sql.Timestamp.valueOf))
       case CalendarIntervalType => Some(() => {
         val months = rand.nextInt(1000)
         val days = rand.nextInt(10000)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala
index 4ada507..9cf1719 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala
@@ -159,7 +159,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes
               .add("index", IntegerType, nullable = false)
               .add("col", dataType, nullable = true)
             val rdd =
-              spark.sparkContext.parallelize((1 to 10).map(i => Row(i, dataGenerator())))
+              spark.sparkContext.parallelize((1 to 20).map(i => Row(i, dataGenerator())))
             val df = spark.createDataFrame(rdd, schema).orderBy("index").coalesce(1)
 
             df.write
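
A quick way to observe the intended effect, as a sketch that assumes the test-only `RandomDataGenerator.forType(dataType, nullable, rand)` signature: with four special dates drawn half the time, 20 generated values almost surely contain duplicates, which is what allows Parquet to dictionary-encode the written column.

```scala
import scala.util.Random

import org.apache.spark.sql.RandomDataGenerator
import org.apache.spark.sql.types.DateType

// Generate 20 random dates the way the modified test does and count distinct
// values. Duplicates are near-certain here (P(all 20 distinct) is on the
// order of 1e-3), so Parquet can apply dictionary encoding to the column.
val gen = RandomDataGenerator.forType(DateType, false, new Random(42)).get
val values = Seq.fill(20)(gen())
println(s"${values.distinct.size} distinct values out of ${values.size}")
```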

