You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2020/05/07 07:54:52 UTC
[spark] branch branch-3.0 updated:
[SPARK-31361][SQL][TESTS][FOLLOWUP] Check non-vectorized Parquet reader
while date/timestamp rebasing
This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new dfb916f [SPARK-31361][SQL][TESTS][FOLLOWUP] Check non-vectorized Parquet reader while date/timestamp rebasing
dfb916f is described below
commit dfb916f6b65b05dd7fd58853d97e06bc7e75d8be
Author: Max Gekk <ma...@gmail.com>
AuthorDate: Thu May 7 07:52:29 2020 +0000
[SPARK-31361][SQL][TESTS][FOLLOWUP] Check non-vectorized Parquet reader while date/timestamp rebasing
### What changes were proposed in this pull request?
In this PR, I propose to modify two tests of `ParquetIOSuite`:
- SPARK-31159: rebasing timestamps in write
- SPARK-31159: rebasing dates in write
to check non-vectorized Parquet reader together with vectorized reader.
### Why are the changes needed?
To improve test coverage and make sure that non-vectorized reader behaves similar to the vectorized reader.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
By running `ParquetIOSuite`:
```
$ ./build/sbt "test:testOnly *ParquetIOSuite"
```
Closes #28466 from MaxGekk/test-novec-rebase-ParquetIOSuite.
Authored-by: Max Gekk <ma...@gmail.com>
Signed-off-by: Wenchen Fan <we...@databricks.com>
(cherry picked from commit 272d229005b7166ab83bbb8f44a4d5e9d89424a1)
Signed-off-by: Wenchen Fan <we...@databricks.com>
---
.../datasources/parquet/ParquetIOSuite.scala | 54 +++++++++++++---------
1 file changed, 32 insertions(+), 22 deletions(-)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
index 7f0a228..af66aa0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
@@ -952,18 +952,24 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession
.write
.parquet(path)
}
- // The file metadata indicates if it needs rebase or not, so we can always get the
- // correct result regardless of the "rebaseInRead" config.
- Seq(true, false).foreach { rebase =>
- withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_DATETIME_IN_READ.key -> rebase.toString) {
- checkAnswer(spark.read.parquet(path), Row(Timestamp.valueOf(tsStr)))
- }
- }
- // Force to not rebase to prove the written datetime values are rebased and we will get
- // wrong result if we don't rebase while reading.
- withSQLConf("spark.test.forceNoRebase" -> "true") {
- checkAnswer(spark.read.parquet(path), Row(Timestamp.valueOf(nonRebased)))
+ Seq(false, true).foreach { vectorized =>
+ withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized.toString) {
+ // The file metadata indicates if it needs rebase or not, so we can always get the
+ // correct result regardless of the "rebaseInRead" config.
+ Seq(true, false).foreach { rebase =>
+ withSQLConf(
+ SQLConf.LEGACY_PARQUET_REBASE_DATETIME_IN_READ.key -> rebase.toString) {
+ checkAnswer(spark.read.parquet(path), Row(Timestamp.valueOf(tsStr)))
+ }
+ }
+
+ // Force to not rebase to prove the written datetime values are rebased
+ // and we will get wrong result if we don't rebase while reading.
+ withSQLConf("spark.test.forceNoRebase" -> "true") {
+ checkAnswer(spark.read.parquet(path), Row(Timestamp.valueOf(nonRebased)))
+ }
+ }
}
}
}
@@ -981,18 +987,22 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession
.parquet(path)
}
- // The file metadata indicates if it needs rebase or not, so we can always get the correct
- // result regardless of the "rebaseInRead" config.
- Seq(true, false).foreach { rebase =>
- withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_DATETIME_IN_READ.key -> rebase.toString) {
- checkAnswer(spark.read.parquet(path), Row(Date.valueOf("1001-01-01")))
- }
- }
+ Seq(false, true).foreach { vectorized =>
+ withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized.toString) {
+ // The file metadata indicates if it needs rebase or not, so we can always get the correct
+ // result regardless of the "rebaseInRead" config.
+ Seq(true, false).foreach { rebase =>
+ withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_DATETIME_IN_READ.key -> rebase.toString) {
+ checkAnswer(spark.read.parquet(path), Row(Date.valueOf("1001-01-01")))
+ }
+ }
- // Force to not rebase to prove the written datetime values are rebased and we will get
- // wrong result if we don't rebase while reading.
- withSQLConf("spark.test.forceNoRebase" -> "true") {
- checkAnswer(spark.read.parquet(path), Row(Date.valueOf("1001-01-07")))
+ // Force to not rebase to prove the written datetime values are rebased and we will get
+ // wrong result if we don't rebase while reading.
+ withSQLConf("spark.test.forceNoRebase" -> "true") {
+ checkAnswer(spark.read.parquet(path), Row(Date.valueOf("1001-01-07")))
+ }
+ }
}
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org