You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2020/05/05 00:31:16 UTC
[spark] branch branch-3.0 updated: [SPARK-31639] Revert SPARK-27528
Use Parquet logical type TIMESTAMP_MICROS by default
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new c4b292e [SPARK-31639] Revert SPARK-27528 Use Parquet logical type TIMESTAMP_MICROS by default
c4b292e is described below
commit c4b292e0368060abb81cb76439ca4a796df88b69
Author: Max Gekk <ma...@gmail.com>
AuthorDate: Mon May 4 17:27:02 2020 -0700
[SPARK-31639] Revert SPARK-27528 Use Parquet logical type TIMESTAMP_MICROS by default
### What changes were proposed in this pull request?
This reverts commit https://github.com/apache/spark/commit/43a73e387cb843486adcf5b8bbd8b99010ce6e02. It restores `INT96` as the default value of `spark.sql.parquet.outputTimestampType`, i.e. the timestamp type used when saving timestamps to Parquet files.
### Why are the changes needed?
To stay compatible with Hive and Presto, whose current stable releases do not support the `TIMESTAMP_MICROS` type.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
By existing test suites.
Closes #28450 from MaxGekk/parquet-int96.
Authored-by: Max Gekk <ma...@gmail.com>
Signed-off-by: Dongjoon Hyun <do...@apache.org>
(cherry picked from commit 372ccba0632a76a7b02cb2c558a3ecd4fae839e5)
Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
docs/sql-migration-guide.md | 2 --
.../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 2 +-
.../datasources/parquet/ParquetInteroperabilitySuite.scala | 8 ++------
.../test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala | 2 +-
4 files changed, 4 insertions(+), 10 deletions(-)
diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index d515b57..748d689 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -186,8 +186,6 @@ license: |
- In Spark version 2.4 and below, CSV datasource converts a malformed CSV string to a row with all `null`s in the PERMISSIVE mode. In Spark 3.0, the returned row can contain non-`null` fields if some of CSV column values were parsed and converted to desired types successfully.
- - In Spark 3.0, parquet logical type `TIMESTAMP_MICROS` is used by default while saving `TIMESTAMP` columns. In Spark version 2.4 and below, `TIMESTAMP` columns are saved as `INT96` in parquet files. Note that, some SQL systems such as Hive 1.x and Impala 2.x can only read `INT96` timestamps, you can set `spark.sql.parquet.outputTimestampType` as `INT96` to restore the previous behavior and keep interoperability.
-
- In Spark 3.0, when Avro files are written with user provided schema, the fields are matched by field names between catalyst schema and Avro schema instead of positions.
- In Spark 3.0, when Avro files are written with user provided non-nullable schema, even the catalyst schema is nullable, Spark is still able to write the files. However, Spark throws runtime NullPointerException if any of the records contains null.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index f6b0bbd..72946a9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -620,7 +620,7 @@ object SQLConf {
.stringConf
.transform(_.toUpperCase(Locale.ROOT))
.checkValues(ParquetOutputTimestampType.values.map(_.toString))
- .createWithDefault(ParquetOutputTimestampType.TIMESTAMP_MICROS.toString)
+ .createWithDefault(ParquetOutputTimestampType.INT96.toString)
val PARQUET_COMPRESSION = buildConf("spark.sql.parquet.compression.codec")
.doc("Sets the compression codec used when writing Parquet files. If either `compression` or " +
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala
index 649a46f..7d75077 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala
@@ -119,12 +119,8 @@ class ParquetInteroperabilitySuite extends ParquetCompatibilityTest with SharedS
).map { s => java.sql.Timestamp.valueOf(s) }
import testImplicits._
// match the column names of the file from impala
- withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key ->
- SQLConf.ParquetOutputTimestampType.INT96.toString) {
- val df = spark.createDataset(ts).toDF().repartition(1)
- .withColumnRenamed("value", "ts")
- df.write.parquet(tableDir.getAbsolutePath)
- }
+ val df = spark.createDataset(ts).toDF().repartition(1).withColumnRenamed("value", "ts")
+ df.write.parquet(tableDir.getAbsolutePath)
FileUtils.copyFile(new File(impalaPath), new File(tableDir, "part-00001.parq"))
Seq(false, true).foreach { int96TimestampConversion =>
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
index feccf52..f1cd37f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
@@ -275,7 +275,7 @@ class SQLConfSuite extends QueryTest with SharedSparkSession {
// check default value
assert(spark.sessionState.conf.parquetOutputTimestampType ==
- SQLConf.ParquetOutputTimestampType.TIMESTAMP_MICROS)
+ SQLConf.ParquetOutputTimestampType.INT96)
spark.sessionState.conf.setConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE, "timestamp_micros")
assert(spark.sessionState.conf.parquetOutputTimestampType ==
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org