You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by "Sungju Jin (Jira)" <ji...@apache.org> on 2020/07/17 04:04:00 UTC
[jira] [Created] (HUDI-1107) Respect
spark.sql.parquet.outputTimestampType when writing datasets
Sungju Jin created HUDI-1107:
--------------------------------
Summary: Respect spark.sql.parquet.outputTimestampType when writing datasets
Key: HUDI-1107
URL: https://issues.apache.org/jira/browse/HUDI-1107
Project: Apache Hudi
Issue Type: Bug
Affects Versions: 0.5.2
Reporter: Sungju Jin
Hi team, I've noticed that `spark.sql.parquet.outputTimestampType` is somehow ignored when writing datasets with Hudi. It would be great if Hudi could respect this setting. Thank you!
{code:java}
val spark = SparkSession
.builder()
.master("local")
.getOrCreate()
import spark.implicits._
spark.conf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MILLIS")
val inputs = """{"id":"2", "discount_rate_id": "202", "price": "10.00", "currency": "usd", "start_date": "2020-04-19", "end_date": "null", "created": "2020-04-19 10:57:17", "modified": "2018-04-19 10:57:17"}
{"id":"4", "discount_rate_id": "202", "price": "10.00", "currency": "eur", "start_date": "2020-04-19", "end_date": "null", "created": "2020-04-19 10:57:18", "modified": "2018-04-19 10:57:18"}"""
val df = spark
.read
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss")
.json(Seq(inputs).toDS)
.toDF
.withColumn("created", col("created").cast(TimestampType))
.withColumn("modified", col("modified").cast(TimestampType))
df.write
.format("org.apache.hudi")
.option(PRECOMBINE_FIELD_OPT_KEY, "id")
.option(RECORDKEY_FIELD_OPT_KEY, "created")
.option(TABLE_NAME, s"adhoc.test")
.mode("overwrite")
.save("/tmp/test")
{code}
{code:java}
$ cd /tmp/test/default/
$ parquet-tools schema 0e924c77-0e31-46c4-ade9-f9cc97ea02cd-0_0-21-12005_20200716205246.parquet
message adhoc.test_record {
optional binary _hoodie_commit_time (UTF8);
optional binary _hoodie_commit_seqno (UTF8);
optional binary _hoodie_record_key (UTF8);
optional binary _hoodie_partition_path (UTF8);
optional binary _hoodie_file_name (UTF8);
optional int64 created (TIMESTAMP_MICROS); <-- This should be TIMESTAMP_MILLIS
optional binary currency (UTF8);
optional binary discount_rate_id (UTF8);
optional binary end_date (UTF8);
optional binary id (UTF8);
optional int64 modified (TIMESTAMP_MICROS); <-- This should be TIMESTAMP_MILLIS
optional binary price (UTF8);
optional binary start_date (UTF8);
}
{code}
--
This message was sent by Atlassian Jira
(v8.3.4#803005)