You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2021/09/24 13:56:14 UTC
[GitHub] [hudi] parisni edited a comment on issue #3670: [SUPPORT] SQL stmt managed table, not update/delete with datasource API
parisni edited a comment on issue #3670:
URL: https://github.com/apache/hudi/issues/3670#issuecomment-926644355
Hi @xushiyan, see the example and explanations below.
```scala
// Step 1: build a six-row fixture and persist it as a plain parquet table.
// All three columns start out as strings; `id` is then cast to bigint and
// `creation_date` to date, while `last_update_time` is left as a string.
val inputDF = {
  val rawRows = Seq(
    ("100", "2015-01-01", "2015-01-01T13:51:39.340396Z"),
    ("101", "2015-01-01", "2015-01-01T12:14:58.597216Z"),
    ("102", "2015-01-01", "2015-01-01T13:51:40.417052Z"),
    ("103", "2015-01-01", "2015-01-01T13:51:40.519832Z"),
    ("104", "2015-01-02", "2015-01-01T12:15:00.512679Z"),
    ("105", "2015-01-02", "2015-01-01T13:51:42.248818Z")
  ).toDF("id", "creation_date", "last_update_time")
  // Each withColumn replaces an existing column in place, so the two casts are independent.
  rawRows
    .withColumn("id", expr("cast(id as bigint)"))
    .withColumn("creation_date", expr("cast(creation_date as date)"))
}
// Register the fixture in the metastore under the name test_hudi_partitionned.
inputDF.write
  .format("parquet")
  .saveAsTable("test_hudi_partitionned")
// Step 2: create a SQL-managed Hudi table (CTAS) and fill it with the parquet fixture.
// The table is declared as merge-on-read ('mor'), keyed by `id`, pre-combined on
// `last_update_time`, and partitioned by `creation_date`.
// The SELECT reads from test_hudi_partitionned, i.e. the parquet table saved in the
// previous step (the name must match what saveAsTable registered).
spark.sql(
"""
create table if not exists test_hudi_partition_sql using hudi
location 's3://test-bucket/test/test_hudi_partitionned_sql'
options (
type = 'mor',
primaryKey = 'id',
preCombineField = 'last_update_time'
)
partitioned by (creation_date)
as select id, last_update_time, creation_date
from test_hudi_partitionned
""")
// Step 3: options used to issue a delete against the Hudi table through the datasource API.
val tableName = "test_hudi_partition_sql"
// NOTE(review): the CREATE TABLE statement declares type = 'mor', whereas the storage type
// below is "COPY_ON_WRITE" — confirm whether this difference is intended for the reproduction.
// NOTE(review): the Hive sync table is "test_hudi_partition", which differs from `tableName`
// — confirm this is the table that should be synced.
val hudiOptions: Map[String, String] =
  Map(
    // Write-side settings: table name, operation, record key, partition path and pre-combine field.
    HoodieWriteConfig.TABLE_NAME -> tableName,
    DataSourceWriteOptions.OPERATION_OPT_KEY -> "delete",
    DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY -> "COPY_ON_WRITE",
    DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY -> "id",
    DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY -> "creation_date",
    DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY -> "last_update_time"
  ) ++ Map(
    // Hive sync settings.
    DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY -> "true",
    "hoodie.datasource.hive_sync.mode" -> "hms",
    DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY -> "default",
    DataSourceWriteOptions.HIVE_TABLE_OPT_KEY -> "test_hudi_partition",
    DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY -> "creation_date",
    DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY ->
      classOf[MultiPartKeysValueExtractor].getName
  )
// Step 4: select every row of the SQL-managed table and write the result back to the
// table's location as a delete, so that all rows are meant to be removed.
val rowsToDelete = spark.sql("select * from test_hudi_partition_sql")
rowsToDelete.write
  .format("org.apache.hudi")
  .mode(SaveMode.Append)
  .options(hudiOptions)
  // Set after .options(...) so that this explicit operation value is the one that applies.
  .option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL)
  .save("s3://test-bucket/test/test_hudi_partitionned_sql")
/*
// ISSUE: the write above is expected to delete every row, but it does not;
// the rows are still present, as shown below.
+-------------------+--------------------+------------------+----------------------+--------------------+---+-------------+--------------------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path| _hoodie_file_name| id|creation_date| last_update_time|
+-------------------+--------------------+------------------+----------------------+--------------------+---+-------------+--------------------+
| 20210913143339| 20210913143339_0_21| 100| 2015-01-01|ac56876d-1183-413...|100| 2015-01-01|2015-01-01T13:51:...|
| 20210913143339| 20210913143339_0_22| 101| 2015-01-01|ac56876d-1183-413...|101| 2015-01-01|2015-01-01T12:14:...|
| 20210913143339| 20210913143339_0_23| 102| 2015-01-01|ac56876d-1183-413...|102| 2015-01-01|2015-01-01T13:51:...|
| 20210913143339| 20210913143339_0_24| 103| 2015-01-01|ac56876d-1183-413...|103| 2015-01-01|2015-01-01T13:51:...|
| 20210913143339| 20210913143339_1_25| 104| 2015-01-02|34a31f32-3348-44b...|104| 2015-01-02|2015-01-01T12:15:...|
| 20210913143339| 20210913143339_1_26| 105| 2015-01-02|34a31f32-3348-44b...|105| 2015-01-02|2015-01-01T13:51:...|
+-------------------+--------------------+------------------+----------------------+--------------------+---+-------------+--------------------+
*/
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org