You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2022/09/08 01:48:50 UTC
[GitHub] [hudi] xiarixiaoyao commented on issue #5452: Schema Evolution: Missing column for previous records when new entry does not have the same while upsert.
xiarixiaoyao commented on issue #5452:
URL: https://github.com/apache/hudi/issues/5452#issuecomment-1240122505
@santoshsb you need use schema evolution and hoodie.datasource.write.reconcile.schema, see the follow codes
```
def perf(spark: SparkSession) = {
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions._
import org.apache.hudi.DataSourceWriteOptions
import org.apache.hudi.DataSourceReadOptions
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.hive.MultiPartKeysValueExtractor
//Define a Patient FHIR resource, for simplicity have deleted most of the elements and retained a few
val orgString = """{"resourceType":"Patient","id":"beca9a29-49bb-40e4-adff-4dbb4d664972","lastUpdated":"2022-02-14T15:18:18.90836+05:30","source":"4a0701fe-5c3b-482b-895d-875fcbd2148a","name":[{"use":"official","family":"Keeling57","given":["Serina556"],"prefix":["Ms."]}]}"""
val sqlContext = spark.sqlContext
import sqlContext.implicits._
val orgStringDf = spark.read.json(Seq(orgString).toDS)
//Specify common DataSourceWriteOptions in the single hudiOptions variable
val hudiOptions = Map[String,String](
HoodieWriteConfig.TABLE_NAME -> "patient_hudi",
DataSourceWriteOptions.TABLE_TYPE_OPT_KEY -> "COPY_ON_WRITE",
DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY -> "id",
DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY -> "source",
DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY -> "lastUpdated",
DataSourceWriteOptions.HIVE_STYLE_PARTITIONING_OPT_KEY -> "true")
//Write the orgStringDf to a Hudi table
orgStringDf.write
.format("org.apache.hudi")
.option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
.options(hudiOptions)
.mode(SaveMode.Overwrite)
.save("/work/data/updateTst/hudi/json_schema_tst")
//Read the Hudi table
val patienthudi = spark.read.format("hudi").load("/work/data/updateTst/hudi/json_schema_tst")
//Printschema
patienthudi.printSchema
//Update: Based on our usecase add a new patient resource, this resource might contain new columns and might not have existing columns (normal use case with FHIR data)
val updatedString = """{"resourceType":"Patient","id":"beca9a29-49bb-40e4-adff-4dbb4d664972","lastUpdated":"2022-02-14T15:18:18.90836+05:30","source":"4a0701fe-5c3b-482b-895d-875fcbd2148a","name":[{"use":"official","family":"Keeling57","given":["Serina556"]}]}"""
//Convert the new resource string into DF
val updatedStringDf = spark.read.json(Seq(updatedString).toDS)
//Check the schema of the new resource that is being added
updatedStringDf.printSchema
//Upsert the new resource
spark.sql("set hoodie.schema.on.read.enable=true")
updatedStringDf.write
.format("org.apache.hudi")
.options(hudiOptions)
.option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
.option(DataSourceWriteOptions.PAYLOAD_CLASS_OPT_KEY, "org.apache.hudi.common.model.EmptyHoodieRecordPayload")
.option("hoodie.datasource.write.reconcile.schema", "true")
.mode(SaveMode.Append)
.save("/work/data/updateTst/hudi/json_schema_tst")
//Read the Hudi table
val patienthudiUpdated = spark.read.format("hudi").load("/work/data/updateTst/hudi/json_schema_tst")
//Print the schema after adding the new record
patienthudiUpdated.printSchema
}
```
patienthudiUpdated.schema:
|-- _hoodie_commit_time: string (nullable = true)
|-- _hoodie_commit_seqno: string (nullable = true)
|-- _hoodie_record_key: string (nullable = true)
|-- _hoodie_partition_path: string (nullable = true)
|-- _hoodie_file_name: string (nullable = true)
|-- id: string (nullable = true)
|-- lastUpdated: string (nullable = true)
|-- name: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- family: string (nullable = true)
| | |-- given: array (nullable = true)
| | | |-- element: string (containsNull = true)
| | |-- prefix: array (nullable = true)
| | | |-- element: string (containsNull = true)
| | |-- use: string (nullable = true)
|-- resourceType: string (nullable = true)
|-- source: string (nullable = true)
i think it should be ok , thanks
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org