You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hudi.apache.org by "Tao Meng (Jira)" <ji...@apache.org> on 2022/03/16 14:14:00 UTC

[jira] [Updated] (HUDI-3646) The Hudi update syntax should not modify the nullability attribute of a column

     [ https://issues.apache.org/jira/browse/HUDI-3646?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Tao Meng updated HUDI-3646:
---------------------------
    Description: 
Currently, when we use Spark SQL to update a Hudi table, Hudi changes the nullability attribute of a column.

eg:
{code:java}
// code placeholder
 val tableName = generateTableName
 val tablePath = s"${new Path(tmp.getCanonicalPath, tableName).toUri.toString}"
 // create table
 spark.sql(
   s"""
      |create table $tableName (
      |  id int,
      |  name string,
      |  price double,
      |  ts long
      |) using hudi
      | location '$tablePath'
      | options (
      |  type = '$tableType',
      |  primaryKey = 'id',
      |  preCombineField = 'ts'
      | )
""".stripMargin)
 // insert data to table
 spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000")
 spark.sql(s"select * from $tableName").printSchema()

 // update data
 spark.sql(s"update $tableName set price = 20 where id = 1")
 spark.sql(s"select * from $tableName").printSchema() {code}
 

 |-- _hoodie_commit_time: string (nullable = true)
 |-- _hoodie_commit_seqno: string (nullable = true)
 |-- _hoodie_record_key: string (nullable = true)
 |-- _hoodie_partition_path: string (nullable = true)
 |-- _hoodie_file_name: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 *|-- price: double (nullable = true)*
 |-- ts: long (nullable = true)

 

 |-- _hoodie_commit_time: string (nullable = true)
 |-- _hoodie_commit_seqno: string (nullable = true)
 |-- _hoodie_record_key: string (nullable = true)
 |-- _hoodie_partition_path: string (nullable = true)
 |-- _hoodie_file_name: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 *|-- price: double (nullable = false )*
 |-- ts: long (nullable = true)
 
In the second schema (printed after the update), the nullable attribute of price has been changed from true to false. This is not the result we want.

  was:
now, when we use sparksql to update hudi table, we find that  hudi will change the nullability attribute of a column

eg:

```

test("Test Update Table") {
withTempDir { tmp =>
Seq("cow", "mor").foreach {tableType =>
val tableName = generateTableName
val tablePath = s"${new Path(tmp.getCanonicalPath, tableName).toUri.toString}"
// create table
spark.sql(
s"""
|create table $tableName (
| id int,
| name string,
| price double,
| ts long
|) using hudi
| location '$tablePath'
| options (
| type = '$tableType',
| primaryKey = 'id',
| preCombineField = 'ts'
| )
""".stripMargin)
// insert data to table
spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000")
spark.sql(s"select * from $tableName").printSchema()
checkAnswer(s"select id, name, price, ts from $tableName")(
Seq(1, "a1", 10.0, 1000)
)

// update data
spark.sql(s"update $tableName set price = 20 where id = 1")
spark.sql(s"select * from $tableName").printSchema()
checkAnswer(s"select id, name, price, ts from $tableName")(
Seq(1, "a1", 20.0, 1000)
)

// update data
spark.sql(s"update $tableName set price = price * 2 where id = 1")
checkAnswer(s"select id, name, price, ts from $tableName")(
Seq(1, "a1", 40.0, 1000)
)
}
}
}
}

```

 |-- _hoodie_commit_time: string (nullable = true)
 |-- _hoodie_commit_seqno: string (nullable = true)
 |-- _hoodie_record_key: string (nullable = true)
 |-- _hoodie_partition_path: string (nullable = true)
 |-- _hoodie_file_name: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 *|-- price: double (nullable = true)*
 |-- ts: long (nullable = true)

 
|-- _hoodie_commit_time: string (nullable = true)
 |-- _hoodie_commit_seqno: string (nullable = true)
 |-- _hoodie_record_key: string (nullable = true)
 |-- _hoodie_partition_path: string (nullable = true)
 |-- _hoodie_file_name: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 *|-- price: double (nullable = false)*
 |-- ts: long (nullable = true)

the nullable attribute of price has been changed to false, This is not the result we want


> The Hudi update syntax should not modify the nullability attribute of a column
> ------------------------------------------------------------------------------
>
>                 Key: HUDI-3646
>                 URL: https://issues.apache.org/jira/browse/HUDI-3646
>             Project: Apache Hudi
>          Issue Type: Bug
>          Components: spark-sql
>    Affects Versions: 0.10.1
>         Environment: spark3.1.2
>            Reporter: Tao Meng
>            Priority: Minor
>             Fix For: 0.12.0
>
>
> Currently, when we use Spark SQL to update a Hudi table, Hudi changes the nullability attribute of a column.
> eg:
> {code:java}
> // code placeholder
>  val tableName = generateTableName
>  val tablePath = s"${new Path(tmp.getCanonicalPath, tableName).toUri.toString}"
>  // create table
>  spark.sql(
>    s"""
>       |create table $tableName (
>       |  id int,
>       |  name string,
>       |  price double,
>       |  ts long
>       |) using hudi
>       | location '$tablePath'
>       | options (
>       |  type = '$tableType',
>       |  primaryKey = 'id',
>       |  preCombineField = 'ts'
>       | )
> """.stripMargin)
>  // insert data to table
>  spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000")
>  spark.sql(s"select * from $tableName").printSchema()
>  // update data
>  spark.sql(s"update $tableName set price = 20 where id = 1")
>  spark.sql(s"select * from $tableName").printSchema() {code}
>  
>  |-- _hoodie_commit_time: string (nullable = true)
>  |-- _hoodie_commit_seqno: string (nullable = true)
>  |-- _hoodie_record_key: string (nullable = true)
>  |-- _hoodie_partition_path: string (nullable = true)
>  |-- _hoodie_file_name: string (nullable = true)
>  |-- id: integer (nullable = true)
>  |-- name: string (nullable = true)
>  *|-- price: double (nullable = true)*
>  |-- ts: long (nullable = true)
>  
>  |-- _hoodie_commit_time: string (nullable = true)
>  |-- _hoodie_commit_seqno: string (nullable = true)
>  |-- _hoodie_record_key: string (nullable = true)
>  |-- _hoodie_partition_path: string (nullable = true)
>  |-- _hoodie_file_name: string (nullable = true)
>  |-- id: integer (nullable = true)
>  |-- name: string (nullable = true)
>  *|-- price: double (nullable = false )*
>  |-- ts: long (nullable = true)
>  
> In the second schema (printed after the update), the nullable attribute of price has been changed from true to false. This is not the result we want.



--
This message was sent by Atlassian Jira
(v8.20.1#820001)