Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2023/01/20 07:27:06 UTC

[GitHub] [hudi] abhishekshenoy opened a new issue, #7717: [SUPPORT] org.apache.avro.SchemaParseException: Can't redefine: array When there are Top level variables , Struct and Array[struct] (no complex datatype within array[struct])

abhishekshenoy opened a new issue, #7717:
URL: https://github.com/apache/hudi/issues/7717

   ### Describe the problem you faced
   
   When storing a data structure with the following layout into a copy-on-write table:
   ```
   root
    |-- personDetails: struct (nullable = true)
    |    |-- id: integer (nullable = false)
    |-- idInfo: struct (nullable = true)
    |    |-- adhaarId: integer (nullable = false)
    |-- addressInfo: array (nullable = true)
    |    |-- element: struct (containsNull = true)
    |    |    |-- addressId: integer (nullable = false)
    |-- employmentInfo: array (nullable = true)
    |    |-- element: struct (containsNull = true)
    |    |    |-- employmenCd: integer (nullable = false)
    |-- src_load_ts: timestamp (nullable = false)
    |-- load_ts: timestamp (nullable = false)
    |-- load_dt: date (nullable = false)
   ```
   the first write will succeed, but then subsequent writes will fail with the error included in the stacktrace.
   
   ### To Reproduce
   
   Steps to reproduce the behavior:
   
   ```
     // Imports assumed for this repro; the original snippet omitted them.
     import java.util
     import org.apache.spark.sql.{DataFrame, SparkSession}
     import org.apache.spark.sql.functions.{col, current_timestamp, to_date}
     import org.apache.hudi.DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL
     import org.apache.hudi.common.model.DefaultHoodieRecordPayload
     import org.apache.hudi.keygen.SimpleKeyGenerator

     // Assumed definition; the original value of timestampInCst is not shown.
     val timestampInCst = current_timestamp()

     case class Person(personDetails: PersonDetails,
                       idInfo: IdInfo,
                       addressInfo: Array[AddressInfo] = Array.empty[AddressInfo],
                       employmentInfo: Array[EmploymentInfo] = Array.empty[EmploymentInfo])
   
     case class PersonDetails(id: Int)
   
     case class IdInfo(adhaarId: Int)
   
     case class AddressInfo(addressId: Int)
   
     case class EmploymentInfo(employmenCd: Int)
   
     def maskedParquetBugTest(spark: SparkSession): Unit = {
   
       import spark.implicits._
   
       val personDetails1 = PersonDetails(1)
       val idInfo1 = IdInfo(1)
       val addressInfo1 = AddressInfo(1)
       val employmentInfo1 = EmploymentInfo(1)
   
       val item1 = Person(personDetails1, idInfo1, Array(addressInfo1), Array(employmentInfo1))
       val parquetBugDs = Seq(item1).toDF()
         .withColumn("src_load_ts", current_timestamp())
         .withColumn("load_ts", timestampInCst).withColumn("load_dt", to_date(col("load_ts")))
   
       parquetBugDs.printSchema()
   
       writeHudi(parquetBugDs, "parquet_bug_ds",
         "load_dt",
         "personDetails.id",
         "src_load_ts")
     }
   
     def writeHudi(ds: DataFrame, tableName: String, partitionPath: String, recordKey: String, precombineKey: String): Unit = {
   
       val hoodieConfigs: util.Map[String, String] = new java.util.HashMap[String, String]
       hoodieConfigs.put("hoodie.table.name", tableName)
       hoodieConfigs.put("hoodie.datasource.write.keygenerator.class", classOf[SimpleKeyGenerator].getName)
       hoodieConfigs.put("hoodie.datasource.write.partitionpath.field", partitionPath)
       hoodieConfigs.put("hoodie.datasource.write.recordkey.field", recordKey)
       hoodieConfigs.put("hoodie.datasource.write.precombine.field", precombineKey)
       hoodieConfigs.put("hoodie.payload.ordering.field", precombineKey)
       hoodieConfigs.put("hoodie.index.type", "GLOBAL_SIMPLE")
       hoodieConfigs.put("hoodie.insert.shuffle.parallelism", "1")
       hoodieConfigs.put("hoodie.upsert.shuffle.parallelism", "1")
       hoodieConfigs.put("hoodie.bulkinsert.shuffle.parallelism", "1")
       hoodieConfigs.put("hoodie.delete.shuffle.parallelism", "1")
       hoodieConfigs.put("hoodie.simple.index.update.partition.path", "false")
       hoodieConfigs.put("hoodie.datasource.write.payload.class", classOf[DefaultHoodieRecordPayload].getName)
       hoodieConfigs.put("hoodie.datasource.write.hive_style_partitioning", "false")
       hoodieConfigs.put("hoodie.datasource.write.table.type", COW_TABLE_TYPE_OPT_VAL)
       hoodieConfigs.put("hoodie.datasource.write.row.writer.enable", "true")
       hoodieConfigs.put("hoodie.combine.before.upsert", "true")
       hoodieConfigs.put("hoodie.datasource.write.keygenerator.consistent.logical.timestamp.enabled", "true")
       hoodieConfigs.put("hoodie.schema.on.read.enable", "true")
       hoodieConfigs.put("hoodie.datasource.write.reconcile.schema", "true")
       hoodieConfigs.put("hoodie.datasource.write.operation", "upsert")
   
       ds.toDF().write.format("hudi").
         options(hoodieConfigs).
         mode("append").
         save(s"/tmp/data/hudi/$tableName")
     }
   
     maskedParquetBugTest(spark)
   
     maskedParquetBugTest(spark)
   ```
   
   ### Expected behavior
   
   The second write succeeds.
   
   ### Environment Description
   
   Hudi version (hudi-spark3.1-bundle_2.12) : 0.12.2 , 0.12.1, 0.12.0
   
   Spark version : 3.1.3
   
   Hive version : -
   
   Hadoop version : -
   
   Storage (HDFS/S3/GCS..) : Local storage
   
   Running on Docker? (yes/no) : No
   
   ### Additional context
   The fix mentioned in [#7145](https://github.com/apache/hudi/issues/7145) does not work for us, as we do not have any Array[Struct] nested within another Array[Struct].
   
   ### Stack Trace
   ```
   Driver stacktrace:
   	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2303)
   	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2252)
   	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2251)
   	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
   	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
   	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
   	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2251)
   	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1124)
   	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1124)
   	at scala.Option.foreach(Option.scala:407)
   	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1124)
   	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2490)
   	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2432)
   	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2421)
   	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
   	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:902)
   	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
   	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
   	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
   	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2261)
   	at org.apache.spark.rdd.RDD.count(RDD.scala:1253)
   	at org.apache.hudi.HoodieSparkSqlWriter$.commitAndPerformPostOperations(HoodieSparkSqlWriter.scala:693)
   	at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:345)
   	at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:145)
   	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:46)
   	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
   	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
   	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:90)
   	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
   	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
   	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
   	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
   	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
   	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:132)
   	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:131)
   	at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:989)
   	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
   	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
   	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
   	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
   	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
   	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:989)
   	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:438)
   	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:415)
   	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:293)
   	at com.test.run.hudi.upsert.complex.utils.HudiParquetBugTest$.writeHudi(HudiParquetBugTest.scala:99)
   	at com.test.run.hudi.upsert.complex.utils.HudiParquetBugTest$.maskedParquetBugTest(HudiParquetBugTest.scala:68)
   	at com.test.run.hudi.upsert.complex.utils.HudiParquetBugTest$.delayedEndpoint$com$walmart$hnw$datafoundations$techmod$ingestion$utils$HudiParquetBugTest$1(HudiParquetBugTest.scala:32)
   	at com.test.run.hudi.upsert.complex.utils.HudiParquetBugTest$delayedInit$body.apply(HudiParquetBugTest.scala:14)
   	at scala.Function0.apply$mcV$sp(Function0.scala:39)
   	at scala.Function0.apply$mcV$sp$(Function0.scala:39)
   	at scala.runtime.AbstractFunction0.apply$mcV$sp(AbstractFunction0.scala:17)
   	at scala.App.$anonfun$main$1$adapted(App.scala:80)
   	at scala.collection.immutable.List.foreach(List.scala:431)
   	at scala.App.main(App.scala:80)
   	at scala.App.main$(App.scala:78)
   	at com.test.run.hudi.upsert.complex.utils.HudiParquetBugTest$.main(HudiParquetBugTest.scala:14)
   	at com.test.run.hudi.upsert.complex.utils.HudiParquetBugTest.main(HudiParquetBugTest.scala)
   Caused by: org.apache.hudi.exception.HoodieUpsertException: Error upserting bucketType UPDATE for partition :0
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.handleUpsertPartition(BaseSparkCommitActionExecutor.java:329)
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.lambda$mapPartitionsAsRDD$a3ab3c4$1(BaseSparkCommitActionExecutor.java:244)
   	at org.apache.spark.api.java.JavaRDDLike.$anonfun$mapPartitionsWithIndex$1(JavaRDDLike.scala:102)
   	at org.apache.spark.api.java.JavaRDDLike.$anonfun$mapPartitionsWithIndex$1$adapted(JavaRDDLike.scala:102)
   	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2(RDD.scala:915)
   	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2$adapted(RDD.scala:915)
   	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
   	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
   	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
   	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
   	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
   	at org.apache.spark.rdd.RDD.$anonfun$getOrCompute$1(RDD.scala:386)
   	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1440)
   	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1350)
   	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1414)
   	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1237)
   	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:384)
   	at org.apache.spark.rdd.RDD.iterator(RDD.scala:335)
   	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
   	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
   	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
   	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
   	at org.apache.spark.scheduler.Task.run(Task.scala:131)
   	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:498)
   	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
   	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:501)
   	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
   	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
   	at java.lang.Thread.run(Thread.java:748)
   Caused by: org.apache.hudi.exception.HoodieException: org.apache.avro.SchemaParseException: Can't redefine: array
   	at org.apache.hudi.table.action.commit.HoodieMergeHelper.runMerge(HoodieMergeHelper.java:166)
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.handleUpdateInternal(BaseSparkCommitActionExecutor.java:358)
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.handleUpdate(BaseSparkCommitActionExecutor.java:349)
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.handleUpsertPartition(BaseSparkCommitActionExecutor.java:322)
   	... 28 more
   Caused by: org.apache.avro.SchemaParseException: Can't redefine: array
   	at org.apache.avro.Schema$Names.put(Schema.java:1550)
   	at org.apache.avro.Schema$NamedSchema.writeNameRef(Schema.java:813)
   	at org.apache.avro.Schema$RecordSchema.toJson(Schema.java:975)
   	at org.apache.avro.Schema$ArraySchema.toJson(Schema.java:1137)
   	at org.apache.avro.Schema$UnionSchema.toJson(Schema.java:1242)
   	at org.apache.avro.Schema$RecordSchema.fieldsToJson(Schema.java:1003)
   	at org.apache.avro.Schema$RecordSchema.toJson(Schema.java:987)
   	at org.apache.avro.Schema.toString(Schema.java:426)
   	at org.apache.avro.Schema.toString(Schema.java:398)
   	at org.apache.avro.Schema.toString(Schema.java:389)
   	at org.apache.parquet.avro.AvroReadSupport.setAvroReadSchema(AvroReadSupport.java:69)
   	at org.apache.hudi.io.storage.HoodieParquetReader.getRecordIterator(HoodieParquetReader.java:69)
   	at org.apache.hudi.io.storage.HoodieFileReader.getRecordIterator(HoodieFileReader.java:43)
   	at org.apache.hudi.table.action.commit.HoodieMergeHelper.runMerge(HoodieMergeHelper.java:149)
   ```
   
   
   




Re: [I] [SUPPORT] org.apache.avro.SchemaParseException: Can't redefine: array When there are Top level variables , Struct and Array[struct] (no complex datatype within array[struct]) [hudi]

Posted by "ad1happy2go (via GitHub)" <gi...@apache.org>.
ad1happy2go commented on issue #7717:
URL: https://github.com/apache/hudi/issues/7717#issuecomment-1966705910

   Which Hudi and Spark versions are you using, @Jonathanrodrigr12?




Re: [I] [SUPPORT] org.apache.avro.SchemaParseException: Can't redefine: array When there are Top level variables , Struct and Array[struct] (no complex datatype within array[struct]) [hudi]

Posted by "ad1happy2go (via GitHub)" <gi...@apache.org>.
ad1happy2go commented on issue #7717:
URL: https://github.com/apache/hudi/issues/7717#issuecomment-2049464984

   @Jonathanrodrigr12 Did you also have multiple "value" columns across structs? This may be the same as the issue raised by @junkri in https://github.com/apache/hudi/issues/10983 and not this original issue.




Re: [I] [SUPPORT] org.apache.avro.SchemaParseException: Can't redefine: array When there are Top level variables , Struct and Array[struct] (no complex datatype within array[struct]) [hudi]

Posted by "junkri (via GitHub)" <gi...@apache.org>.
junkri commented on issue #7717:
URL: https://github.com/apache/hudi/issues/7717#issuecomment-2045350745

   @Jonathanrodrigr12 I think I ran into the same problem as you: I can see in your screenshot that the field called "value" is defined multiple times, once as a `decimal` and once as a `struct`. I've just raised a separate issue covering this; please check whether it covers your situation as well: https://github.com/apache/hudi/issues/10983
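   
   For illustration only, here is a minimal Scala sketch of the shape described above (the case-class names are hypothetical, not the reporter's actual schema): a field named `value` appearing once as a decimal and once as a struct within the same top-level record, which is the duplication discussed in the linked issue.
   ```
   // Hypothetical shape only: two different nested types that each expose
   // a field named "value", once as a decimal and once as a struct.
   case class Nested(code: Int)
   case class Price(value: java.math.BigDecimal)   // "value" as a decimal
   case class Attribute(value: Nested)             // "value" as a struct
   case class Record(id: Int, price: Price, attribute: Attribute)
   ```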
   




[GitHub] [hudi] danny0405 commented on issue #7717: [SUPPORT] org.apache.avro.SchemaParseException: Can't redefine: array When there are Top level variables , Struct and Array[struct] (no complex datatype within array[struct])

Posted by "danny0405 (via GitHub)" <gi...@apache.org>.
danny0405 commented on issue #7717:
URL: https://github.com/apache/hudi/issues/7717#issuecomment-1401409620

   @jonvex Can you take a look at this issue?




[GitHub] [hudi] jonvex commented on issue #7717: [SUPPORT] org.apache.avro.SchemaParseException: Can't redefine: array When there are Top level variables , Struct and Array[struct] (no complex datatype within array[struct])

Posted by "jonvex (via GitHub)" <gi...@apache.org>.
jonvex commented on issue #7717:
URL: https://github.com/apache/hudi/issues/7717#issuecomment-1404063334

   > case class Person(personDetails: PersonDetails,
   >                   idInfo: IdInfo,
   >                   addressInfo: Array[AddressInfo] = Array.empty[AddressInfo],
   >                   employmentInfo: Array[EmploymentInfo] = Array.empty[EmploymentInfo])
   
   Should addressInfo be an array of addressId, and employmentInfo be an array of employmenCd?




[GitHub] [hudi] voonhous commented on issue #7717: [SUPPORT] org.apache.avro.SchemaParseException: Can't redefine: array When there are Top level variables , Struct and Array[struct] (no complex datatype within array[struct])

Posted by "voonhous (via GitHub)" <gi...@apache.org>.
voonhous commented on issue #7717:
URL: https://github.com/apache/hudi/issues/7717#issuecomment-1661742304

   A detailed explanation of this error can be found here:
   
   https://github.com/apache/hudi/issues/6849#issuecomment-1661734683
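   
   For readers who do not follow the link, a minimal, hedged illustration (it uses Avro's Java parser directly, and the record name `array` stands in for what older parquet-avro schema conversions can generate for list elements) of why Avro rejects a schema in which two array-of-struct fields end up with element records sharing the same name:
   ```
   import org.apache.avro.{Schema, SchemaParseException}

   // Two array fields whose element records both carry the record name "array".
   // Avro's name registry only allows a full name to be defined once, so the
   // second definition fails with "Can't redefine: array".
   val schemaJson =
     """{"type":"record","name":"Person","fields":[
          {"name":"addressInfo","type":{"type":"array","items":
            {"type":"record","name":"array","fields":[{"name":"addressId","type":"int"}]}}},
          {"name":"employmentInfo","type":{"type":"array","items":
            {"type":"record","name":"array","fields":[{"name":"employmenCd","type":"int"}]}}}
        ]}"""

   try {
     new Schema.Parser().parse(schemaJson)
   } catch {
     case e: SchemaParseException => println(e.getMessage) // prints: Can't redefine: array
   }
   ```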




[GitHub] [hudi] jonvex commented on issue #7717: [SUPPORT] org.apache.avro.SchemaParseException: Can't redefine: array When there are Top level variables , Struct and Array[struct] (no complex datatype within array[struct])

Posted by "jonvex (via GitHub)" <gi...@apache.org>.
jonvex commented on issue #7717:
URL: https://github.com/apache/hudi/issues/7717#issuecomment-1431620989

   Yes. It is not exactly the same issue. What I meant is I think the root cause is the same, and it can be solved by upgrading parquet-avro.




[GitHub] [hudi] danny0405 commented on issue #7717: [SUPPORT] org.apache.avro.SchemaParseException: Can't redefine: array When there are Top level variables , Struct and Array[struct] (no complex datatype within array[struct])

Posted by "danny0405 (via GitHub)" <gi...@apache.org>.
danny0405 commented on issue #7717:
URL: https://github.com/apache/hudi/issues/7717#issuecomment-1404606602

   So cc @jonvex, you mean Spark SQL still fails with the latest master code, and things are not as described in #2657, which said that only the Hive query fails?




[GitHub] [hudi] jonvex commented on issue #7717: [SUPPORT] org.apache.avro.SchemaParseException: Can't redefine: array When there are Top level variables , Struct and Array[struct] (no complex datatype within array[struct])

Posted by "jonvex (via GitHub)" <gi...@apache.org>.
jonvex commented on issue #7717:
URL: https://github.com/apache/hudi/issues/7717#issuecomment-1404164342

   Ok, so this seems to be the same issue as [https://github.com/apache/hudi/issues/2657](https://github.com/apache/hudi/issues/2657)
   
   Here is my runbook that you can use in spark-shell. I used Spark 3.1.3 and Hudi 0.12.2:
   ```
   import org.apache.hudi.QuickstartUtils._
   import scala.collection.JavaConversions._
   import org.apache.spark.sql.SaveMode._
   import org.apache.hudi.DataSourceReadOptions._
   import org.apache.hudi.DataSourceWriteOptions._
   import org.apache.hudi.config.HoodieWriteConfig._
   import org.apache.hudi.common.model.HoodieRecord
   import org.apache.spark.sql.DataFrame
   import java.util.Map
   import java.util.HashMap
   import org.apache.hudi.keygen.SimpleKeyGenerator
   import org.apache.hudi.common.model.DefaultHoodieRecordPayload
   import org.apache.spark.sql.SparkSession
   import java.sql.Timestamp
   import org.apache.spark.sql.functions.{col, current_timestamp, to_date}  // needed for the withColumn/to_date calls below
   import spark.implicits._
   
     val timestampInCst = current_timestamp()
   
     case class PersonDetails(id: Int)
   
     case class IdInfo(adhaarId: Int)
   
     case class AddressInfo(addressId: Int)
   
     //case class EmploymentInfo(employmenCd: Int)
   
   //    case class Person(personDetails: PersonDetails,
   //                     idInfo: IdInfo,
   //                     addressInfo: Array[AddressInfo] = Array.empty[AddressInfo],
   //                     employmentInfo: Array[EmploymentInfo] = Array.empty[EmploymentInfo])
   
       case class Person(personDetails: PersonDetails,
                   idInfo: IdInfo,
                   addressInfo: Array[AddressInfo] = Array.empty[AddressInfo])
     
     
     def writeHudi(ds: DataFrame, tableName: String, partitionPath: String, recordKey: String, precombineKey: String, basePath: String): Unit = {
   
       val hoodieConfigs: java.util.Map[String, String] = new java.util.HashMap[String, String]
       hoodieConfigs.put("hoodie.table.name", tableName)
       hoodieConfigs.put("hoodie.datasource.write.keygenerator.class", classOf[SimpleKeyGenerator].getName)
       hoodieConfigs.put("hoodie.datasource.write.partitionpath.field", partitionPath)
       hoodieConfigs.put("hoodie.datasource.write.recordkey.field", recordKey)
       hoodieConfigs.put("hoodie.datasource.write.precombine.field", precombineKey)
       hoodieConfigs.put("hoodie.payload.ordering.field", precombineKey)
       hoodieConfigs.put("hoodie.index.type", "GLOBAL_SIMPLE")
       hoodieConfigs.put("hoodie.insert.shuffle.parallelism", "1")
       hoodieConfigs.put("hoodie.upsert.shuffle.parallelism", "1")
       hoodieConfigs.put("hoodie.bulkinsert.shuffle.parallelism", "1")
       hoodieConfigs.put("hoodie.delete.shuffle.parallelism", "1")
       hoodieConfigs.put("hoodie.simple.index.update.partition.path", "false")
       hoodieConfigs.put("hoodie.datasource.write.payload.class", classOf[DefaultHoodieRecordPayload].getName)
       hoodieConfigs.put("hoodie.datasource.write.hive_style_partitioning", "false")
       hoodieConfigs.put("hoodie.datasource.write.table.type", COW_TABLE_TYPE_OPT_VAL)
       hoodieConfigs.put("hoodie.datasource.write.row.writer.enable", "true")
       hoodieConfigs.put("hoodie.combine.before.upsert", "true")
       hoodieConfigs.put("hoodie.datasource.write.keygenerator.consistent.logical.timestamp.enabled", "true")
       hoodieConfigs.put("hoodie.schema.on.read.enable", "true")
       hoodieConfigs.put("hoodie.datasource.write.reconcile.schema", "true")
       hoodieConfigs.put("hoodie.datasource.write.operation", "upsert")
   
       ds.toDF().write.format("hudi").
         options(hoodieConfigs).
         mode("append").
         save(basePath)
     }
   
     def maskedParquetBugTest(spark: SparkSession, tableName: String, basePath: String): Unit = {
   
       import spark.implicits._
   
       val personDetails1 = PersonDetails(1)
       val idInfo1 = IdInfo(1)
       val addressInfo1 = AddressInfo(1)
       //val employmentInfo1 = EmploymentInfo(1)
   
       //val item1 = Person(personDetails1, idInfo1, Array(addressInfo1), Array(employmentInfo1))
       val item1 = Person(personDetails1, idInfo1, Array(addressInfo1))
       val parquetBugDs = Seq(item1).toDF().
         withColumn("src_load_ts", current_timestamp()).
         withColumn("load_ts", timestampInCst).
         withColumn("load_dt", to_date(col("load_ts")))
   
       parquetBugDs.printSchema()
   
       writeHudi(parquetBugDs, tableName, "load_dt", "personDetails.id", "src_load_ts", basePath)
   
       spark.read.format("hudi").load(basePath).show(false)
     }
   
   val tableName = "tbl11"
   val basePath = s"/tmp/issue7717/$tableName"
   maskedParquetBugTest(spark, tableName, basePath)
   maskedParquetBugTest(spark, tableName, basePath)
   ```




[GitHub] [hudi] ad1happy2go commented on issue #7717: [SUPPORT] org.apache.avro.SchemaParseException: Can't redefine: array When there are Top level variables , Struct and Array[struct] (no complex datatype within array[struct])

Posted by "ad1happy2go (via GitHub)" <gi...@apache.org>.
ad1happy2go commented on issue #7717:
URL: https://github.com/apache/hudi/issues/7717#issuecomment-1486701108

   @abhishekshenoy We were able to reproduce this error with Spark 3.1.3, but the same code works with Spark 3.2. The Hudi version used for both is the same.
   
   So, as @jonvex mentioned, it's a parquet-avro library version issue. Do you still have a problem, or can we close this issue?




Re: [I] [SUPPORT] org.apache.avro.SchemaParseException: Can't redefine: array When there are Top level variables , Struct and Array[struct] (no complex datatype within array[struct]) [hudi]

Posted by "Jonathanrodrigr12 (via GitHub)" <gi...@apache.org>.
Jonathanrodrigr12 commented on issue #7717:
URL: https://github.com/apache/hudi/issues/7717#issuecomment-1969960684

   I am using Spark 3.4.1 and Hudi 0.14.0.




Re: [I] [SUPPORT] org.apache.avro.SchemaParseException: Can't redefine: array When there are Top level variables , Struct and Array[struct] (no complex datatype within array[struct]) [hudi]

Posted by "Jonathanrodrigr12 (via GitHub)" <gi...@apache.org>.
Jonathanrodrigr12 commented on issue #7717:
URL: https://github.com/apache/hudi/issues/7717#issuecomment-1962262653

   Hi, I have the same problem, but I am using the HoodieMultiTableStreamer.
   **Description**
   I have a lot of Parquet files, and all of them have this struct:
   ![image](https://github.com/apache/hudi/assets/53848036/2c15084d-b17c-471f-8a5d-0b77391a7958)
   
   The first time I run the job on EMR Serverless, the data is saved, but on the second attempt I get this error.
   
   **Expected behavior**
   The second write succeeds.
   
   **Environment Description**
   Hudi: hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar
   Spark version: 3.4.1
   EMR: 6.15.0
   **Stack Trace**
   ```
   org.apache.hudi.exception.HoodieUpsertException: Error upserting bucketType UPDATE for partition :0
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.handleUpsertPartition(BaseSparkCommitActionExecutor.java:342)
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.handleInsertPartition(BaseSparkCommitActionExecutor.java:348)
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.lambda$mapPartitionsAsRDD$a3ab3c4$1(BaseSparkCommitActionExecutor.java:259)
   	at org.apache.spark.api.java.JavaRDDLike.$anonfun$mapPartitionsWithIndex$1(JavaRDDLike.scala:102)
   	at org.apache.spark.api.java.JavaRDDLike.$anonfun$mapPartitionsWithIndex$1$adapted(JavaRDDLike.scala:102)
   	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2(RDD.scala:905)
   	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2$adapted(RDD.scala:905)
   	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
   	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
   	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
   	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
   	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
   	at org.apache.spark.rdd.RDD.$anonfun$getOrCompute$1(RDD.scala:377)
   	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1552)
   	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1462)
   	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1526)
   	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1349)
   	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:375)
   	at org.apache.spark.rdd.RDD.iterator(RDD.scala:326)
   	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
   	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
   	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
   	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
   	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
   	at org.apache.spark.scheduler.Task.run(Task.scala:141)
   	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:563)
   	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1541)
   	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:566)
   	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
   	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
   	at java.lang.Thread.run(Thread.java:750)
   Caused by: org.apache.hudi.exception.HoodieException: org.apache.avro.SchemaParseException: Can't redefine: value
   	at org.apache.hudi.table.action.commit.HoodieMergeHelper.runMerge(HoodieMergeHelper.java:149)
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.handleUpdateInternal(BaseSparkCommitActionExecutor.java:387)
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.handleUpdate(BaseSparkCommitActionExecutor.java:369)
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.handleUpsertPartition(BaseSparkCommitActionExecutor.java:335)
   	... 30 more
   Caused by: org.apache.avro.SchemaParseException: Can't redefine: value
   	at org.apache.avro.Schema$Names.put(Schema.java:1586)
   	at org.apache.avro.Schema$NamedSchema.writeNameRef(Schema.java:844)
   	at org.apache.avro.Schema$RecordSchema.toJson(Schema.java:1011)
   	at org.apache.avro.Schema$UnionSchema.toJson(Schema.java:1278)
   	at org.apache.avro.Schema$RecordSchema.fieldsToJson(Schema.java:1039)
   	at org.apache.avro.Schema$RecordSchema.toJson(Schema.java:1023)
   	at org.apache.avro.Schema$ArraySchema.toJson(Schema.java:1173)
   	at org.apache.avro.Schema$UnionSchema.toJson(Schema.java:1278)
   	at org.apache.avro.Schema$RecordSchema.fieldsToJson(Schema.java:1039)
   	at org.apache.avro.Schema$RecordSchema.toJson(Schema.java:1023)
   	at org.apache.avro.Schema$UnionSchema.toJson(Schema.java:1278)
   	at org.apache.avro.Schema$RecordSchema.fieldsToJson(Schema.java:1039)
   	at org.apache.avro.Schema$RecordSchema.toJson(Schema.java:1023)
   	at org.apache.avro.Schema$UnionSchema.toJson(Schema.java:1278)
   	at org.apache.avro.Schema$RecordSchema.fieldsToJson(Schema.java:1039)
   	at org.apache.avro.Schema$RecordSchema.toJson(Schema.java:1023)
   	at org.apache.avro.Schema.toString(Schema.java:433)
   	at org.apache.avro.Schema.toString(Schema.java:405)
   	at org.apache.avro.Schema.toString(Schema.java:396)
   	at org.apache.parquet.avro.AvroReadSupport.setAvroReadSchema(AvroReadSupport.java:73)
   	at org.apache.hudi.io.storage.HoodieAvroParquetReader.getIndexedRecordIteratorInternal(HoodieAvroParquetReader.java:162)
   	at org.apache.hudi.io.storage.HoodieAvroParquetReader.getIndexedRecordIterator(HoodieAvroParquetReader.java:94)
   	at org.apache.hudi.io.storage.HoodieAvroParquetReader.getRecordIterator(HoodieAvroParquetReader.java:73)
   	at org.apache.hudi.table.action.commit.HoodieMergeHelper.runMerge(HoodieMergeHelper.java:126)
   	... 33 more
   
   Driver stacktrace:
   	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2974) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2910) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2909) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62) ~[scala-library-2.12.15.jar:?]
   	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55) ~[scala-library-2.12.15.jar:?]
   	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49) ~[scala-library-2.12.15.jar:?]
   	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2909) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1263) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1263) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at scala.Option.foreach(Option.scala:407) ~[scala-library-2.12.15.jar:?]
   	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1263) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3173) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3112) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3101) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:1028) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2271) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2366) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.$anonfun$fold$1(RDD.scala:1172) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.fold(RDD.scala:1166) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.DoubleRDDFunctions.$anonfun$sum$1(DoubleRDDFunctions.scala:36) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at scala.runtime.java8.JFunction0$mcD$sp.apply(JFunction0$mcD$sp.java:23) ~[scala-library-2.12.15.jar:?]
   	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.DoubleRDDFunctions.sum(DoubleRDDFunctions.scala:36) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.api.java.JavaDoubleRDD.sum(JavaDoubleRDD.scala:165) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.hudi.utilities.streamer.StreamSync.writeToSink(StreamSync.java:804) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.utilities.streamer.StreamSync.syncOnce(StreamSync.java:446) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.utilities.streamer.HoodieStreamer$StreamSyncService.ingestOnce(HoodieStreamer.java:840) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.utilities.ingestion.HoodieIngestionService.startIngestion(HoodieIngestionService.java:72) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.common.util.Option.ifPresent(Option.java:97) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.utilities.streamer.HoodieStreamer.sync(HoodieStreamer.java:205) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.utilities.streamer.HoodieMultiTableStreamer.sync(HoodieMultiTableStreamer.java:456) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.utilities.streamer.HoodieMultiTableStreamer.main(HoodieMultiTableStreamer.java:281) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) ~[?:1.8.0_392]
   	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) ~[?:1.8.0_392]
   	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) ~[?:1.8.0_392]
   	at java.lang.reflect.Method.invoke(Method.java:498) ~[?:1.8.0_392]
   	at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:1066) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:192) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:215) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:91) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1158) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1167) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   Caused by: org.apache.hudi.exception.HoodieUpsertException: Error upserting bucketType UPDATE for partition :0
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.handleUpsertPartition(BaseSparkCommitActionExecutor.java:342) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.handleInsertPartition(BaseSparkCommitActionExecutor.java:348) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.lambda$mapPartitionsAsRDD$a3ab3c4$1(BaseSparkCommitActionExecutor.java:259) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.spark.api.java.JavaRDDLike.$anonfun$mapPartitionsWithIndex$1(JavaRDDLike.scala:102) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.api.java.JavaRDDLike.$anonfun$mapPartitionsWithIndex$1$adapted(JavaRDDLike.scala:102) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2(RDD.scala:905) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2$adapted(RDD.scala:905) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.$anonfun$getOrCompute$1(RDD.scala:377) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1552) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1462) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1526) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1349) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:375) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.iterator(RDD.scala:326) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.scheduler.Task.run(Task.scala:141) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:563) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1541) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:566) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) ~[?:1.8.0_392]
   	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) ~[?:1.8.0_392]
   	at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_392]
   Caused by: org.apache.hudi.exception.HoodieException: org.apache.avro.SchemaParseException: Can't redefine: value
   	at org.apache.hudi.table.action.commit.HoodieMergeHelper.runMerge(HoodieMergeHelper.java:149) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.handleUpdateInternal(BaseSparkCommitActionExecutor.java:387) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.handleUpdate(BaseSparkCommitActionExecutor.java:369) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.handleUpsertPartition(BaseSparkCommitActionExecutor.java:335) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.handleInsertPartition(BaseSparkCommitActionExecutor.java:348) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.lambda$mapPartitionsAsRDD$a3ab3c4$1(BaseSparkCommitActionExecutor.java:259) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.spark.api.java.JavaRDDLike.$anonfun$mapPartitionsWithIndex$1(JavaRDDLike.scala:102) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.api.java.JavaRDDLike.$anonfun$mapPartitionsWithIndex$1$adapted(JavaRDDLike.scala:102) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2(RDD.scala:905) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2$adapted(RDD.scala:905) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.$anonfun$getOrCompute$1(RDD.scala:377) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1552) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1462) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1526) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1349) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:375) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.iterator(RDD.scala:326) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.scheduler.Task.run(Task.scala:141) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:563) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1541) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:566) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) ~[?:1.8.0_392]
   	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) ~[?:1.8.0_392]
   	at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_392]
   Caused by: org.apache.avro.SchemaParseException: Can't redefine: value
   	at org.apache.avro.Schema$Names.put(Schema.java:1586) ~[avro-1.11.1.jar:1.11.1]
   	at org.apache.avro.Schema$NamedSchema.writeNameRef(Schema.java:844) ~[avro-1.11.1.jar:1.11.1]
   	at org.apache.avro.Schema$RecordSchema.toJson(Schema.java:1011) ~[avro-1.11.1.jar:1.11.1]
   	at org.apache.avro.Schema$UnionSchema.toJson(Schema.java:1278) ~[avro-1.11.1.jar:1.11.1]
   	at org.apache.avro.Schema$RecordSchema.fieldsToJson(Schema.java:1039) ~[avro-1.11.1.jar:1.11.1]
   	at org.apache.avro.Schema$RecordSchema.toJson(Schema.java:1023) ~[avro-1.11.1.jar:1.11.1]
   	at org.apache.avro.Schema$ArraySchema.toJson(Schema.java:1173) ~[avro-1.11.1.jar:1.11.1]
   	at org.apache.avro.Schema$UnionSchema.toJson(Schema.java:1278) ~[avro-1.11.1.jar:1.11.1]
   	at org.apache.avro.Schema$RecordSchema.fieldsToJson(Schema.java:1039) ~[avro-1.11.1.jar:1.11.1]
   	at org.apache.avro.Schema$RecordSchema.toJson(Schema.java:1023) ~[avro-1.11.1.jar:1.11.1]
   	at org.apache.avro.Schema$UnionSchema.toJson(Schema.java:1278) ~[avro-1.11.1.jar:1.11.1]
   	at org.apache.avro.Schema$RecordSchema.fieldsToJson(Schema.java:1039) ~[avro-1.11.1.jar:1.11.1]
   	at org.apache.avro.Schema$RecordSchema.toJson(Schema.java:1023) ~[avro-1.11.1.jar:1.11.1]
   	at org.apache.avro.Schema$UnionSchema.toJson(Schema.java:1278) ~[avro-1.11.1.jar:1.11.1]
   	at org.apache.avro.Schema$RecordSchema.fieldsToJson(Schema.java:1039) ~[avro-1.11.1.jar:1.11.1]
   	at org.apache.avro.Schema$RecordSchema.toJson(Schema.java:1023) ~[avro-1.11.1.jar:1.11.1]
   	at org.apache.avro.Schema.toString(Schema.java:433) ~[avro-1.11.1.jar:1.11.1]
   	at org.apache.avro.Schema.toString(Schema.java:405) ~[avro-1.11.1.jar:1.11.1]
   	at org.apache.avro.Schema.toString(Schema.java:396) ~[avro-1.11.1.jar:1.11.1]
   	at org.apache.parquet.avro.AvroReadSupport.setAvroReadSchema(AvroReadSupport.java:73) ~[org.apache.parquet_parquet-avro-1.12.3.jar:1.12.3]
   	at org.apache.hudi.io.storage.HoodieAvroParquetReader.getIndexedRecordIteratorInternal(HoodieAvroParquetReader.java:162) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.io.storage.HoodieAvroParquetReader.getIndexedRecordIterator(HoodieAvroParquetReader.java:94) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.io.storage.HoodieAvroParquetReader.getRecordIterator(HoodieAvroParquetReader.java:73) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.table.action.commit.HoodieMergeHelper.runMerge(HoodieMergeHelper.java:126) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.handleUpdateInternal(BaseSparkCommitActionExecutor.java:387) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.handleUpdate(BaseSparkCommitActionExecutor.java:369) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.handleUpsertPartition(BaseSparkCommitActionExecutor.java:335) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.handleInsertPartition(BaseSparkCommitActionExecutor.java:348) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.lambda$mapPartitionsAsRDD$a3ab3c4$1(BaseSparkCommitActionExecutor.java:259) ~[hudi-utilities-bundle_2.12-0.14.0-amzn-0.jar:0.14.0-amzn-0]
   	at org.apache.spark.api.java.JavaRDDLike.$anonfun$mapPartitionsWithIndex$1(JavaRDDLike.scala:102) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.api.java.JavaRDDLike.$anonfun$mapPartitionsWithIndex$1$adapted(JavaRDDLike.scala:102) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2(RDD.scala:905) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2$adapted(RDD.scala:905) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.$anonfun$getOrCompute$1(RDD.scala:377) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1552) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1462) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1526) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1349) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:375) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.iterator(RDD.scala:326) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.scheduler.Task.run(Task.scala:141) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:563) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1541) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:566) ~[spark-core_2.12-3.4.1-amzn-2.jar:3.4.1-amzn-2]
   	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) ~[?:1.8.0_392]
   	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
   ```

