Posted to commits@hudi.apache.org by GitBox <gi...@apache.org> on 2022/10/09 11:59:48 UTC

[GitHub] [hudi] yhyyz opened a new issue, #6900: [SUPPORT]

yhyyz opened a new issue, #6900:
URL: https://github.com/apache/hudi/issues/6900

   #### Environment
   Hudi 0.11.0
   Spark 3.2.1
   
   #### Job Info
   Hudi table type: MOR
   Spark Structured Streaming job using foreachBatch, where multiple concurrent tasks write to different Hudi tables. An error occurs after the job runs for a few hours.
   
   #### code
   ```scala
   object Debezium2Hudi {
   
     case class TableInfoList(tableInfo: List[TableInfo])
     private val log = LoggerFactory.getLogger("debezium2hudi")
     def main(args: Array[String]): Unit = {
       log.info(args.mkString)
       // Set log4j level to warn
       Logger.getLogger("org").setLevel(Level.WARN)
       //    System.setProperty("HADOOP_USER_NAME", "hadoop")
       val params = Config.parseConfig(Debezium2Hudi, args)
       val tableInfoList = JsonUtil.mapper.readValue(params.tableInfoJson, classOf[TableInfoList])
       // init spark session
       val ss = SparkHelper.getSparkSession(params.env)
       import ss.implicits._
       val df = ss
         .readStream
         .format("kafka")
         .option("kafka.bootstrap.servers", params.brokerList)
         .option("subscribe", params.sourceTopic)
         .option("startingOffsets", params.startPos)
         .option("failOnDataLoss", false)
         .option("maxOffsetsPerTrigger",params.maxOffset.toLong)
         .option("kafka.consumer.commit.groupid", params.consumerGroup)
         .load()
         .repartition(Integer.valueOf(params.partitionNum))
   
       ss.streams.addListener(new StreamingQueryListener {
         override def onQueryStarted(event: StreamingQueryListener.QueryStartedEvent): Unit = log.debug(s"QueryStarted [id = ${event.id}, name = ${event.name}, runId = ${event.runId}]")
   
         override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = log.warn(s"QueryProgress ${event.progress}")
   
         override def onQueryTerminated(event: StreamingQueryListener.QueryTerminatedEvent): Unit = log.debug(s"QueryTerminated [id = ${event.id}, runId = ${event.runId}, error = ${event.exception}]")
       })
   
       val listener = new KafkaOffsetCommitterListener()
       ss.streams.addListener(listener)
   
       val pool = Executors.newFixedThreadPool(50)
       implicit val xc: ExecutionContextExecutor = ExecutionContext.fromExecutor(pool)
   
       val partitionFormat: (String => String) = (arg: String) => {
         val formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'")
         val parFormatter = DateTimeFormatter.ofPattern("yyyyMM")
         parFormatter.format(formatter.parse(arg))
       }
       val sqlPartitionFunc = udf(partitionFormat)
   
       val ds = df.selectExpr("CAST(value AS STRING)").as[String]
       val query = ds
         .writeStream
         .queryName("debezium2hudi")
         .option("checkpointLocation", params.checkpointDir)
         // if set 0, as fast as possible
         .trigger(Trigger.ProcessingTime(params.trigger + " seconds"))
         .foreachBatch { (batchDF: Dataset[String], batchId: Long) =>
           log.warn("current batch: "+batchId.toString)
   
           val newsDF = batchDF.map(cdc => DebeziumParser.apply().debezium2Hudi(cdc))
             .filter(_ != null)
           if (!newsDF.isEmpty) {
              val tasks = scala.collection.mutable.ListBuffer[Future[Unit]]() // mutable so appends below are retained
             for (tableInfo <- tableInfoList.tableInfo) {
               val insertORUpsertDF = newsDF
                 .filter($"database" === tableInfo.database && $"table" === tableInfo.table)
                 .filter($"operationType" === HudiOP.UPSERT || $"operationType" === HudiOP.INSERT)
                 .select($"data".as("jsonData"))
               if (!insertORUpsertDF.isEmpty) {
                 val json_schema = ss.read.json(insertORUpsertDF.select("jsonData").as[String]).schema
                 val cdcDF = insertORUpsertDF.select(from_json($"jsonData", json_schema).as("cdc_data"))
                 val cdcPartitionDF = cdcDF.select($"cdc_data.*")
                   .withColumn(tableInfo.hudiPartitionField, sqlPartitionFunc(col(tableInfo.partitionTimeColumn)))
                 params.concurrent match {
                   case "true" => {
                     val runTask = HudiWriteTask.run(cdcPartitionDF, params, tableInfo)(xc)
                      tasks += runTask
                   }
                   case _ => ....
                 }
               }
             }
             if (params.concurrent == "true" && tasks.nonEmpty) {
                Await.result(Future.sequence(tasks.toList), Duration(60, MINUTES))
               ()
             }
   
           }
         }.start()
       query.awaitTermination()
     }
   
   }
   
   ```
   
   #### ERROR
   ```
   22/10/09 08:03:12 INFO ShuffleBlockFetcherIterator: Started 8 remote fetches in 0 ms
   22/10/09 08:03:12 INFO Executor: Finished task 0.0 in stage 25180.0 (TID 172844). 2512 bytes result sent to driver
   22/10/09 08:03:12 ERROR Executor: Exception in task 0.0 in stage 25178.0 (TID 172843)
   org.apache.hudi.exception.HoodieIOException: Failed to read MARKERS file s3://app-util/hudi-bloom/opti-tmp-102/cdc_test_db/dhdata_02/.hoodie/.temp/20221009080301290/MARKERS0
   	at org.apache.hudi.common.util.MarkerUtils.readMarkersFromFile(MarkerUtils.java:210)
   	at org.apache.hudi.common.util.MarkerUtils.lambda$readTimelineServerBasedMarkersFromFileSystem$141c8e72$1(MarkerUtils.java:185)
   	at org.apache.hudi.common.fs.FSUtils.lambda$parallelizeFilesProcess$1f9929d5$1(FSUtils.java:736)
   	at org.apache.hudi.client.common.HoodieSparkEngineContext.lambda$mapToPair$786cea6a$1(HoodieSparkEngineContext.java:149)
   	at org.apache.spark.api.java.JavaPairRDD$.$anonfun$pairFunToScalaFun$1(JavaPairRDD.scala:1073)
   	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
   	at scala.collection.Iterator.foreach(Iterator.scala:943)
   	at scala.collection.Iterator.foreach$(Iterator.scala:943)
   	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
   	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
   	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
   	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
   	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
   	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
   	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
   	at scala.collection.AbstractIterator.to(Iterator.scala:1431)
   	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
   	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
   	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1431)
   	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
   	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
   	at scala.collection.AbstractIterator.toArray(Iterator.scala:1431)
   	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1030)
   	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2255)
   	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
   	at org.apache.spark.scheduler.Task.run(Task.scala:133)
   	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
   	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1474)
   	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
   	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
   	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
   	at java.lang.Thread.run(Thread.java:750)
   Caused by: java.io.FileNotFoundException: No such file or directory 's3://app-util/hudi-bloom/opti-tmp-102/cdc_test_db/dhdata_02/.hoodie/.temp/20221009080301290/MARKERS0'
   	at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.getFileStatus(S3NativeFileSystem.java:521)
   	at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.open(S3NativeFileSystem.java:932)
   	at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.open(S3NativeFileSystem.java:924)
   	at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:906)
   	at com.amazon.ws.emr.hadoop.fs.EmrFileSystem.open(EmrFileSystem.java:194)
   	at org.apache.hudi.common.util.MarkerUtils.readMarkersFromFile(MarkerUtils.java:207)
   	... 31 more
   22/10/09 08:03:13 INFO YarnCoarseGrainedExecutorBackend: Got assigned task 172858
   22/10/09 08:03:13 INFO Executor: Running task 6.0 in stage 25185.0 (TID 172858)
   ```
   
   
   
   




[GitHub] [hudi] nsivabalan commented on issue #6900: [SUPPORT]Hudi Failed to read MARKERS file

Posted by GitBox <gi...@apache.org>.
nsivabalan commented on issue #6900:
URL: https://github.com/apache/hudi/issues/6900#issuecomment-1284357523

   Another suggestion: if you feel that running the cleaner inline is causing a perf hit, you can relax the cleaner to run only once every N commits using `hoodie.clean.max.commits`. With `hoodie.clean.max.commits=N`, even the check for whether anything needs to be cleaned only happens once every N commits.
   
   
   Do not confuse this with `hoodie.cleaner.commits.retained`. Let's say you set `hoodie.cleaner.commits.retained=10`, but `hoodie.clean.max.commits=2`.
   
   Every 2 commits, the Hudi cleaner will check whether there are more than 10 commits in the active timeline and clean the data files. If you are OK with some leeway, you can increase `hoodie.clean.max.commits` to 5 or 10, so clean scheduling is only even attempted once every 5 (or 10) commits. A minimal sketch of the two knobs together follows.
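   
   For illustration only, here is a minimal sketch of passing both cleaner settings as Hudi write options; the DataFrame and base path in the usage comment are placeholders, not values from this thread:
   
   ```scala
   // Hypothetical sketch (not the reporter's exact config): retention vs. clean-scheduling frequency.
   val cleanerOpts: Map[String, String] = Map(
     "hoodie.cleaner.commits.retained" -> "10", // data referenced by the last 10 commits is retained
     "hoodie.clean.max.commits"        -> "5"   // clean scheduling is only attempted once every 5 commits
   )
   // e.g. df.write.format("hudi").options(cleanerOpts). ... .save(basePath)  // df/basePath are placeholders
   ```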
   
   
   




[GitHub] [hudi] nsivabalan commented on issue #6900: [SUPPORT]Hudi Failed to read MARKERS file

Posted by GitBox <gi...@apache.org>.
nsivabalan commented on issue #6900:
URL: https://github.com/apache/hudi/issues/6900#issuecomment-1284354065

   And yes, if you are using a different process to do compaction and cleaning, you definitely need to configure a lock provider. If not, you might end up with exceptions. A sketch of such a configuration is shown below.
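   
   For reference, a minimal sketch of what a multi-writer lock-provider configuration can look like (ZooKeeper-based here; the host, port, key, and paths are placeholders, and other lock providers exist):
   
   ```scala
   // Hypothetical sketch: multi-writer settings with a ZooKeeper-based lock provider.
   // All endpoint values below are placeholders.
   val lockOpts: Map[String, String] = Map(
     "hoodie.write.concurrency.mode"         -> "optimistic_concurrency_control",
     "hoodie.cleaner.policy.failed.writes"   -> "LAZY",
     "hoodie.write.lock.provider"            -> "org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider",
     "hoodie.write.lock.zookeeper.url"       -> "zk-host",     // placeholder
     "hoodie.write.lock.zookeeper.port"      -> "2181",
     "hoodie.write.lock.zookeeper.lock_key"  -> "dhdata_02",   // placeholder, typically the table name
     "hoodie.write.lock.zookeeper.base_path" -> "/hudi/locks"  // placeholder
   )
   // Pass the same lock options to both the streaming writer and the offline compaction/clean job.
   ```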




[GitHub] [hudi] nsivabalan commented on issue #6900: [SUPPORT]Hudi Failed to read MARKERS file

Posted by GitBox <gi...@apache.org>.
nsivabalan commented on issue #6900:
URL: https://github.com/apache/hudi/issues/6900#issuecomment-1286427252

   We might need to inspect the timeline to see what's happening. Maybe the metadata table is corrupt; we might need to inspect that.
   
   Can you run our validation tool against your table and let us know what you see?
   
   https://github.com/apache/hudi/blob/master/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java
   
   Enable all of these:
   - `--validate-latest-file-slices`: validate latest file slices for all partitions.
   - `--validate-latest-base-files`: validate latest base files for all partitions.
   - `--validate-all-file-groups`: validate all file groups, and all file slices within file groups.
   
   
   




[GitHub] [hudi] nsivabalan commented on issue #6900: [SUPPORT]Hudi Failed to read MARKERS file

Posted by GitBox <gi...@apache.org>.
nsivabalan commented on issue #6900:
URL: https://github.com/apache/hudi/issues/6900#issuecomment-1287880865

   Yeah, it looks like the metadata table is out of sync with the data table. You may need to disable metadata for a few commits and then re-enable it (a minimal sketch follows). I will follow up to see how we ended up in this state.
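   
   A minimal sketch of that temporary toggle on the writer, assuming the rest of the write options stay unchanged:
   
   ```scala
   // Hypothetical sketch: temporarily disable the metadata table on the writer,
   // let a few commits go through, then set it back to "true" so it gets rebuilt.
   val metadataOpts: Map[String, String] = Map(
     "hoodie.metadata.enable" -> "false" // flip back to "true" after a few commits
   )
   ```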
   




[GitHub] [hudi] yihua commented on issue #6900: [SUPPORT]Hudi Failed to read MARKERS file

Posted by GitBox <gi...@apache.org>.
yihua commented on issue #6900:
URL: https://github.com/apache/hudi/issues/6900#issuecomment-1278619277

   @yhyyz to mitigate the issue, have you tried switching to direct markers by setting `hoodie.write.markers.type=direct`, to see if the structured streaming job can make progress?
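   
   For context, a minimal sketch of setting that option on the write (the remaining writer options are omitted, and the dataset/base path in the usage comment are placeholders):
   
   ```scala
   // Hypothetical sketch: use direct markers (written straight to storage)
   // instead of timeline-server-based markers.
   val markerOpts: Map[String, String] = Map(
     "hoodie.write.markers.type" -> "DIRECT"
   )
   // e.g. ds.writeStream.format("hudi").options(markerOpts). ... .start(basePath)  // ds/basePath are placeholders
   ```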




[GitHub] [hudi] umehrot2 commented on issue #6900: [SUPPORT]Hudi Failed to read MARKERS file

Posted by GitBox <gi...@apache.org>.
umehrot2 commented on issue #6900:
URL: https://github.com/apache/hudi/issues/6900#issuecomment-1286257672

   Latest stack trace, when using structured streaming with the following properties:
   ```
   hoodie.datasource.hive_sync.enable=false
   hoodie.upsert.shuffle.parallelism=20
   hoodie.insert.shuffle.parallelism=20
   hoodie.keep.min.commits=6
   hoodie.keep.max.commits=7
   hoodie.parquet.small.file.limit=52428800
   hoodie.index.type=GLOBAL_BLOOM
   hoodie.datasource.write.payload.class=org.apache.hudi.common.model.DefaultHoodieRecordPayload
   hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.NonpartitionedKeyGenerator
   hoodie.metadata.enable=true
   hoodie.cleaner.commits.retained=3
   hoodie.clean.max.commits=5
   hoodie.clean.async=false
   hoodie.clean.automatic=true
   hoodie.archive.async=false
   hoodie.datasource.compaction.async.enable=true
   hoodie.write.markers.type=DIRECT
   hoodie.embed.timeline.server=true
   hoodie.embed.timeline.server.async=false
   hoodie.compact.schedule.inline=false
   hoodie.compact.inline.max.delta.commits=2
   ```
   
   Stacktrace:
   ```
   22/10/19 15:36:18 ERROR UpsertPartitioner: Error trying to compute average bytes/record 
   org.apache.hudi.exception.HoodieIOException: Could not read commit details from s3://app-util/hudi-bloom/multi-stream-105/cdc_test_db/dhdata_15/.hoodie/20221019152438682.commit
       at org.apache.hudi.common.table.timeline.HoodieActiveTimeline.readDataFromPath(HoodieActiveTimeline.java:761)
       at org.apache.hudi.common.table.timeline.HoodieActiveTimeline.getInstantDetails(HoodieActiveTimeline.java:266)
       at org.apache.hudi.common.table.timeline.HoodieDefaultTimeline.getInstantDetails(HoodieDefaultTimeline.java:372)
       at org.apache.hudi.table.action.commit.UpsertPartitioner.averageBytesPerRecord(UpsertPartitioner.java:373)
       at org.apache.hudi.table.action.commit.UpsertPartitioner.assignInserts(UpsertPartitioner.java:162)
       at org.apache.hudi.table.action.commit.UpsertPartitioner.<init>(UpsertPartitioner.java:95)
       at org.apache.hudi.table.action.deltacommit.SparkUpsertDeltaCommitPartitioner.<init>(SparkUpsertDeltaCommitPartitioner.java:50)
       at org.apache.hudi.table.action.deltacommit.BaseSparkDeltaCommitActionExecutor.getUpsertPartitioner(BaseSparkDeltaCommitActionExecutor.java:69)
       at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.getPartitioner(BaseSparkCommitActionExecutor.java:217)
       at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.execute(BaseSparkCommitActionExecutor.java:163)
       at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.execute(BaseSparkCommitActionExecutor.java:85)
       at org.apache.hudi.table.action.commit.BaseWriteHelper.write(BaseWriteHelper.java:57)
       at org.apache.hudi.table.action.deltacommit.SparkUpsertDeltaCommitActionExecutor.execute(SparkUpsertDeltaCommitActionExecutor.java:46)
       at org.apache.hudi.table.HoodieSparkMergeOnReadTable.upsert(HoodieSparkMergeOnReadTable.java:89)
       at org.apache.hudi.table.HoodieSparkMergeOnReadTable.upsert(HoodieSparkMergeOnReadTable.java:76)
       at org.apache.hudi.client.SparkRDDWriteClient.upsert(SparkRDDWriteClient.java:157)
       at org.apache.hudi.DataSourceUtils.doWriteOperation(DataSourceUtils.java:213)
       at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:304)
       at org.apache.hudi.HoodieStreamingSink.$anonfun$addBatch$2(HoodieStreamingSink.scala:91)
       at scala.util.Try$.apply(Try.scala:213)
       at org.apache.hudi.HoodieStreamingSink.$anonfun$addBatch$1(HoodieStreamingSink.scala:90)
       at org.apache.hudi.HoodieStreamingSink.retry(HoodieStreamingSink.scala:166)
       at org.apache.hudi.HoodieStreamingSink.addBatch(HoodieStreamingSink.scala:89)
       at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$17(MicroBatchExecution.scala:600)
       at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:107)
       at org.apache.spark.sql.execution.SQLExecution$.withTracker(SQLExecution.scala:232)
       at org.apache.spark.sql.execution.SQLExecution$.executeQuery$1(SQLExecution.scala:110)
       at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:135)
       at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:107)
       at org.apache.spark.sql.execution.SQLExecution$.withTracker(SQLExecution.scala:232)
       at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:135)
       at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:253)
       at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:134)
       at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
       at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:68)
       at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$16(MicroBatchExecution.scala:598)
       at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:375)
       at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:373)
       at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:69)
       at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runBatch(MicroBatchExecution.scala:598)
       at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$2(MicroBatchExecution.scala:228)
       at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
       at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:375)
       at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:373)
       at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:69)
       at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$1(MicroBatchExecution.scala:193)
       at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:57)
       at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:187)
       at org.apache.spark.sql.execution.streaming.StreamExecution.$anonfun$runStream$1(StreamExecution.scala:303)
       at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
       at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
       at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:286)
       at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:209)
   Caused by: java.io.FileNotFoundException: No such file or directory 's3://app-util/hudi-bloom/multi-stream-105/cdc_test_db/dhdata_15/.hoodie/20221019152438682.commit'
       at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.getFileStatus(S3NativeFileSystem.java:521)
       at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.open(S3NativeFileSystem.java:932)
       at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.open(S3NativeFileSystem.java:924)
       at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:906)
       at com.amazon.ws.emr.hadoop.fs.EmrFileSystem.open(EmrFileSystem.java:194)
       at org.apache.hudi.common.fs.HoodieWrapperFileSystem.open(HoodieWrapperFileSystem.java:460)
       at org.apache.hudi.common.table.timeline.HoodieActiveTimeline.readDataFromPath(HoodieActiveTimeline.java:758)
       ... 52 more
   ```




[GitHub] [hudi] danny0405 commented on issue #6900: [SUPPORT]Hudi Failed to read MARKERS file

Posted by "danny0405 (via GitHub)" <gi...@apache.org>.
danny0405 commented on issue #6900:
URL: https://github.com/apache/hudi/issues/6900#issuecomment-1650859794

   > We're on EMR 6.11.0
   
   What Hudi version are you using?




[GitHub] [hudi] yhyyz commented on issue #6900: [SUPPORT]Hudi Failed to read MARKERS file

Posted by GitBox <gi...@apache.org>.
yhyyz commented on issue #6900:
URL: https://github.com/apache/hudi/issues/6900#issuecomment-1279029022

   @yihua 
   1. Tested with `hoodie.write.markers.type=DIRECT`, `hoodie.embed.timeline.server=false`, and `hoodie.clean.automatic=false`: no markers issue, but a new error `org.apache.hudi.exception.HoodieIOException: Could not read commit details from s3://app-util/hudi-bloom/emrfs-disalbe-cache-direct-101/cdc_test_db/dhdata_02/.hoodie/20221014024801458.deltacommit.requested` occurred. Application log link: https://dxs9dnjebzm6y.cloudfront.net/tmp/application-log.txt
   
   2. I run offline compaction and cleaning in an independent job, and set `hoodie.compact.inline=false` and `hoodie.clean.automatic=false` in the streaming job. Do I have to use a lock provider?
   




[GitHub] [hudi] yhyyz commented on issue #6900: [SUPPORT]Hudi Failed to read MARKERS file

Posted by GitBox <gi...@apache.org>.
yhyyz commented on issue #6900:
URL: https://github.com/apache/hudi/issues/6900#issuecomment-1286415788

   @nsivabalan Thanks for your help.
   1. Using structured streaming with multiple stream queries instead of `forEachBatch`, with the following properties, the application has been running for 19 hours without any errors. But if `hoodie.embed.timeline.server=true` is set, the error `UpsertPartitioner: Error trying to compute average bytes/record,... Caused by: java.io.FileNotFoundException: No such file or directory ..../.hoodie/....commit` occurs.
   ```
   hoodie.datasource.hive_sync.enable=false
   hoodie.upsert.shuffle.parallelism=20
   hoodie.insert.shuffle.parallelism=20
   hoodie.keep.min.commits=6
   hoodie.keep.max.commits=7
   hoodie.parquet.small.file.limit=52428800
   hoodie.index.type=GLOBAL_BLOOM
   hoodie.datasource.write.payload.class=org.apache.hudi.common.model.DefaultHoodieRecordPayload
   hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.NonpartitionedKeyGenerator
   hoodie.metadata.enable=true
   hoodie.cleaner.commits.retained=3
   hoodie.clean.async=false
   hoodie.clean.automatic=true
   hoodie.archive.async=false
   hoodie.datasource.compaction.async.enable=true
   hoodie.write.markers.type=DIRECT
   hoodie.embed.timeline.server=false
   hoodie.embed.timeline.server.async=false
   ```
   2. Using `forEachBatch` with multiple threads, enabling inline compaction instead of offline compaction, and with the following properties, the error `UpsertPartitioner: Error trying to compute average bytes/record,... Caused by: java.io.FileNotFoundException: No such file or directory ..../.hoodie/....commit` occurred, but the application still runs. I will set `hoodie.embed.timeline.server=false` and test again; I will share any new information here.
   ```
   hoodie.datasource.hive_sync.enable=false
   hoodie.upsert.shuffle.parallelism=20
   hoodie.insert.shuffle.parallelism=20
   hoodie.keep.min.commits=6
   hoodie.keep.max.commits=7
   hoodie.parquet.small.file.limit=52428800
   hoodie.index.type=GLOBAL_BLOOM
   hoodie.datasource.write.payload.class=org.apache.hudi.common.model.DefaultHoodieRecordPayload
   hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.NonpartitionedKeyGenerator
   hoodie.metadata.enable=true
   hoodie.cleaner.commits.retained=3
   hoodie.clean.max.commits=5
   hoodie.clean.async=false
   hoodie.clean.automatic=true
   hoodie.archive.async=false
   hoodie.compact.inline=true
   hoodie.datasource.compaction.async.enable=false
   hoodie.write.markers.type=DIRECT
   hoodie.embed.timeline.server=true
   hoodie.embed.timeline.server.async=false
   hoodie.compact.schedule.inline=false
   hoodie.compact.inline.max.delta.commits=2
   ```




[GitHub] [hudi] yhyyz commented on issue #6900: [SUPPORT]Hudi Failed to read MARKERS file

Posted by GitBox <gi...@apache.org>.
yhyyz commented on issue #6900:
URL: https://github.com/apache/hudi/issues/6900#issuecomment-1287161190

   I ran the following command; the resulting log file: https://dxs9dnjebzm6y.cloudfront.net/tmp/metadata-table-validator.log
   ```
   basePath=s3://xxxxx/hudi-bloom/multi-thread-105/cdc_test_db/dhdata_13/
   spark-submit \
   --class org.apache.hudi.utilities.HoodieMetadataTableValidator \
   --master yarn \
   --driver-memory 2g \
   --executor-memory 4g \
   /usr/lib/hudi/hudi-utilities-bundle_2.12-0.11.0-amzn-0.jar \
   --base-path $basePath \
   --validate-latest-file-slices \
   --validate-latest-base-files \
   --validate-all-file-groups
   ```




[GitHub] [hudi] nsivabalan commented on issue #6900: [SUPPORT]Hudi Failed to read MARKERS file

Posted by GitBox <gi...@apache.org>.
nsivabalan commented on issue #6900:
URL: https://github.com/apache/hudi/issues/6900#issuecomment-1284357653

   Hope that helps. 




[GitHub] [hudi] nsivabalan commented on issue #6900: [SUPPORT]Hudi Failed to read MARKERS file

Posted by "nsivabalan (via GitHub)" <gi...@apache.org>.
nsivabalan commented on issue #6900:
URL: https://github.com/apache/hudi/issues/6900#issuecomment-1424865962

   Hey @yhyyz: any updates for us?
   We might close this due to no activity. Let us know if you need more help from us.




[GitHub] [hudi] zaza commented on issue #6900: [SUPPORT]Hudi Failed to read MARKERS file

Posted by "zaza (via GitHub)" <gi...@apache.org>.
zaza commented on issue #6900:
URL: https://github.com/apache/hudi/issues/6900#issuecomment-1651949365

   0.13.1 as in org.apache.hudi:hudi-spark3.3-bundle_2.12:0.13.1




[GitHub] [hudi] nsivabalan commented on issue #6900: [SUPPORT]Hudi Failed to read MARKERS file

Posted by GitBox <gi...@apache.org>.
nsivabalan commented on issue #6900:
URL: https://github.com/apache/hudi/issues/6900#issuecomment-1284352768

   This is similar to another issue I have debugged recently. As of now, you can't disable auto clean, and that's a limitation.
   
   If you disable auto clean, cleaning essentially doesn't happen. But archival assumes that cleaning is enabled and keeps making progress, which results in dangling data files: timeline files get archived, but the data files are left dangling since the cleaner could not run.
   
   If you wish to run the cleaner as a separate process, can you disable archival as well in the regular writer? (A minimal sketch follows.)
   I have filed a tracking JIRA here: https://issues.apache.org/jira/browse/HUDI-5054
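   
   A minimal sketch of what the regular writer's options could look like when both cleaning and archival move to a separate process; treat `hoodie.archive.automatic` as an assumption and verify the key against your Hudi version:
   
   ```scala
   // Hypothetical sketch: turn off automatic cleaning and automatic archival in the
   // regular writer, so a separate table-service job (with a lock provider) runs them.
   val tableServiceOpts: Map[String, String] = Map(
     "hoodie.clean.automatic"   -> "false",
     "hoodie.archive.automatic" -> "false" // assumed archival toggle; verify for your Hudi version
   )
   ```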
   




[GitHub] [hudi] danny0405 commented on issue #6900: [SUPPORT]Hudi Failed to read MARKERS file

Posted by GitBox <gi...@apache.org>.
danny0405 commented on issue #6900:
URL: https://github.com/apache/hudi/issues/6900#issuecomment-1272726662

   @yihua Can you take a look ?




[GitHub] [hudi] zaza commented on issue #6900: [SUPPORT]Hudi Failed to read MARKERS file

Posted by "zaza (via GitHub)" <gi...@apache.org>.
zaza commented on issue #6900:
URL: https://github.com/apache/hudi/issues/6900#issuecomment-1649736303

   This is definitely still an issue; we were hit by an error that looks identical to what @umehrot2 reported a while ago:
   
   ```
   ERROR UpsertPartitioner: Error trying to compute average bytes/record 
   org.apache.hudi.exception.HoodieIOException: Could not read commit details from s3://tasktop-data-platform-dev-analytical-data/simulator/workstreams/.hoodie/20230714152804208.commit
           at org.apache.hudi.common.table.timeline.HoodieActiveTimeline.readDataFromPath(HoodieActiveTimeline.java:824) ~[org.apache.hudi_hudi-spark3.3-bundle_2.12-0.13.1.jar:0.13.1]
           at org.apache.hudi.common.table.timeline.HoodieActiveTimeline.getInstantDetails(HoodieActiveTimeline.java:310) ~[org.apache.hudi_hudi-spark3.3-bundle_2.12-0.13.1.jar:0.13.1]
           at org.apache.hudi.common.table.timeline.HoodieDefaultTimeline.getInstantDetails(HoodieDefaultTimeline.java:438) ~[org.apache.hudi_hudi-spark3.3-bundle_2.12-0.13.1.jar:0.13.1]
           at org.apache.hudi.table.action.commit.UpsertPartitioner.averageBytesPerRecord(UpsertPartitioner.java:380) ~[org.apache.hudi_hudi-spark3.3-bundle_2.12-0.13.1.jar:0.13.1]
           at org.apache.hudi.table.action.commit.UpsertPartitioner.assignInserts(UpsertPartitioner.java:169) ~[org.apache.hudi_hudi-spark3.3-bundle_2.12-0.13.1.jar:0.13.1]
           at org.apache.hudi.table.action.commit.UpsertPartitioner.<init>(UpsertPartitioner.java:98) ~[org.apache.hudi_hudi-spark3.3-bundle_2.12-0.13.1.jar:0.13.1]
           at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.getUpsertPartitioner(BaseSparkCommitActionExecutor.java:404) ~[org.apache.hudi_hudi-spark3.3-bundle_2.12-0.13.1.jar:0.13.1]
           at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.getPartitioner(BaseSparkCommitActionExecutor.java:224) ~[org.apache.hudi_hudi-spark3.3-bundle_2.12-0.13.1.jar:0.13.1]
           at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.execute(BaseSparkCommitActionExecutor.java:170) ~[org.apache.hudi_hudi-spark3.3-bundle_2.12-0.13.1.jar:0.13.1]
           at org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor.execute(BaseSparkCommitActionExecutor.java:83) ~[org.apache.hudi_hudi-spark3.3-bundle_2.12-0.13.1.jar:0.13.1]
           at org.apache.hudi.table.action.commit.BaseWriteHelper.write(BaseWriteHelper.java:68) ~[org.apache.hudi_hudi-spark3.3-bundle_2.12-0.13.1.jar:0.13.1]
           at org.apache.hudi.table.action.commit.SparkUpsertCommitActionExecutor.execute(SparkUpsertCommitActionExecutor.java:44) ~[org.apache.hudi_hudi-spark3.3-bundle_2.12-0.13.1.jar:0.13.1]
           at org.apache.hudi.table.HoodieSparkCopyOnWriteTable.upsert(HoodieSparkCopyOnWriteTable.java:107) ~[org.apache.hudi_hudi-spark3.3-bundle_2.12-0.13.1.jar:0.13.1]
           at org.apache.hudi.table.HoodieSparkCopyOnWriteTable.upsert(HoodieSparkCopyOnWriteTable.java:96) ~[org.apache.hudi_hudi-spark3.3-bundle_2.12-0.13.1.jar:0.13.1]
           at org.apache.hudi.client.SparkRDDWriteClient.upsert(SparkRDDWriteClient.java:140) ~[org.apache.hudi_hudi-spark3.3-bundle_2.12-0.13.1.jar:0.13.1]
           at org.apache.hudi.DataSourceUtils.doWriteOperation(DataSourceUtils.java:214) ~[org.apache.hudi_hudi-spark3.3-bundle_2.12-0.13.1.jar:0.13.1]
           at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:372) ~[org.apache.hudi_hudi-spark3.3-bundle_2.12-0.13.1.jar:0.13.1]
           at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:150) ~[org.apache.hudi_hudi-spark3.3-bundle_2.12-0.13.1.jar:0.13.1]
           at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:47) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:104) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:107) ~[spark-catalyst_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.SQLExecution$.withTracker(SQLExecution.scala:224) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.SQLExecution$.executeQuery$1(SQLExecution.scala:114) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$7(SQLExecution.scala:139) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:107) ~[spark-catalyst_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.SQLExecution$.withTracker(SQLExecution.scala:224) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:139) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:245) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:138) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:68) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:101) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:97) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:626) ~[spark-catalyst_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:179) ~[spark-catalyst_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:626) ~[spark-catalyst_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:31) ~[spark-catalyst_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267) ~[spark-catalyst_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263) ~[spark-catalyst_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31) ~[spark-catalyst_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31) ~[spark-catalyst_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:602) ~[spark-catalyst_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:97) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:84) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:82) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:125) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:860) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:390) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:363) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at com.planview.ep.dap.workstreams.WorkStreamsTableWriter.write(WorkStreamsTableWriter.java:67) ~[__app__.jar:?]
           at com.planview.ep.dap.workstreams.java.StreamingJob.lambda$start$89fce565$1(StreamingJob.java:80) ~[__app__.jar:?]
           at org.apache.spark.sql.streaming.DataStreamWriter.$anonfun$foreachBatch$1(DataStreamWriter.scala:493) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.streaming.DataStreamWriter.$anonfun$foreachBatch$1$adapted(DataStreamWriter.scala:493) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.streaming.sources.ForeachBatchSink.addBatch(ForeachBatchSink.scala:32) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$17(MicroBatchExecution.scala:665) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:107) ~[spark-catalyst_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.SQLExecution$.withTracker(SQLExecution.scala:224) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.SQLExecution$.executeQuery$1(SQLExecution.scala:114) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$7(SQLExecution.scala:139) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:107) ~[spark-catalyst_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.SQLExecution$.withTracker(SQLExecution.scala:224) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:139) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:245) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:138) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:68) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$16(MicroBatchExecution.scala:663) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:375) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:373) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:68) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runBatch(MicroBatchExecution.scala:663) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$2(MicroBatchExecution.scala:256) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) ~[scala-library-2.12.15.jar:?]
           at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:375) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:373) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:68) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$1(MicroBatchExecution.scala:219) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:67) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:213) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.streaming.StreamExecution.$anonfun$runStream$1(StreamExecution.scala:307) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) ~[scala-library-2.12.15.jar:?]
           at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:285) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
           at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:208) ~[spark-sql_2.12-3.3.2-amzn-0.jar:3.3.2-amzn-0]
   Caused by: java.io.FileNotFoundException: No such file or directory 's3://tasktop-data-platform-dev-analytical-data/simulator/workstreams/.hoodie/20230714152804208.commit'
           at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.getFileStatus(S3NativeFileSystem.java:529) ~[emrfs-hadoop-assembly-2.56.0.jar:?]
           at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.open(S3NativeFileSystem.java:940) ~[emrfs-hadoop-assembly-2.56.0.jar:?]
           at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.open(S3NativeFileSystem.java:932) ~[emrfs-hadoop-assembly-2.56.0.jar:?]
           at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:983) ~[hadoop-client-api-3.3.3-amzn-3.jar:?]
           at com.amazon.ws.emr.hadoop.fs.EmrFileSystem.open(EmrFileSystem.java:197) ~[emrfs-hadoop-assembly-2.56.0.jar:?]
           at org.apache.hudi.common.fs.HoodieWrapperFileSystem.open(HoodieWrapperFileSystem.java:476) ~[org.apache.hudi_hudi-spark3.3-bundle_2.12-0.13.1.jar:0.13.1]
           at org.apache.hudi.common.table.timeline.HoodieActiveTimeline.readDataFromPath(HoodieActiveTimeline.java:821) ~[org.apache.hudi_hudi-spark3.3-bundle_2.12-0.13.1.jar:0.13.1]
   ```
   
   We're on EMR 6.11.0 
   
   What additional information do you need in order to troubleshoot this? Are there any configuration settings that would help us mitigate the problem?




[GitHub] [hudi] zaza commented on issue #6900: [SUPPORT]Hudi Failed to read MARKERS file

Posted by "zaza (via GitHub)" <gi...@apache.org>.
zaza commented on issue #6900:
URL: https://github.com/apache/hudi/issues/6900#issuecomment-1686187020

   Is this issue on your radar? Is there any chance of it being addressed in the near future?

