You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@flink.apache.org by Si-li Liu <un...@gmail.com> on 2023/04/14 09:13:56 UTC

Why my flink sql job on yarn keep crash

My job reads data from MySQL and writes to Doris. It crashes between 20
minutes and 1 hour after it starts.

org.apache.flink.runtime.JobException: Recovery is suppressed by
FixedDelayRestartBackoffTimeStrategy(maxNumberRestartAttempts=10,
backoffTimeMS=10000)
at org.apache.flink.runtime.executiongraph.failover.flip1.
ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:139)
at org.apache.flink.runtime.executiongraph.failover.flip1.
ExecutionFailureHandler.getFailureHandlingResult(ExecutionFailureHandler
.java:83)
at org.apache.flink.runtime.scheduler.DefaultScheduler.recordTaskFailure(
DefaultScheduler.java:256)
at org.apache.flink.runtime.scheduler.DefaultScheduler.handleTaskFailure(
DefaultScheduler.java:247)
at org.apache.flink.runtime.scheduler.DefaultScheduler.onTaskFailed(
DefaultScheduler.java:240)
at org.apache.flink.runtime.scheduler.SchedulerBase
.onTaskExecutionStateUpdate(SchedulerBase.java:738)
at org.apache.flink.runtime.scheduler.SchedulerBase
.updateTaskExecutionState(SchedulerBase.java:715)
at org.apache.flink.runtime.scheduler.SchedulerNG.updateTaskExecutionState(
SchedulerNG.java:78)
at org.apache.flink.runtime.jobmaster.JobMaster.updateTaskExecutionState(
JobMaster.java:477)
at sun.reflect.GeneratedMethodAccessor16.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(
DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor
.lambda$handleRpcInvocation$1(AkkaRpcActor.java:309)
at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils
.runWithContextClassLoader(ClassLoadingUtils.java:83)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcInvocation(
AkkaRpcActor.java:307)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(
AkkaRpcActor.java:222)
at org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(
FencedAkkaRpcActor.java:84)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor
.java:168)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:24)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:20)
at scala.PartialFunction.applyOrElse(PartialFunction.scala:123)
at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122)
at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:20)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172)
at akka.actor.Actor.aroundReceive(Actor.scala:537)
at akka.actor.Actor.aroundReceive$(Actor.scala:535)
at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:220)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:580)
at akka.actor.ActorCell.invoke(ActorCell.scala:548)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:270)
at akka.dispatch.Mailbox.run(Mailbox.scala:231)
at akka.dispatch.Mailbox.exec(Mailbox.scala:243)
at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:
1056)
at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692)
at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:
175)
Caused by: java.lang.InterruptedException
at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject
.reportInterruptAfterWait(AbstractQueuedSynchronizer.java:2014)
at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject
.await(AbstractQueuedSynchronizer.java:2173)
at org.apache.flink.streaming.runtime.tasks.mailbox.TaskMailboxImpl.take(
TaskMailboxImpl.java:149)
at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor
.processMailsWhenDefaultActionUnavailable(MailboxProcessor.java:363)
at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor
.processMail(MailboxProcessor.java:352)
at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor
.runMailboxLoop(MailboxProcessor.java:229)
at org.apache.flink.streaming.runtime.tasks.StreamTask.runMailboxLoop(
StreamTask.java:831)
at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask
.java:780)
at org.apache.flink.runtime.taskmanager.Task.runWithSystemExitMonitoring(
Task.java:935)
at org.apache.flink.runtime.taskmanager.Task.restoreAndInvoke(Task.java:914)
at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:728)
at org.apache.flink.runtime.taskmanager.Task.run(Task.java:550)
at java.lang.Thread.run(Thread.java:748)

java.lang.InterruptedException
at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject
.reportInterruptAfterWait(AbstractQueuedSynchronizer.java:2014)
at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject
.await(AbstractQueuedSynchronizer.java:2173)
at org.apache.flink.streaming.runtime.tasks.mailbox.TaskMailboxImpl.take(
TaskMailboxImpl.java:149)
at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor
.processMailsWhenDefaultActionUnavailable(MailboxProcessor.java:363)
at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor
.processMail(MailboxProcessor.java:352)
at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor
.runMailboxLoop(MailboxProcessor.java:229)
at org.apache.flink.streaming.runtime.tasks.StreamTask.runMailboxLoop(
StreamTask.java:831)
at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask
.java:780)
at org.apache.flink.runtime.taskmanager.Task.runWithSystemExitMonitoring(
Task.java:935)
at org.apache.flink.runtime.taskmanager.Task.restoreAndInvoke(Task.java:914)
at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:728)
at org.apache.flink.runtime.taskmanager.Task.run(Task.java:550)
at java.lang.Thread.run(Thread.java:748)

-- 
Best regards

Sili Liu

Re: Why my flink sql job on yarn keep crash

Posted by Hang Ruan <ru...@gmail.com>.
Hi, Si-li,

I think this may not be the root cause. You should check whether there are
more exceptions in the JobManager (JM) and TaskManager (TM) logs.

Best,
Hang

Shammon FY <zj...@gmail.com> 于2023年4月18日周二 09:02写道:

> Hi Si-li
>
> Could you give some more detailed exceptions? Or you can check the metrics
> of your job such as memory usage.
>
> Best,
> Shammon FY
>
>
> On Fri, Apr 14, 2023 at 5:14 PM Si-li Liu <un...@gmail.com> wrote:
>
>> My job read data from mysql and write to doris. It will crash after 20
>> mins ~ 1 hour after start.
>>
>> org.apache.flink.runtime.JobException: Recovery is suppressed by
>> FixedDelayRestartBackoffTimeStrategy(maxNumberRestartAttempts=10,
>> backoffTimeMS=10000)
>> at org.apache.flink.runtime.executiongraph.failover.flip1.
>> ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:139)
>> at org.apache.flink.runtime.executiongraph.failover.flip1.
>> ExecutionFailureHandler.getFailureHandlingResult(ExecutionFailureHandler
>> .java:83)
>> at org.apache.flink.runtime.scheduler.DefaultScheduler.recordTaskFailure(
>> DefaultScheduler.java:256)
>> at org.apache.flink.runtime.scheduler.DefaultScheduler.handleTaskFailure(
>> DefaultScheduler.java:247)
>> at org.apache.flink.runtime.scheduler.DefaultScheduler.onTaskFailed(
>> DefaultScheduler.java:240)
>> at org.apache.flink.runtime.scheduler.SchedulerBase
>> .onTaskExecutionStateUpdate(SchedulerBase.java:738)
>> at org.apache.flink.runtime.scheduler.SchedulerBase
>> .updateTaskExecutionState(SchedulerBase.java:715)
>> at org.apache.flink.runtime.scheduler.SchedulerNG
>> .updateTaskExecutionState(SchedulerNG.java:78)
>> at org.apache.flink.runtime.jobmaster.JobMaster.updateTaskExecutionState(
>> JobMaster.java:477)
>> at sun.reflect.GeneratedMethodAccessor16.invoke(Unknown Source)
>> at sun.reflect.DelegatingMethodAccessorImpl.invoke(
>> DelegatingMethodAccessorImpl.java:43)
>> at java.lang.reflect.Method.invoke(Method.java:498)
>> at org.apache.flink.runtime.rpc.akka.AkkaRpcActor
>> .lambda$handleRpcInvocation$1(AkkaRpcActor.java:309)
>> at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils
>> .runWithContextClassLoader(ClassLoadingUtils.java:83)
>> at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcInvocation(
>> AkkaRpcActor.java:307)
>> at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(
>> AkkaRpcActor.java:222)
>> at org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(
>> FencedAkkaRpcActor.java:84)
>> at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(
>> AkkaRpcActor.java:168)
>> at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:24)
>> at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:20)
>> at scala.PartialFunction.applyOrElse(PartialFunction.scala:123)
>> at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122)
>> at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:20)
>> at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
>> at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172)
>> at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172)
>> at akka.actor.Actor.aroundReceive(Actor.scala:537)
>> at akka.actor.Actor.aroundReceive$(Actor.scala:535)
>> at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:220)
>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:580)
>> at akka.actor.ActorCell.invoke(ActorCell.scala:548)
>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:270)
>> at akka.dispatch.Mailbox.run(Mailbox.scala:231)
>> at akka.dispatch.Mailbox.exec(Mailbox.scala:243)
>> at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
>> at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:
>> 1056)
>> at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692)
>> at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread
>> .java:175)
>> Caused by: java.lang.InterruptedException
>> at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject
>> .reportInterruptAfterWait(AbstractQueuedSynchronizer.java:2014)
>> at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject
>> .await(AbstractQueuedSynchronizer.java:2173)
>> at org.apache.flink.streaming.runtime.tasks.mailbox.TaskMailboxImpl.take(
>> TaskMailboxImpl.java:149)
>> at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor
>> .processMailsWhenDefaultActionUnavailable(MailboxProcessor.java:363)
>> at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor
>> .processMail(MailboxProcessor.java:352)
>> at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor
>> .runMailboxLoop(MailboxProcessor.java:229)
>> at org.apache.flink.streaming.runtime.tasks.StreamTask.runMailboxLoop(
>> StreamTask.java:831)
>> at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask
>> .java:780)
>> at org.apache.flink.runtime.taskmanager.Task.runWithSystemExitMonitoring(
>> Task.java:935)
>> at org.apache.flink.runtime.taskmanager.Task.restoreAndInvoke(Task.java:
>> 914)
>> at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:728)
>> at org.apache.flink.runtime.taskmanager.Task.run(Task.java:550)
>> at java.lang.Thread.run(Thread.java:748)
>>
>> java.lang.InterruptedException
>> at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject
>> .reportInterruptAfterWait(AbstractQueuedSynchronizer.java:2014)
>> at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject
>> .await(AbstractQueuedSynchronizer.java:2173)
>> at org.apache.flink.streaming.runtime.tasks.mailbox.TaskMailboxImpl.take(
>> TaskMailboxImpl.java:149)
>> at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor
>> .processMailsWhenDefaultActionUnavailable(MailboxProcessor.java:363)
>> at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor
>> .processMail(MailboxProcessor.java:352)
>> at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor
>> .runMailboxLoop(MailboxProcessor.java:229)
>> at org.apache.flink.streaming.runtime.tasks.StreamTask.runMailboxLoop(
>> StreamTask.java:831)
>> at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask
>> .java:780)
>> at org.apache.flink.runtime.taskmanager.Task.runWithSystemExitMonitoring(
>> Task.java:935)
>> at org.apache.flink.runtime.taskmanager.Task.restoreAndInvoke(Task.java:
>> 914)
>> at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:728)
>> at org.apache.flink.runtime.taskmanager.Task.run(Task.java:550)
>> at java.lang.Thread.run(Thread.java:748)
>>
>> --
>> Best regards
>>
>> Sili Liu
>>
>

Re: Why my flink sql job on yarn keep crash

Posted by Shammon FY <zj...@gmail.com>.
Hi Si-li

Could you share more detailed exceptions? You could also check your job's
metrics, such as memory usage.

Best,
Shammon FY


On Fri, Apr 14, 2023 at 5:14 PM Si-li Liu <un...@gmail.com> wrote:

> My job read data from mysql and write to doris. It will crash after 20
> mins ~ 1 hour after start.
>
> org.apache.flink.runtime.JobException: Recovery is suppressed by
> FixedDelayRestartBackoffTimeStrategy(maxNumberRestartAttempts=10,
> backoffTimeMS=10000)
> at org.apache.flink.runtime.executiongraph.failover.flip1.
> ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:139)
> at org.apache.flink.runtime.executiongraph.failover.flip1.
> ExecutionFailureHandler.getFailureHandlingResult(ExecutionFailureHandler
> .java:83)
> at org.apache.flink.runtime.scheduler.DefaultScheduler.recordTaskFailure(
> DefaultScheduler.java:256)
> at org.apache.flink.runtime.scheduler.DefaultScheduler.handleTaskFailure(
> DefaultScheduler.java:247)
> at org.apache.flink.runtime.scheduler.DefaultScheduler.onTaskFailed(
> DefaultScheduler.java:240)
> at org.apache.flink.runtime.scheduler.SchedulerBase
> .onTaskExecutionStateUpdate(SchedulerBase.java:738)
> at org.apache.flink.runtime.scheduler.SchedulerBase
> .updateTaskExecutionState(SchedulerBase.java:715)
> at org.apache.flink.runtime.scheduler.SchedulerNG
> .updateTaskExecutionState(SchedulerNG.java:78)
> at org.apache.flink.runtime.jobmaster.JobMaster.updateTaskExecutionState(
> JobMaster.java:477)
> at sun.reflect.GeneratedMethodAccessor16.invoke(Unknown Source)
> at sun.reflect.DelegatingMethodAccessorImpl.invoke(
> DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:498)
> at org.apache.flink.runtime.rpc.akka.AkkaRpcActor
> .lambda$handleRpcInvocation$1(AkkaRpcActor.java:309)
> at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils
> .runWithContextClassLoader(ClassLoadingUtils.java:83)
> at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcInvocation(
> AkkaRpcActor.java:307)
> at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(
> AkkaRpcActor.java:222)
> at org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(
> FencedAkkaRpcActor.java:84)
> at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(
> AkkaRpcActor.java:168)
> at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:24)
> at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:20)
> at scala.PartialFunction.applyOrElse(PartialFunction.scala:123)
> at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122)
> at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:20)
> at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
> at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172)
> at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172)
> at akka.actor.Actor.aroundReceive(Actor.scala:537)
> at akka.actor.Actor.aroundReceive$(Actor.scala:535)
> at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:220)
> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:580)
> at akka.actor.ActorCell.invoke(ActorCell.scala:548)
> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:270)
> at akka.dispatch.Mailbox.run(Mailbox.scala:231)
> at akka.dispatch.Mailbox.exec(Mailbox.scala:243)
> at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
> at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:
> 1056)
> at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692)
> at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread
> .java:175)
> Caused by: java.lang.InterruptedException
> at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject
> .reportInterruptAfterWait(AbstractQueuedSynchronizer.java:2014)
> at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject
> .await(AbstractQueuedSynchronizer.java:2173)
> at org.apache.flink.streaming.runtime.tasks.mailbox.TaskMailboxImpl.take(
> TaskMailboxImpl.java:149)
> at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor
> .processMailsWhenDefaultActionUnavailable(MailboxProcessor.java:363)
> at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor
> .processMail(MailboxProcessor.java:352)
> at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor
> .runMailboxLoop(MailboxProcessor.java:229)
> at org.apache.flink.streaming.runtime.tasks.StreamTask.runMailboxLoop(
> StreamTask.java:831)
> at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask
> .java:780)
> at org.apache.flink.runtime.taskmanager.Task.runWithSystemExitMonitoring(
> Task.java:935)
> at org.apache.flink.runtime.taskmanager.Task.restoreAndInvoke(Task.java:
> 914)
> at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:728)
> at org.apache.flink.runtime.taskmanager.Task.run(Task.java:550)
> at java.lang.Thread.run(Thread.java:748)
>
> java.lang.InterruptedException
> at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject
> .reportInterruptAfterWait(AbstractQueuedSynchronizer.java:2014)
> at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject
> .await(AbstractQueuedSynchronizer.java:2173)
> at org.apache.flink.streaming.runtime.tasks.mailbox.TaskMailboxImpl.take(
> TaskMailboxImpl.java:149)
> at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor
> .processMailsWhenDefaultActionUnavailable(MailboxProcessor.java:363)
> at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor
> .processMail(MailboxProcessor.java:352)
> at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor
> .runMailboxLoop(MailboxProcessor.java:229)
> at org.apache.flink.streaming.runtime.tasks.StreamTask.runMailboxLoop(
> StreamTask.java:831)
> at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask
> .java:780)
> at org.apache.flink.runtime.taskmanager.Task.runWithSystemExitMonitoring(
> Task.java:935)
> at org.apache.flink.runtime.taskmanager.Task.restoreAndInvoke(Task.java:
> 914)
> at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:728)
> at org.apache.flink.runtime.taskmanager.Task.run(Task.java:550)
> at java.lang.Thread.run(Thread.java:748)
>
> --
> Best regards
>
> Sili Liu
>