You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@flink.apache.org by Josson Paul <jo...@gmail.com> on 2022/02/16 07:15:08 UTC

Job manager slots are in bad state.

We are using Flink version 1.11.2.
At times if task managers are restarted for some reason, the job managers
throw the exception that I attached here. It is an illegal state exception.
We never had this issue with Flink 1.8. It started happening after
upgrading to Flink 1.11.2.

Why are the slots not released if they are in a bad state? The issue doesn't
get resolved even if I restart all the task managers. It gets resolved
only if I restart the job manager.

java.util.concurrent.CompletionException: java.util.concurrent.
CompletionException: java.lang.IllegalStateException
    at org.apache.flink.runtime.jobmaster.slotpool.
SlotSharingManager$MultiTaskSlot.lambda$new$0(SlotSharingManager.java:433)
    at java.base/java.util.concurrent.CompletableFuture.uniHandle(
CompletableFuture.java:930)
    at java.base/java.util.concurrent.CompletableFuture$UniHandle.tryFire(
CompletableFuture.java:907)
    at java.base/java.util.concurrent.CompletableFuture.postComplete(
CompletableFuture.java:506)
    at java.base/java.util.concurrent.CompletableFuture
.completeExceptionally(CompletableFuture.java:2088)
    at org.apache.flink.runtime.concurrent.FutureUtils.lambda$forwardTo$21(
FutureUtils.java:1132)
    at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(
CompletableFuture.java:859)
    at java.base/java.util.concurrent.CompletableFuture
.uniWhenCompleteStage(CompletableFuture.java:883)
    at java.base/java.util.concurrent.CompletableFuture.whenComplete(
CompletableFuture.java:2251)
    at org.apache.flink.runtime.concurrent.FutureUtils.forward(FutureUtils
.java:1100)
    at org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager
.createRootSlot(SlotSharingManager.java:155)
    at org.apache.flink.runtime.jobmaster.slotpool.SchedulerImpl
.allocateMultiTaskSlot(SchedulerImpl.java:477)
    at org.apache.flink.runtime.jobmaster.slotpool.SchedulerImpl
.allocateSharedSlot(SchedulerImpl.java:311)
    at org.apache.flink.runtime.jobmaster.slotpool.SchedulerImpl
.internalAllocateSlot(SchedulerImpl.java:160)
    at org.apache.flink.runtime.jobmaster.slotpool.SchedulerImpl
.allocateSlotInternal(SchedulerImpl.java:143)
    at org.apache.flink.runtime.jobmaster.slotpool.SchedulerImpl
.allocateSlot(SchedulerImpl.java:113)
    at org.apache.flink.runtime.executiongraph.
SlotProviderStrategy$NormalSlotProviderStrategy.allocateSlot(
SlotProviderStrategy.java:115)
    at org.apache.flink.runtime.scheduler.DefaultExecutionSlotAllocator
.lambda$allocateSlotsFor$0(DefaultExecutionSlotAllocator.java:104)
    at java.base/java.util.concurrent.CompletableFuture.uniComposeStage(
CompletableFuture.java:1106)
    at java.base/java.util.concurrent.CompletableFuture.thenCompose(
CompletableFuture.java:2235)
    at org.apache.flink.runtime.scheduler.DefaultExecutionSlotAllocator
.allocateSlotsFor(DefaultExecutionSlotAllocator.java:102)
    at org.apache.flink.runtime.scheduler.DefaultScheduler.allocateSlots(
DefaultScheduler.java:339)
    at org.apache.flink.runtime.scheduler.DefaultScheduler
.allocateSlotsAndDeploy(DefaultScheduler.java:312)
    at org.apache.flink.runtime.scheduler.strategy.EagerSchedulingStrategy
.allocateSlotsAndDeploy(EagerSchedulingStrategy.java:76)
    at org.apache.flink.runtime.scheduler.strategy.EagerSchedulingStrategy
.restartTasks(EagerSchedulingStrategy.java:57)
    at org.apache.flink.runtime.scheduler.DefaultScheduler
.lambda$restartTasks$2(DefaultScheduler.java:265)
    at java.base/java.util.concurrent.CompletableFuture$UniRun.tryFire(
CompletableFuture.java:783)
    at java.base/java.util.concurrent.CompletableFuture$Completion.run(
CompletableFuture.java:478)
    at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(
AkkaRpcActor.java:402)
    at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(
AkkaRpcActor.java:195)
    at org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor
.handleRpcMessage(FencedAkkaRpcActor.java:74)
    at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(
AkkaRpcActor.java:152)
    at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26)
    at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21)
    at scala.PartialFunction$class.applyOrElse(PartialFunction.scala:123)
    at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21)
    at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:170)
    at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
    at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
    at akka.actor.Actor$class.aroundReceive(Actor.scala:517)
    at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225)
    at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592)
    at akka.actor.ActorCell.invoke(ActorCell.scala:561)
    at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258)
    at akka.dispatch.Mailbox.run(Mailbox.scala:225)
    at akka.dispatch.Mailbox.exec(Mailbox.scala:235)
    at akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
    at akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool
.java:1339)
    at akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
    at akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread
.java:107)
Caused by: java.util.concurrent.CompletionException: java.lang.
IllegalStateException
    at java.base/java.util.concurrent.CompletableFuture.encodeThrowable(
CompletableFuture.java:314)
    at java.base/java.util.concurrent.CompletableFuture.uniApplyNow(
CompletableFuture.java:683)
    at java.base/java.util.concurrent.CompletableFuture.uniApplyStage(
CompletableFuture.java:658)
    at java.base/java.util.concurrent.CompletableFuture.thenApply(
CompletableFuture.java:2094)
    at org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager
.createRootSlot(SlotSharingManager.java:156)
    ... 39 more
Caused by: java.lang.IllegalStateException
    at org.apache.flink.util.Preconditions.checkState(Preconditions.java:179
)
    at org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager
.tryMarkSlotAsResolved(SlotSharingManager.java:194)
    at org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager
.lambda$createRootSlot$0(SlotSharingManager.java:160)
    at java.base/java.util.concurrent.CompletableFuture.uniApplyNow(
CompletableFuture.java:680)
    ... 42 more

-- 
Thanks
Josson

Re: Job manager slots are in bad state.

Posted by Piotr Nowojski <pn...@apache.org>.
Hi Josson,

Would you be able to reproduce this issue on a more recent version of
Flink? I'm afraid that we won't be able to help with this issue, as it
affects a Flink version that has not been supported for quite some time and,
moreover, `SlotSharingManager` has been completely removed in Flink 1.13.

Can you upgrade to a more recent Flink version and try it out? I would
assume the bug should be gone in 1.13.x or 1.14.x branches. If not, you can
also try out Flink 1.11.4, as maybe it has fixed this issue as well.

Best,
Piotrek

śr., 16 lut 2022 o 08:16 Josson Paul <jo...@gmail.com> napisał(a):

> We are using Flink version 1.11.2.
> At times if task managers are restarted for some reason, the job managers
> throw the exception that I attached here. It is an illegal state exception.
> We never had this issue with Flink 1.8. It started happening after
> upgrading to Flink 1.11.2.
>
> Why are the slots not released if it is in a bad state?. The issue doesn't
> get resolved even if I restart all the task managers. It will get resolved
> only if I restart Job manager.
>
> java.util.concurrent.CompletionException: java.util.concurrent.
> CompletionException: java.lang.IllegalStateException
>     at org.apache.flink.runtime.jobmaster.slotpool.
> SlotSharingManager$MultiTaskSlot.lambda$new$0(SlotSharingManager.java:433)
>     at java.base/java.util.concurrent.CompletableFuture.uniHandle(
> CompletableFuture.java:930)
>     at java.base/java.util.concurrent.CompletableFuture$UniHandle.tryFire(
> CompletableFuture.java:907)
>     at java.base/java.util.concurrent.CompletableFuture.postComplete(
> CompletableFuture.java:506)
>     at java.base/java.util.concurrent.CompletableFuture
> .completeExceptionally(CompletableFuture.java:2088)
>     at org.apache.flink.runtime.concurrent.FutureUtils
> .lambda$forwardTo$21(FutureUtils.java:1132)
>     at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(
> CompletableFuture.java:859)
>     at java.base/java.util.concurrent.CompletableFuture
> .uniWhenCompleteStage(CompletableFuture.java:883)
>     at java.base/java.util.concurrent.CompletableFuture.whenComplete(
> CompletableFuture.java:2251)
>     at org.apache.flink.runtime.concurrent.FutureUtils.forward(FutureUtils
> .java:1100)
>     at org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager
> .createRootSlot(SlotSharingManager.java:155)
>     at org.apache.flink.runtime.jobmaster.slotpool.SchedulerImpl
> .allocateMultiTaskSlot(SchedulerImpl.java:477)
>     at org.apache.flink.runtime.jobmaster.slotpool.SchedulerImpl
> .allocateSharedSlot(SchedulerImpl.java:311)
>     at org.apache.flink.runtime.jobmaster.slotpool.SchedulerImpl
> .internalAllocateSlot(SchedulerImpl.java:160)
>     at org.apache.flink.runtime.jobmaster.slotpool.SchedulerImpl
> .allocateSlotInternal(SchedulerImpl.java:143)
>     at org.apache.flink.runtime.jobmaster.slotpool.SchedulerImpl
> .allocateSlot(SchedulerImpl.java:113)
>     at org.apache.flink.runtime.executiongraph.
> SlotProviderStrategy$NormalSlotProviderStrategy.allocateSlot(
> SlotProviderStrategy.java:115)
>     at org.apache.flink.runtime.scheduler.DefaultExecutionSlotAllocator
> .lambda$allocateSlotsFor$0(DefaultExecutionSlotAllocator.java:104)
>     at java.base/java.util.concurrent.CompletableFuture.uniComposeStage(
> CompletableFuture.java:1106)
>     at java.base/java.util.concurrent.CompletableFuture.thenCompose(
> CompletableFuture.java:2235)
>     at org.apache.flink.runtime.scheduler.DefaultExecutionSlotAllocator
> .allocateSlotsFor(DefaultExecutionSlotAllocator.java:102)
>     at org.apache.flink.runtime.scheduler.DefaultScheduler.allocateSlots(
> DefaultScheduler.java:339)
>     at org.apache.flink.runtime.scheduler.DefaultScheduler
> .allocateSlotsAndDeploy(DefaultScheduler.java:312)
>     at org.apache.flink.runtime.scheduler.strategy.EagerSchedulingStrategy
> .allocateSlotsAndDeploy(EagerSchedulingStrategy.java:76)
>     at org.apache.flink.runtime.scheduler.strategy.EagerSchedulingStrategy
> .restartTasks(EagerSchedulingStrategy.java:57)
>     at org.apache.flink.runtime.scheduler.DefaultScheduler
> .lambda$restartTasks$2(DefaultScheduler.java:265)
>     at java.base/java.util.concurrent.CompletableFuture$UniRun.tryFire(
> CompletableFuture.java:783)
>     at java.base/java.util.concurrent.CompletableFuture$Completion.run(
> CompletableFuture.java:478)
>     at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(
> AkkaRpcActor.java:402)
>     at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(
> AkkaRpcActor.java:195)
>     at org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor
> .handleRpcMessage(FencedAkkaRpcActor.java:74)
>     at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(
> AkkaRpcActor.java:152)
>     at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26)
>     at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21)
>     at scala.PartialFunction$class.applyOrElse(PartialFunction.scala:123)
>     at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21)
>     at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:170)
>     at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
>     at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
>     at akka.actor.Actor$class.aroundReceive(Actor.scala:517)
>     at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225)
>     at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592)
>     at akka.actor.ActorCell.invoke(ActorCell.scala:561)
>     at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258)
>     at akka.dispatch.Mailbox.run(Mailbox.scala:225)
>     at akka.dispatch.Mailbox.exec(Mailbox.scala:235)
>     at akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>     at akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool
> .java:1339)
>     at akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:
> 1979)
>     at akka.dispatch.forkjoin.ForkJoinWorkerThread.run(
> ForkJoinWorkerThread.java:107)
> Caused by: java.util.concurrent.CompletionException: java.lang.
> IllegalStateException
>     at java.base/java.util.concurrent.CompletableFuture.encodeThrowable(
> CompletableFuture.java:314)
>     at java.base/java.util.concurrent.CompletableFuture.uniApplyNow(
> CompletableFuture.java:683)
>     at java.base/java.util.concurrent.CompletableFuture.uniApplyStage(
> CompletableFuture.java:658)
>     at java.base/java.util.concurrent.CompletableFuture.thenApply(
> CompletableFuture.java:2094)
>     at org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager
> .createRootSlot(SlotSharingManager.java:156)
>     ... 39 more
> Caused by: java.lang.IllegalStateException
>     at org.apache.flink.util.Preconditions.checkState(Preconditions.java:
> 179)
>     at org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager
> .tryMarkSlotAsResolved(SlotSharingManager.java:194)
>     at org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager
> .lambda$createRootSlot$0(SlotSharingManager.java:160)
>     at java.base/java.util.concurrent.CompletableFuture.uniApplyNow(
> CompletableFuture.java:680)
>     ... 42 more
>
> --
> Thanks
> Josson
>