You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@ignite.apache.org by Randall Woodruff <rw...@niksun.com> on 2022/07/06 14:32:47 UTC

Ignite Discovery worker blocked error when client node attempts to connect

We have deployed an Ignite 2.11.1 cluster in Kubernetes.  When a client node attempts to join the grid, we are getting the tcp-disco-msg-worker blocked below.

We have used the same configuration successfully with another deployment so we are not certain why we are getting this error.
The only change is that in the deployment that is failing we have added resource limits to the configuration.

Link to the same question on Stack Overflow: <https://stackoverflow.com/questions/72885391/ignite-discovery-worker-blocked-error-when-client-node-attempts-to-connect>

[20:07:24,201][SEVERE][tcp-disco-msg-worker-[crd]-#2-#48][G] Blocked system-critical thread has been detected. This can lead to cluster-wide undefined behaviour [workerName=db-checkpoint-thread, threadName=db-checkpoint-thread-#78, blockedFor=4562s]
[20:07:24] Possible failure suppressed accordingly to a configured handler [hnd=StopNodeOrHaltFailureHandler [tryStop=false, timeout=0, super=AbstractFailureHandler [ignoredFailureTypes=UnmodifiableSet [SYSTEM_WORKER_BLOCKED, SYSTEM_CRITICAL_OPERATION_TIMEOUT]]], failureCtx=FailureContext [type=SYSTEM_WORKER_BLOCKED, err=class o.a.i.IgniteException: GridWorker [name=db-checkpoint-thread, igniteInstanceName=null, finished=false, heartbeatTs=1657047082073]]]
[20:07:27,073][SEVERE][tcp-disco-msg-worker-[crd]-#2-#48][G] Blocked system-critical thread has been detected. This can lead to cluster-wide undefined behaviour [workerName=sys-stripe-0, threadName=sys-stripe-0-#1, blockedFor=4072s]

Thread [name="qtp2015455415-61", id=61, state=TIMED_WAITING, blockCnt=1, waitCnt=702]
    Lock [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@78322a4, ownerName=null, ownerId=-1]
        at sun.misc.Unsafe.park(Native Method)
        at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
        at org.eclipse.jetty.util.BlockingArrayQueue.poll(BlockingArrayQueue.java:382)
        at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.idleJobPoll(QueuedThreadPool.java:973)
        at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1023)
        at java.lang.Thread.run(Thread.java:748)

Thread [name="qtp2015455415-60", id=60, state=RUNNABLE, blockCnt=3, waitCnt=679]
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:93)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:86)
        - locked sun.nio.ch.Util$3@7a7b4390
        - locked java.util.Collections$UnmodifiableSet@220c68db
        - locked sun.nio.ch.EPollSelectorImpl@4f569077
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:97)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:101)
        at org.eclipse.jetty.io.ManagedSelector.nioSelect(ManagedSelector.java:183)
        at org.eclipse.jetty.io.ManagedSelector.select(ManagedSelector.java:190)
        at org.eclipse.jetty.io.ManagedSelector$SelectorProducer.select(ManagedSelector.java:606)
        at org.eclipse.jetty.io.ManagedSelector$SelectorProducer.produce(ManagedSelector.java:543)
        at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.produceTask(EatWhatYouKill.java:360)
        at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:184)
        at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:171)
        at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:129)
        at org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:383)
        at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:882)
        at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1036)
        at java.lang.Thread.run(Thread.java:748)

Thread [name="qtp2015455415-59", id=59, state=TIMED_WAITING, blockCnt=1, waitCnt=684]
    Lock [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@78322a4, ownerName=null, ownerId=-1]
        at sun.misc.Unsafe.park(Native Method)
        at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
        at org.eclipse.jetty.util.BlockingArrayQueue.poll(BlockingArrayQueue.java:382)
        at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.idleJobPoll(QueuedThreadPool.java:973)
        at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1023)
        at java.lang.Thread.run(Thread.java:748)

Thread [name="qtp2015455415-58", id=58, state=TIMED_WAITING, blockCnt=1, waitCnt=688]
    Lock [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@78322a4, ownerName=null, ownerId=-1]
        at sun.misc.Unsafe.park(Native Method)
        at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
        at org.eclipse.jetty.util.BlockingArrayQueue.poll(BlockingArrayQueue.java:382)
        at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.idleJobPoll(QueuedThreadPool.java:973)
        at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1023)
        at java.lang.Thread.run(Thread.java:748)

Re: Ignite Discovery worker blocked error when client node attempts to connect

Posted by Shubham Shirur <sh...@gmail.com>.
Unsubscribe

On Wed, Jul 6, 2022, 8:03 PM Randall Woodruff <rw...@niksun.com> wrote:

> We have deployed an Ignite 2.11.1 cluster in Kubernetes.  When a client
> node attempts to join the grid, we are getting the tcp-disco-msg-worker
> blocked below.
>
> We have used the same configuration successfully with another deployment
> so we are not certain why we are getting this error.
> The only change is that in the deployment that is failing we have added
> resource limits to the configuration.
>
> Link to same question of stack overflow.
> <https://stackoverflow.com/questions/72885391/ignite-discovery-worker-blocked-error-when-client-node-attempts-to-connect>
>
> [20:07:24,201][SEVERE][tcp-disco-msg-worker-[crd]-#2-#48][G] Blocked
> system-critical thread has been detected. This can lead to cluster-wide
> undefined behaviour [workerName=db-checkpoint-thread,
> threadName=db-checkpoint-thread-#78, blockedFor=4562s]
> [20:07:24] Possible failure suppressed accordingly to a configured handler
> [hnd=StopNodeOrHaltFailureHandler [tryStop=false, timeout=0,
> super=AbstractFailureHandler [ignoredFailureTypes=UnmodifiableSet
> [SYSTEM_WORKER_BLOCKED, SYSTEM_CRITICAL_OPERATION_TIMEOUT]]],
> failureCtx=FailureContext [type=SYSTEM_WORKER_BLOCKED, err=class
> o.a.i.IgniteException: GridWorker [name=db-checkpoint-thread,
> igniteInstanceName=null, finished=false, heartbeatTs=1657047082073]]]
> [20:07:27,073][SEVERE][tcp-disco-msg-worker-[crd]-#2-#48][G] Blocked
> system-critical thread has been detected. This can lead to cluster-wide
> undefined behaviour [workerName=sys-stripe-0, threadName=sys-stripe-0-#1,
> blockedFor=4072s]
>
> Thread [name="qtp2015455415-61", id=61, state=TIMED_WAITING, blockCnt=1,
> waitCnt=702]
>     Lock
> [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@78322a4,
> ownerName=null, ownerId=-1]
>         at sun.misc.Unsafe.park(Native Method)
>         at
> java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
>         at
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
>         at
> org.eclipse.jetty.util.BlockingArrayQueue.poll(BlockingArrayQueue.java:382)
>         at
> org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.idleJobPoll(QueuedThreadPool.java:973)
>         at
> org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1023)
>         at java.lang.Thread.run(Thread.java:748)
>
> Thread [name="qtp2015455415-60", id=60, state=RUNNABLE, blockCnt=3,
> waitCnt=679]
>         at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
>         at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
>         at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:93)
>         at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:86)
>         - locked sun.nio.ch.Util$3@7a7b4390
>         - locked java.util.Collections$UnmodifiableSet@220c68db
>         - locked sun.nio.ch.EPollSelectorImpl@4f569077
>         at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:97)
>         at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:101)
>         at
> org.eclipse.jetty.io.ManagedSelector.nioSelect(ManagedSelector.java:183)
>         at
> org.eclipse.jetty.io.ManagedSelector.select(ManagedSelector.java:190)
>         at
> org.eclipse.jetty.io.ManagedSelector$SelectorProducer.select(ManagedSelector.java:606)
>         at
> org.eclipse.jetty.io.ManagedSelector$SelectorProducer.produce(ManagedSelector.java:543)
>         at
> org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.produceTask(EatWhatYouKill.java:360)
>         at
> org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:184)
>         at
> org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:171)
>         at
> org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:129)
>         at
> org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:383)
>         at
> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:882)
>         at
> org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1036)
>         at java.lang.Thread.run(Thread.java:748)
>
> Thread [name="qtp2015455415-59", id=59, state=TIMED_WAITING, blockCnt=1,
> waitCnt=684]
>     Lock
> [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@78322a4,
> ownerName=null, ownerId=-1]
>         at sun.misc.Unsafe.park(Native Method)
>         at
> java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
>         at
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
>         at
> org.eclipse.jetty.util.BlockingArrayQueue.poll(BlockingArrayQueue.java:382)
>         at
> org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.idleJobPoll(QueuedThreadPool.java:973)
>         at
> org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1023)
>         at java.lang.Thread.run(Thread.java:748)
>
> Thread [name="qtp2015455415-58", id=58, state=TIMED_WAITING, blockCnt=1,
> waitCnt=688]
>     Lock
> [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@78322a4,
> ownerName=null, ownerId=-1]
>         at sun.misc.Unsafe.park(Native Method)
>         at
> java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
>         at
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
>         at
> org.eclipse.jetty.util.BlockingArrayQueue.poll(BlockingArrayQueue.java:382)
>         at
> org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.idleJobPoll(QueuedThreadPool.java:973)
>         at
> org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1023)
>         at java.lang.Thread.run(Thread.java:748)
>

Re: Ignite Discovery worker blocked error when client node attempts to connect

Posted by Zhenya Stanilovsky <ar...@mail.ru>.
Hi, can you share the full log somehow?
The information provided is not enough for analysis.

 
>We have deployed an Ignite 2.11.1 cluster in Kubernetes.  When a client node attempts to join the grid, we are getting the  tcp-disco-msg-worker blocked below.  
> 
>We have used the same configuration successfully with another deployment so we are not certain why we are getting this error. 
>The only change is that in the deployment that is failing we have added resource limits to the configuration.
> 
>Link to same question of stack overflow. 
> 
>[20:07:24,201][SEVERE][tcp-disco-msg-worker-[crd]-#2-#48][G] Blocked system-critical thread has been detected. This can lead to cluster-wide undefined behaviour [workerName=db-checkpoint-thread, threadName=db-checkpoint-thread-#78, blockedFor=4562s]
>[20:07:24] Possible failure suppressed accordingly to a configured handler [hnd=StopNodeOrHaltFailureHandler [tryStop=false, timeout=0, super=AbstractFailureHandler [ignoredFailureTypes=UnmodifiableSet [SYSTEM_WORKER_BLOCKED, SYSTEM_CRITICAL_OPERATION_TIMEOUT]]], failureCtx=FailureContext [type=SYSTEM_WORKER_BLOCKED, err=class o.a.i.IgniteException: GridWorker [name=db-checkpoint-thread, igniteInstanceName=null, finished=false, heartbeatTs=1657047082073]]]
>[20:07:27,073][SEVERE][tcp-disco-msg-worker-[crd]-#2-#48][G] Blocked system-critical thread has been detected. This can lead to cluster-wide undefined behaviour [workerName=sys-stripe-0, threadName=sys-stripe-0-#1, blockedFor=4072s]
> 
>Thread [name="qtp2015455415-61", id=61, state=TIMED_WAITING, blockCnt=1, waitCnt=702]
>    Lock [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@78322a4, ownerName=null, ownerId=-1]
>        at sun.misc.Unsafe.park(Native Method)
>        at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
>        at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
>        at org.eclipse.jetty.util.BlockingArrayQueue.poll(BlockingArrayQueue.java:382)
>        at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.idleJobPoll(QueuedThreadPool.java:973)
>        at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1023)
>        at java.lang.Thread.run(Thread.java:748)
> 
>Thread [name="qtp2015455415-60", id=60, state=RUNNABLE, blockCnt=3, waitCnt=679]
>        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
>        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
>        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:93)
>        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:86)
>        - locked sun.nio.ch.Util$3@7a7b4390
>        - locked java.util.Collections$UnmodifiableSet@220c68db
>        - locked sun.nio.ch.EPollSelectorImpl@4f569077
>        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:97)
>        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:101)
>        at org.eclipse.jetty.io.ManagedSelector.nioSelect(ManagedSelector.java:183)
>        at org.eclipse.jetty.io.ManagedSelector.select(ManagedSelector.java:190)
>        at org.eclipse.jetty.io.ManagedSelector$SelectorProducer.select(ManagedSelector.java:606)
>        at org.eclipse.jetty.io.ManagedSelector$SelectorProducer.produce(ManagedSelector.java:543)
>        at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.produceTask(EatWhatYouKill.java:360)
>        at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:184)
>        at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:171)
>        at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:129)
>        at org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:383)
>        at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:882)
>        at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1036)
>        at java.lang.Thread.run(Thread.java:748)
> 
>Thread [name="qtp2015455415-59", id=59, state=TIMED_WAITING, blockCnt=1, waitCnt=684]
>    Lock [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@78322a4, ownerName=null, ownerId=-1]
>        at sun.misc.Unsafe.park(Native Method)
>        at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
>        at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
>        at org.eclipse.jetty.util.BlockingArrayQueue.poll(BlockingArrayQueue.java:382)
>        at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.idleJobPoll(QueuedThreadPool.java:973)
>        at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1023)
>        at java.lang.Thread.run(Thread.java:748)
> 
>Thread [name="qtp2015455415-58", id=58, state=TIMED_WAITING, blockCnt=1, waitCnt=688]
>    Lock [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@78322a4, ownerName=null, ownerId=-1]
>        at sun.misc.Unsafe.park(Native Method)
>        at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
>        at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
>        at org.eclipse.jetty.util.BlockingArrayQueue.poll(BlockingArrayQueue.java:382)
>        at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.idleJobPoll(QueuedThreadPool.java:973)
>        at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1023)         at java.lang.Thread.run(Thread.java:748)