You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@ignite.apache.org by "Alexander Lapin (Jira)" <ji...@apache.org> on 2020/07/13 15:17:00 UTC

[jira] [Updated] (IGNITE-13251) Deadlock between grid-timeout-worker and a thread opening a communication connection.

     [ https://issues.apache.org/jira/browse/IGNITE-13251?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Alexander Lapin updated IGNITE-13251:
-------------------------------------
    Description: 
`grid-timeout-worker` is known to deadlock with other threads in a few scenarios.

The general scheme is:
 1. A thread `T` is holding lock `L` and is trying to establish a communication connection, hanging in `safeTcpHandshake` method. Due to the logic of `safeTcpHandshake`, `grid-timeout-worker` needs to send a signal to `T` in order for it to proceed.

2. `grid-timeout-worker` is trying to acquire `L`. Hence, the deadlock.

It may include more threads. The lock `L` can be different: checkpoint lock, GridCacheMapEntry lock, etc.
 ## Example

The example below shows a deadlock between:
 * `grid-timeout-worker` trying to acquire a cp read lock in a `dumpLongRunningTransactions`

 * `tcp-comm-worker` trying to establish a connection but hanging on socket read due to unstable network

 * checkpointer trying to start a checkpoint and acquire cp write lock

 * `utility` worker waiting for the connection to be established by `tcp-comm-worker` while holding cp read lock

{code:java}
Thread [name="grid-timeout-worker-#23", id=42, state=WAITING, blockCnt=6991, waitCnt=1746467]
    Lock [object=java.util.concurrent.locks.ReentrantReadWriteLock$NonfairSync@4545a8b9, ownerName=null, ownerId=-1]
        at sun.misc.Unsafe.park(Native Method)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireShared(AbstractQueuedSynchronizer.java:967)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireShared(AbstractQueuedSynchronizer.java:1283)
        at java.util.concurrent.locks.ReentrantReadWriteLock$ReadLock.lock(ReentrantReadWriteLock.java:727)
        at o.a.i.i.processors.cache.persistence.GridCacheDatabaseSharedManager.checkpointReadLock(GridCacheDatabaseSharedManager.java:1707)
        at o.a.i.i.processors.cache.distributed.dht.GridPartitionedSingleGetFuture.setResult(GridPartitionedSingleGetFuture.java:715)
        at o.a.i.i.processors.cache.distributed.dht.GridPartitionedSingleGetFuture.localGet(GridPartitionedSingleGetFuture.java:511)
        at o.a.i.i.processors.cache.distributed.dht.GridPartitionedSingleGetFuture.tryLocalGet(GridPartitionedSingleGetFuture.java:399)
        at o.a.i.i.processors.cache.distributed.dht.GridPartitionedSingleGetFuture.mapKeyToNode(GridPartitionedSingleGetFuture.java:366)
        at o.a.i.i.processors.cache.distributed.dht.GridPartitionedSingleGetFuture.map(GridPartitionedSingleGetFuture.java:243)
        at o.a.i.i.processors.cache.distributed.dht.GridPartitionedSingleGetFuture.init(GridPartitionedSingleGetFuture.java:232)
        at o.a.i.i.processors.cache.distributed.dht.colocated.GridDhtColocatedCache.getAsync(GridDhtColocatedCache.java:246)
        at o.a.i.i.processors.cache.GridCacheAdapter.get0(GridCacheAdapter.java:4190)
        at o.a.i.i.processors.cache.GridCacheAdapter.get(GridCacheAdapter.java:4171)
        at o.a.i.i.processors.cache.GridCacheAdapter.get(GridCacheAdapter.java:1362)
        at o.a.i.i.processors.task.GridTaskProcessor.saveTaskMetadata(GridTaskProcessor.java:908)
        at o.a.i.i.processors.task.GridTaskProcessor.startTask(GridTaskProcessor.java:746)
        at o.a.i.i.processors.task.GridTaskProcessor.execute(GridTaskProcessor.java:477)
        at o.a.i.i.processors.closure.GridClosureProcessor.callAsync(GridClosureProcessor.java:674)
        at o.a.i.i.processors.closure.GridClosureProcessor.callAsync(GridClosureProcessor.java:479)
        at o.a.i.i.IgniteComputeImpl.callAsync0(IgniteComputeImpl.java:809)
        at o.a.i.i.IgniteComputeImpl.callAsync(IgniteComputeImpl.java:794)
        at o.a.i.i.processors.cache.GridCachePartitionExchangeManager.dumpLongRunningTransaction(GridCachePartitionExchangeManager.java:2115)
        at o.a.i.i.processors.cache.GridCachePartitionExchangeManager.dumpLongRunningOperations0(GridCachePartitionExchangeManager.java:2012)
        - locked o.a.i.i.processors.cache.GridCachePartitionExchangeManager$ActionLimiter@47b86e8f
        at o.a.i.i.processors.cache.GridCachePartitionExchangeManager.dumpLongRunningOperations(GridCachePartitionExchangeManager.java:2180)
        at o.a.i.i.IgniteKernal$4.run(IgniteKernal.java:1478)
        at o.a.i.i.processors.timeout.GridTimeoutProcessor$CancelableTask.onTimeout(GridTimeoutProcessor.java:410)
        - locked o.a.i.i.processors.timeout.GridTimeoutProcessor$CancelableTask@67c951c7
        at o.a.i.i.processors.timeout.GridTimeoutProcessor$TimeoutWorker.body(GridTimeoutProcessor.java:279)
        at o.a.i.i.util.worker.GridWorker.run(GridWorker.java:120)
        at java.lang.Thread.run(Thread.java:748)

Thread [name="tcp-comm-worker-#1", id=50, state=RUNNABLE, blockCnt=16, waitCnt=6712]
        at sun.nio.ch.FileDispatcherImpl.read0(Native Method)
        at sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:39)
        at sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:223)
        at sun.nio.ch.IOUtil.read(IOUtil.java:197)
        at sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:377)
        - locked java.lang.Object@71681f0c
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi.safeTcpHandshake(TcpCommunicationSpi.java:3906)
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi.createTcpClient(TcpCommunicationSpi.java:3420)
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi.createNioClient(TcpCommunicationSpi.java:3055)
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi.reserveClient(TcpCommunicationSpi.java:2908)
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi.access$6300(TcpCommunicationSpi.java:272)
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi$CommunicationWorker.processDisconnect(TcpCommunicationSpi.java:4603)
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi$CommunicationWorker.body(TcpCommunicationSpi.java:4408)
        at o.a.i.i.util.worker.GridWorker.run(GridWorker.java:120)
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi$7.body(TcpCommunicationSpi.java:2261)
        at o.a.i.spi.IgniteSpiThread.run(IgniteSpiThread.java:62)

Thread [name="db-checkpoint-thread-#69", id=144, state=WAITING, blockCnt=0, waitCnt=12657468]
    Lock [object=java.util.concurrent.locks.ReentrantReadWriteLock$NonfairSync@4545a8b9, ownerName=null, ownerId=-1]
        at sun.misc.Unsafe.park(Native Method)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(AbstractQueuedSynchronizer.java:870)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(AbstractQueuedSynchronizer.java:1199)
        at java.util.concurrent.locks.ReentrantReadWriteLock$WriteLock.lock(ReentrantReadWriteLock.java:943)
        at o.a.i.i.processors.cache.persistence.GridCacheDatabaseSharedManager$Checkpointer.markCheckpointBegin(GridCacheDatabaseSharedManager.java:4059)
        at o.a.i.i.processors.cache.persistence.GridCacheDatabaseSharedManager$Checkpointer.doCheckpoint(GridCacheDatabaseSharedManager.java:3637)
        at o.a.i.i.processors.cache.persistence.GridCacheDatabaseSharedManager$Checkpointer.body(GridCacheDatabaseSharedManager.java:3522)
        at o.a.i.i.util.worker.GridWorker.run(GridWorker.java:120)
        at java.lang.Thread.run(Thread.java:748)

Thread [name="utility-#53812", id=59375, state=TIMED_WAITING, blockCnt=0, waitCnt=1433]
        at sun.misc.Unsafe.park(Native Method)
        at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:338)
        at o.a.i.i.util.future.GridFutureAdapter.get0(GridFutureAdapter.java:219)
        at o.a.i.i.util.future.GridFutureAdapter.get(GridFutureAdapter.java:160)
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi.reserveClient(TcpCommunicationSpi.java:2959)
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi.sendMessage0(TcpCommunicationSpi.java:2751)
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi.sendMessage(TcpCommunicationSpi.java:2710)
        at o.a.i.i.managers.communication.GridIoManager.send(GridIoManager.java:1643)
        at o.a.i.i.managers.communication.GridIoManager.sendOrderedMessage(GridIoManager.java:1863)
        at o.a.i.i.processors.continuous.GridContinuousProcessor.sendWithRetries(GridContinuousProcessor.java:1873)
        at o.a.i.i.processors.continuous.GridContinuousProcessor.sendWithRetries(GridContinuousProcessor.java:1844)
        at o.a.i.i.processors.continuous.GridContinuousProcessor.sendWithRetries(GridContinuousProcessor.java:1826)
        at o.a.i.i.processors.continuous.GridContinuousProcessor.sendNotification(GridContinuousProcessor.java:1244)
        at o.a.i.i.processors.continuous.GridContinuousProcessor.addNotification(GridContinuousProcessor.java:1181)
        at o.a.i.i.processors.cache.query.continuous.CacheContinuousQueryHandler.onEntryUpdate(CacheContinuousQueryHandler.java:882)
        at o.a.i.i.processors.cache.query.continuous.CacheContinuousQueryHandler.access$600(CacheContinuousQueryHandler.java:85)
        at o.a.i.i.processors.cache.query.continuous.CacheContinuousQueryHandler$2.onEntryUpdated(CacheContinuousQueryHandler.java:429)
        at o.a.i.i.processors.cache.query.continuous.CacheContinuousQueryManager.onEntryUpdated(CacheContinuousQueryManager.java:448)
        at o.a.i.i.processors.cache.GridCacheMapEntry.innerSet(GridCacheMapEntry.java:1115)
        at o.a.i.i.processors.cache.transactions.IgniteTxLocalAdapter.userCommit(IgniteTxLocalAdapter.java:747)
        at o.a.i.i.processors.cache.distributed.dht.GridDhtTxLocalAdapter.localFinish(GridDhtTxLocalAdapter.java:796)
        at o.a.i.i.processors.cache.distributed.dht.GridDhtTxLocal.localFinish(GridDhtTxLocal.java:603)
        at o.a.i.i.processors.cache.distributed.dht.GridDhtTxLocal.finishTx(GridDhtTxLocal.java:475)
        at o.a.i.i.processors.cache.distributed.dht.GridDhtTxLocal.commitDhtLocalAsync(GridDhtTxLocal.java:532)
        at o.a.i.i.processors.cache.transactions.IgniteTxHandler.finishDhtLocal(IgniteTxHandler.java:1042)
        at o.a.i.i.processors.cache.transactions.IgniteTxHandler.finish(IgniteTxHandler.java:921)
        at o.a.i.i.processors.cache.transactions.IgniteTxHandler.processNearTxFinishRequest(IgniteTxHandler.java:877)
        at o.a.i.i.processors.cache.transactions.IgniteTxHandler.access$200(IgniteTxHandler.java:109)
        at o.a.i.i.processors.cache.transactions.IgniteTxHandler$3.apply(IgniteTxHandler.java:203)
        at o.a.i.i.processors.cache.transactions.IgniteTxHandler$3.apply(IgniteTxHandler.java:201)
        at o.a.i.i.processors.cache.GridCacheIoManager.processMessage(GridCacheIoManager.java:1078)
        at o.a.i.i.processors.cache.GridCacheIoManager.onMessage0(GridCacheIoManager.java:587)
        at o.a.i.i.processors.cache.GridCacheIoManager.handleMessage(GridCacheIoManager.java:386)
        at o.a.i.i.processors.cache.GridCacheIoManager.handleMessage(GridCacheIoManager.java:312)
        at o.a.i.i.processors.cache.GridCacheIoManager.access$100(GridCacheIoManager.java:102)
        at o.a.i.i.processors.cache.GridCacheIoManager$1.onMessage(GridCacheIoManager.java:301)
        at o.a.i.i.managers.communication.GridIoManager.invokeListener(GridIoManager.java:1556)
        at o.a.i.i.managers.communication.GridIoManager.processRegularMessage0(GridIoManager.java:1184)
        at o.a.i.i.managers.communication.GridIoManager.access$4200(GridIoManager.java:125)
        at o.a.i.i.managers.communication.GridIoManager$9.run(GridIoManager.java:1091)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)
{code}
{color:#172b4d}{{}}{color}

  was:
rid-timeout-worker is known to go into a deadlock state with other threads in a few scenarios.

The general scheme is:
1. A thread `T` is holding lock `L` and is trying to establish a communication connection, hanging in `safeTcpHandshake` method. Due to the logic of `safeTcpHandshake`, `grid-timeout-worker` needs to send a signal to `T` in order for it to proceed.

2. `grid-timeout-worker` is trying to acquire `L`. Hence, the deadlock.

It may include more threads. The lock `L` can be different: checkpoint lock, GridCacheMapEntry lock, etc.
 #  

 ## Example

The below example shows a lock between
 * `grid-timeout-worker` trying to acquire a cp read lock in a `dumpLongRunningTransactions`

 * `tcp-comm-worker` trying to establish a connection but hanging on socket read due to unstable network

 * checkpointer trying to start a checkpoint and acquire cp write lock

 * `utility` worker waiting for the connection to be established by `tcp-comm-worker` while holding cp read lock

{code:java}
Thread [name="grid-timeout-worker-#23", id=42, state=WAITING, blockCnt=6991, waitCnt=1746467]
    Lock [object=java.util.concurrent.locks.ReentrantReadWriteLock$NonfairSync@4545a8b9, ownerName=null, ownerId=-1]
        at sun.misc.Unsafe.park(Native Method)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireShared(AbstractQueuedSynchronizer.java:967)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireShared(AbstractQueuedSynchronizer.java:1283)
        at java.util.concurrent.locks.ReentrantReadWriteLock$ReadLock.lock(ReentrantReadWriteLock.java:727)
        at o.a.i.i.processors.cache.persistence.GridCacheDatabaseSharedManager.checkpointReadLock(GridCacheDatabaseSharedManager.java:1707)
        at o.a.i.i.processors.cache.distributed.dht.GridPartitionedSingleGetFuture.setResult(GridPartitionedSingleGetFuture.java:715)
        at o.a.i.i.processors.cache.distributed.dht.GridPartitionedSingleGetFuture.localGet(GridPartitionedSingleGetFuture.java:511)
        at o.a.i.i.processors.cache.distributed.dht.GridPartitionedSingleGetFuture.tryLocalGet(GridPartitionedSingleGetFuture.java:399)
        at o.a.i.i.processors.cache.distributed.dht.GridPartitionedSingleGetFuture.mapKeyToNode(GridPartitionedSingleGetFuture.java:366)
        at o.a.i.i.processors.cache.distributed.dht.GridPartitionedSingleGetFuture.map(GridPartitionedSingleGetFuture.java:243)
        at o.a.i.i.processors.cache.distributed.dht.GridPartitionedSingleGetFuture.init(GridPartitionedSingleGetFuture.java:232)
        at o.a.i.i.processors.cache.distributed.dht.colocated.GridDhtColocatedCache.getAsync(GridDhtColocatedCache.java:246)
        at o.a.i.i.processors.cache.GridCacheAdapter.get0(GridCacheAdapter.java:4190)
        at o.a.i.i.processors.cache.GridCacheAdapter.get(GridCacheAdapter.java:4171)
        at o.a.i.i.processors.cache.GridCacheAdapter.get(GridCacheAdapter.java:1362)
        at o.a.i.i.processors.task.GridTaskProcessor.saveTaskMetadata(GridTaskProcessor.java:908)
        at o.a.i.i.processors.task.GridTaskProcessor.startTask(GridTaskProcessor.java:746)
        at o.a.i.i.processors.task.GridTaskProcessor.execute(GridTaskProcessor.java:477)
        at o.a.i.i.processors.closure.GridClosureProcessor.callAsync(GridClosureProcessor.java:674)
        at o.a.i.i.processors.closure.GridClosureProcessor.callAsync(GridClosureProcessor.java:479)
        at o.a.i.i.IgniteComputeImpl.callAsync0(IgniteComputeImpl.java:809)
        at o.a.i.i.IgniteComputeImpl.callAsync(IgniteComputeImpl.java:794)
        at o.a.i.i.processors.cache.GridCachePartitionExchangeManager.dumpLongRunningTransaction(GridCachePartitionExchangeManager.java:2115)
        at o.a.i.i.processors.cache.GridCachePartitionExchangeManager.dumpLongRunningOperations0(GridCachePartitionExchangeManager.java:2012)
        - locked o.a.i.i.processors.cache.GridCachePartitionExchangeManager$ActionLimiter@47b86e8f
        at o.a.i.i.processors.cache.GridCachePartitionExchangeManager.dumpLongRunningOperations(GridCachePartitionExchangeManager.java:2180)
        at o.a.i.i.IgniteKernal$4.run(IgniteKernal.java:1478)
        at o.a.i.i.processors.timeout.GridTimeoutProcessor$CancelableTask.onTimeout(GridTimeoutProcessor.java:410)
        - locked o.a.i.i.processors.timeout.GridTimeoutProcessor$CancelableTask@67c951c7
        at o.a.i.i.processors.timeout.GridTimeoutProcessor$TimeoutWorker.body(GridTimeoutProcessor.java:279)
        at o.a.i.i.util.worker.GridWorker.run(GridWorker.java:120)
        at java.lang.Thread.run(Thread.java:748)

Thread [name="tcp-comm-worker-#1", id=50, state=RUNNABLE, blockCnt=16, waitCnt=6712]
        at sun.nio.ch.FileDispatcherImpl.read0(Native Method)
        at sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:39)
        at sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:223)
        at sun.nio.ch.IOUtil.read(IOUtil.java:197)
        at sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:377)
        - locked java.lang.Object@71681f0c
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi.safeTcpHandshake(TcpCommunicationSpi.java:3906)
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi.createTcpClient(TcpCommunicationSpi.java:3420)
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi.createNioClient(TcpCommunicationSpi.java:3055)
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi.reserveClient(TcpCommunicationSpi.java:2908)
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi.access$6300(TcpCommunicationSpi.java:272)
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi$CommunicationWorker.processDisconnect(TcpCommunicationSpi.java:4603)
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi$CommunicationWorker.body(TcpCommunicationSpi.java:4408)
        at o.a.i.i.util.worker.GridWorker.run(GridWorker.java:120)
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi$7.body(TcpCommunicationSpi.java:2261)
        at o.a.i.spi.IgniteSpiThread.run(IgniteSpiThread.java:62)

Thread [name="db-checkpoint-thread-#69", id=144, state=WAITING, blockCnt=0, waitCnt=12657468]
    Lock [object=java.util.concurrent.locks.ReentrantReadWriteLock$NonfairSync@4545a8b9, ownerName=null, ownerId=-1]
        at sun.misc.Unsafe.park(Native Method)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(AbstractQueuedSynchronizer.java:870)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(AbstractQueuedSynchronizer.java:1199)
        at java.util.concurrent.locks.ReentrantReadWriteLock$WriteLock.lock(ReentrantReadWriteLock.java:943)
        at o.a.i.i.processors.cache.persistence.GridCacheDatabaseSharedManager$Checkpointer.markCheckpointBegin(GridCacheDatabaseSharedManager.java:4059)
        at o.a.i.i.processors.cache.persistence.GridCacheDatabaseSharedManager$Checkpointer.doCheckpoint(GridCacheDatabaseSharedManager.java:3637)
        at o.a.i.i.processors.cache.persistence.GridCacheDatabaseSharedManager$Checkpointer.body(GridCacheDatabaseSharedManager.java:3522)
        at o.a.i.i.util.worker.GridWorker.run(GridWorker.java:120)
        at java.lang.Thread.run(Thread.java:748)

Thread [name="utility-#53812", id=59375, state=TIMED_WAITING, blockCnt=0, waitCnt=1433]
        at sun.misc.Unsafe.park(Native Method)
        at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:338)
        at o.a.i.i.util.future.GridFutureAdapter.get0(GridFutureAdapter.java:219)
        at o.a.i.i.util.future.GridFutureAdapter.get(GridFutureAdapter.java:160)
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi.reserveClient(TcpCommunicationSpi.java:2959)
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi.sendMessage0(TcpCommunicationSpi.java:2751)
        at o.a.i.spi.communication.tcp.TcpCommunicationSpi.sendMessage(TcpCommunicationSpi.java:2710)
        at o.a.i.i.managers.communication.GridIoManager.send(GridIoManager.java:1643)
        at o.a.i.i.managers.communication.GridIoManager.sendOrderedMessage(GridIoManager.java:1863)
        at o.a.i.i.processors.continuous.GridContinuousProcessor.sendWithRetries(GridContinuousProcessor.java:1873)
        at o.a.i.i.processors.continuous.GridContinuousProcessor.sendWithRetries(GridContinuousProcessor.java:1844)
        at o.a.i.i.processors.continuous.GridContinuousProcessor.sendWithRetries(GridContinuousProcessor.java:1826)
        at o.a.i.i.processors.continuous.GridContinuousProcessor.sendNotification(GridContinuousProcessor.java:1244)
        at o.a.i.i.processors.continuous.GridContinuousProcessor.addNotification(GridContinuousProcessor.java:1181)
        at o.a.i.i.processors.cache.query.continuous.CacheContinuousQueryHandler.onEntryUpdate(CacheContinuousQueryHandler.java:882)
        at o.a.i.i.processors.cache.query.continuous.CacheContinuousQueryHandler.access$600(CacheContinuousQueryHandler.java:85)
        at o.a.i.i.processors.cache.query.continuous.CacheContinuousQueryHandler$2.onEntryUpdated(CacheContinuousQueryHandler.java:429)
        at o.a.i.i.processors.cache.query.continuous.CacheContinuousQueryManager.onEntryUpdated(CacheContinuousQueryManager.java:448)
        at o.a.i.i.processors.cache.GridCacheMapEntry.innerSet(GridCacheMapEntry.java:1115)
        at o.a.i.i.processors.cache.transactions.IgniteTxLocalAdapter.userCommit(IgniteTxLocalAdapter.java:747)
        at o.a.i.i.processors.cache.distributed.dht.GridDhtTxLocalAdapter.localFinish(GridDhtTxLocalAdapter.java:796)
        at o.a.i.i.processors.cache.distributed.dht.GridDhtTxLocal.localFinish(GridDhtTxLocal.java:603)
        at o.a.i.i.processors.cache.distributed.dht.GridDhtTxLocal.finishTx(GridDhtTxLocal.java:475)
        at o.a.i.i.processors.cache.distributed.dht.GridDhtTxLocal.commitDhtLocalAsync(GridDhtTxLocal.java:532)
        at o.a.i.i.processors.cache.transactions.IgniteTxHandler.finishDhtLocal(IgniteTxHandler.java:1042)
        at o.a.i.i.processors.cache.transactions.IgniteTxHandler.finish(IgniteTxHandler.java:921)
        at o.a.i.i.processors.cache.transactions.IgniteTxHandler.processNearTxFinishRequest(IgniteTxHandler.java:877)
        at o.a.i.i.processors.cache.transactions.IgniteTxHandler.access$200(IgniteTxHandler.java:109)
        at o.a.i.i.processors.cache.transactions.IgniteTxHandler$3.apply(IgniteTxHandler.java:203)
        at o.a.i.i.processors.cache.transactions.IgniteTxHandler$3.apply(IgniteTxHandler.java:201)
        at o.a.i.i.processors.cache.GridCacheIoManager.processMessage(GridCacheIoManager.java:1078)
        at o.a.i.i.processors.cache.GridCacheIoManager.onMessage0(GridCacheIoManager.java:587)
        at o.a.i.i.processors.cache.GridCacheIoManager.handleMessage(GridCacheIoManager.java:386)
        at o.a.i.i.processors.cache.GridCacheIoManager.handleMessage(GridCacheIoManager.java:312)
        at o.a.i.i.processors.cache.GridCacheIoManager.access$100(GridCacheIoManager.java:102)
        at o.a.i.i.processors.cache.GridCacheIoManager$1.onMessage(GridCacheIoManager.java:301)
        at o.a.i.i.managers.communication.GridIoManager.invokeListener(GridIoManager.java:1556)
        at o.a.i.i.managers.communication.GridIoManager.processRegularMessage0(GridIoManager.java:1184)
        at o.a.i.i.managers.communication.GridIoManager.access$4200(GridIoManager.java:125)
        at o.a.i.i.managers.communication.GridIoManager$9.run(GridIoManager.java:1091)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)
{code}
{color:#172b4d}{{}}{color}


> Deadlock between grid-timeout-worker and a thread opening a communication connection.
> -------------------------------------------------------------------------------------
>
>                 Key: IGNITE-13251
>                 URL: https://issues.apache.org/jira/browse/IGNITE-13251
>             Project: Ignite
>          Issue Type: Bug
>            Reporter: Alexander Lapin
>            Priority: Major
>
> `grid-timeout-worker` is known to deadlock with other threads in a few scenarios.
> The general scheme is:
>  1. A thread `T` is holding lock `L` and is trying to establish a communication connection, hanging in `safeTcpHandshake` method. Due to the logic of `safeTcpHandshake`, `grid-timeout-worker` needs to send a signal to `T` in order for it to proceed.
> 2. `grid-timeout-worker` is trying to acquire `L`. Hence, the deadlock.
> It may include more threads. The lock `L` can be different: checkpoint lock, GridCacheMapEntry lock, etc.
>  ## Example
> The example below shows a deadlock between:
>  * `grid-timeout-worker` trying to acquire a cp read lock in a `dumpLongRunningTransactions`
>  * `tcp-comm-worker` trying to establish a connection but hanging on socket read due to unstable network
>  * checkpointer trying to start a checkpoint and acquire cp write lock
>  * `utility` worker waiting for the connection to be established by `tcp-comm-worker` while holding cp read lock
> {code:java}
> Thread [name="grid-timeout-worker-#23", id=42, state=WAITING, blockCnt=6991, waitCnt=1746467]
>     Lock [object=java.util.concurrent.locks.ReentrantReadWriteLock$NonfairSync@4545a8b9, ownerName=null, ownerId=-1]
>         at sun.misc.Unsafe.park(Native Method)
>         at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
>         at java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836)
>         at java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireShared(AbstractQueuedSynchronizer.java:967)
>         at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireShared(AbstractQueuedSynchronizer.java:1283)
>         at java.util.concurrent.locks.ReentrantReadWriteLock$ReadLock.lock(ReentrantReadWriteLock.java:727)
>         at o.a.i.i.processors.cache.persistence.GridCacheDatabaseSharedManager.checkpointReadLock(GridCacheDatabaseSharedManager.java:1707)
>         at o.a.i.i.processors.cache.distributed.dht.GridPartitionedSingleGetFuture.setResult(GridPartitionedSingleGetFuture.java:715)
>         at o.a.i.i.processors.cache.distributed.dht.GridPartitionedSingleGetFuture.localGet(GridPartitionedSingleGetFuture.java:511)
>         at o.a.i.i.processors.cache.distributed.dht.GridPartitionedSingleGetFuture.tryLocalGet(GridPartitionedSingleGetFuture.java:399)
>         at o.a.i.i.processors.cache.distributed.dht.GridPartitionedSingleGetFuture.mapKeyToNode(GridPartitionedSingleGetFuture.java:366)
>         at o.a.i.i.processors.cache.distributed.dht.GridPartitionedSingleGetFuture.map(GridPartitionedSingleGetFuture.java:243)
>         at o.a.i.i.processors.cache.distributed.dht.GridPartitionedSingleGetFuture.init(GridPartitionedSingleGetFuture.java:232)
>         at o.a.i.i.processors.cache.distributed.dht.colocated.GridDhtColocatedCache.getAsync(GridDhtColocatedCache.java:246)
>         at o.a.i.i.processors.cache.GridCacheAdapter.get0(GridCacheAdapter.java:4190)
>         at o.a.i.i.processors.cache.GridCacheAdapter.get(GridCacheAdapter.java:4171)
>         at o.a.i.i.processors.cache.GridCacheAdapter.get(GridCacheAdapter.java:1362)
>         at o.a.i.i.processors.task.GridTaskProcessor.saveTaskMetadata(GridTaskProcessor.java:908)
>         at o.a.i.i.processors.task.GridTaskProcessor.startTask(GridTaskProcessor.java:746)
>         at o.a.i.i.processors.task.GridTaskProcessor.execute(GridTaskProcessor.java:477)
>         at o.a.i.i.processors.closure.GridClosureProcessor.callAsync(GridClosureProcessor.java:674)
>         at o.a.i.i.processors.closure.GridClosureProcessor.callAsync(GridClosureProcessor.java:479)
>         at o.a.i.i.IgniteComputeImpl.callAsync0(IgniteComputeImpl.java:809)
>         at o.a.i.i.IgniteComputeImpl.callAsync(IgniteComputeImpl.java:794)
>         at o.a.i.i.processors.cache.GridCachePartitionExchangeManager.dumpLongRunningTransaction(GridCachePartitionExchangeManager.java:2115)
>         at o.a.i.i.processors.cache.GridCachePartitionExchangeManager.dumpLongRunningOperations0(GridCachePartitionExchangeManager.java:2012)
>         - locked o.a.i.i.processors.cache.GridCachePartitionExchangeManager$ActionLimiter@47b86e8f
>         at o.a.i.i.processors.cache.GridCachePartitionExchangeManager.dumpLongRunningOperations(GridCachePartitionExchangeManager.java:2180)
>         at o.a.i.i.IgniteKernal$4.run(IgniteKernal.java:1478)
>         at o.a.i.i.processors.timeout.GridTimeoutProcessor$CancelableTask.onTimeout(GridTimeoutProcessor.java:410)
>         - locked o.a.i.i.processors.timeout.GridTimeoutProcessor$CancelableTask@67c951c7
>         at o.a.i.i.processors.timeout.GridTimeoutProcessor$TimeoutWorker.body(GridTimeoutProcessor.java:279)
>         at o.a.i.i.util.worker.GridWorker.run(GridWorker.java:120)
>         at java.lang.Thread.run(Thread.java:748)
> Thread [name="tcp-comm-worker-#1", id=50, state=RUNNABLE, blockCnt=16, waitCnt=6712]
>         at sun.nio.ch.FileDispatcherImpl.read0(Native Method)
>         at sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:39)
>         at sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:223)
>         at sun.nio.ch.IOUtil.read(IOUtil.java:197)
>         at sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:377)
>         - locked java.lang.Object@71681f0c
>         at o.a.i.spi.communication.tcp.TcpCommunicationSpi.safeTcpHandshake(TcpCommunicationSpi.java:3906)
>         at o.a.i.spi.communication.tcp.TcpCommunicationSpi.createTcpClient(TcpCommunicationSpi.java:3420)
>         at o.a.i.spi.communication.tcp.TcpCommunicationSpi.createNioClient(TcpCommunicationSpi.java:3055)
>         at o.a.i.spi.communication.tcp.TcpCommunicationSpi.reserveClient(TcpCommunicationSpi.java:2908)
>         at o.a.i.spi.communication.tcp.TcpCommunicationSpi.access$6300(TcpCommunicationSpi.java:272)
>         at o.a.i.spi.communication.tcp.TcpCommunicationSpi$CommunicationWorker.processDisconnect(TcpCommunicationSpi.java:4603)
>         at o.a.i.spi.communication.tcp.TcpCommunicationSpi$CommunicationWorker.body(TcpCommunicationSpi.java:4408)
>         at o.a.i.i.util.worker.GridWorker.run(GridWorker.java:120)
>         at o.a.i.spi.communication.tcp.TcpCommunicationSpi$7.body(TcpCommunicationSpi.java:2261)
>         at o.a.i.spi.IgniteSpiThread.run(IgniteSpiThread.java:62)
> Thread [name="db-checkpoint-thread-#69", id=144, state=WAITING, blockCnt=0, waitCnt=12657468]
>     Lock [object=java.util.concurrent.locks.ReentrantReadWriteLock$NonfairSync@4545a8b9, ownerName=null, ownerId=-1]
>         at sun.misc.Unsafe.park(Native Method)
>         at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
>         at java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836)
>         at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(AbstractQueuedSynchronizer.java:870)
>         at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(AbstractQueuedSynchronizer.java:1199)
>         at java.util.concurrent.locks.ReentrantReadWriteLock$WriteLock.lock(ReentrantReadWriteLock.java:943)
>         at o.a.i.i.processors.cache.persistence.GridCacheDatabaseSharedManager$Checkpointer.markCheckpointBegin(GridCacheDatabaseSharedManager.java:4059)
>         at o.a.i.i.processors.cache.persistence.GridCacheDatabaseSharedManager$Checkpointer.doCheckpoint(GridCacheDatabaseSharedManager.java:3637)
>         at o.a.i.i.processors.cache.persistence.GridCacheDatabaseSharedManager$Checkpointer.body(GridCacheDatabaseSharedManager.java:3522)
>         at o.a.i.i.util.worker.GridWorker.run(GridWorker.java:120)
>         at java.lang.Thread.run(Thread.java:748)
> Thread [name="utility-#53812", id=59375, state=TIMED_WAITING, blockCnt=0, waitCnt=1433]
>         at sun.misc.Unsafe.park(Native Method)
>         at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:338)
>         at o.a.i.i.util.future.GridFutureAdapter.get0(GridFutureAdapter.java:219)
>         at o.a.i.i.util.future.GridFutureAdapter.get(GridFutureAdapter.java:160)
>         at o.a.i.spi.communication.tcp.TcpCommunicationSpi.reserveClient(TcpCommunicationSpi.java:2959)
>         at o.a.i.spi.communication.tcp.TcpCommunicationSpi.sendMessage0(TcpCommunicationSpi.java:2751)
>         at o.a.i.spi.communication.tcp.TcpCommunicationSpi.sendMessage(TcpCommunicationSpi.java:2710)
>         at o.a.i.i.managers.communication.GridIoManager.send(GridIoManager.java:1643)
>         at o.a.i.i.managers.communication.GridIoManager.sendOrderedMessage(GridIoManager.java:1863)
>         at o.a.i.i.processors.continuous.GridContinuousProcessor.sendWithRetries(GridContinuousProcessor.java:1873)
>         at o.a.i.i.processors.continuous.GridContinuousProcessor.sendWithRetries(GridContinuousProcessor.java:1844)
>         at o.a.i.i.processors.continuous.GridContinuousProcessor.sendWithRetries(GridContinuousProcessor.java:1826)
>         at o.a.i.i.processors.continuous.GridContinuousProcessor.sendNotification(GridContinuousProcessor.java:1244)
>         at o.a.i.i.processors.continuous.GridContinuousProcessor.addNotification(GridContinuousProcessor.java:1181)
>         at o.a.i.i.processors.cache.query.continuous.CacheContinuousQueryHandler.onEntryUpdate(CacheContinuousQueryHandler.java:882)
>         at o.a.i.i.processors.cache.query.continuous.CacheContinuousQueryHandler.access$600(CacheContinuousQueryHandler.java:85)
>         at o.a.i.i.processors.cache.query.continuous.CacheContinuousQueryHandler$2.onEntryUpdated(CacheContinuousQueryHandler.java:429)
>         at o.a.i.i.processors.cache.query.continuous.CacheContinuousQueryManager.onEntryUpdated(CacheContinuousQueryManager.java:448)
>         at o.a.i.i.processors.cache.GridCacheMapEntry.innerSet(GridCacheMapEntry.java:1115)
>         at o.a.i.i.processors.cache.transactions.IgniteTxLocalAdapter.userCommit(IgniteTxLocalAdapter.java:747)
>         at o.a.i.i.processors.cache.distributed.dht.GridDhtTxLocalAdapter.localFinish(GridDhtTxLocalAdapter.java:796)
>         at o.a.i.i.processors.cache.distributed.dht.GridDhtTxLocal.localFinish(GridDhtTxLocal.java:603)
>         at o.a.i.i.processors.cache.distributed.dht.GridDhtTxLocal.finishTx(GridDhtTxLocal.java:475)
>         at o.a.i.i.processors.cache.distributed.dht.GridDhtTxLocal.commitDhtLocalAsync(GridDhtTxLocal.java:532)
>         at o.a.i.i.processors.cache.transactions.IgniteTxHandler.finishDhtLocal(IgniteTxHandler.java:1042)
>         at o.a.i.i.processors.cache.transactions.IgniteTxHandler.finish(IgniteTxHandler.java:921)
>         at o.a.i.i.processors.cache.transactions.IgniteTxHandler.processNearTxFinishRequest(IgniteTxHandler.java:877)
>         at o.a.i.i.processors.cache.transactions.IgniteTxHandler.access$200(IgniteTxHandler.java:109)
>         at o.a.i.i.processors.cache.transactions.IgniteTxHandler$3.apply(IgniteTxHandler.java:203)
>         at o.a.i.i.processors.cache.transactions.IgniteTxHandler$3.apply(IgniteTxHandler.java:201)
>         at o.a.i.i.processors.cache.GridCacheIoManager.processMessage(GridCacheIoManager.java:1078)
>         at o.a.i.i.processors.cache.GridCacheIoManager.onMessage0(GridCacheIoManager.java:587)
>         at o.a.i.i.processors.cache.GridCacheIoManager.handleMessage(GridCacheIoManager.java:386)
>         at o.a.i.i.processors.cache.GridCacheIoManager.handleMessage(GridCacheIoManager.java:312)
>         at o.a.i.i.processors.cache.GridCacheIoManager.access$100(GridCacheIoManager.java:102)
>         at o.a.i.i.processors.cache.GridCacheIoManager$1.onMessage(GridCacheIoManager.java:301)
>         at o.a.i.i.managers.communication.GridIoManager.invokeListener(GridIoManager.java:1556)
>         at o.a.i.i.managers.communication.GridIoManager.processRegularMessage0(GridIoManager.java:1184)
>         at o.a.i.i.managers.communication.GridIoManager.access$4200(GridIoManager.java:125)
>         at o.a.i.i.managers.communication.GridIoManager$9.run(GridIoManager.java:1091)
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>         at java.lang.Thread.run(Thread.java:748)
> {code}
>



--
This message was sent by Atlassian Jira
(v8.3.4#803005)