You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ignite.apache.org by av...@apache.org on 2022/08/17 14:52:15 UTC
[ignite] branch master updated: IGNITE-17457 Fix cluster lock after tx recovery (#10178)
This is an automated email from the ASF dual-hosted git repository.
av pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ignite.git
The following commit(s) were added to refs/heads/master by this push:
new 2b31b9d3743 IGNITE-17457 Fix cluster lock after tx recovery (#10178)
2b31b9d3743 is described below
commit 2b31b9d374343f9bd960c89edebf97c1a40d93fe
Author: Sergey Korotkov <se...@gmail.com>
AuthorDate: Wed Aug 17 21:52:07 2022 +0700
IGNITE-17457 Fix cluster lock after tx recovery (#10178)
---
.../cache/transactions/IgniteTxAdapter.java | 6 +-
.../transactions/TxRecoveryConcurrentTest.java | 177 +++++++++++++++++++++
.../ignite/testsuites/IgniteCacheTestSuite12.java | 2 +
3 files changed, 181 insertions(+), 4 deletions(-)
diff --git a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/transactions/IgniteTxAdapter.java b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/transactions/IgniteTxAdapter.java
index 11b2c040a62..0fe1da5b8e7 100644
--- a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/transactions/IgniteTxAdapter.java
+++ b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/transactions/IgniteTxAdapter.java
@@ -596,14 +596,12 @@ public abstract class IgniteTxAdapter extends GridMetadataAwareAdapter implement
switch (status) {
case USER_FINISH:
- res = FINALIZING_UPD.compareAndSet(this, FinalizationStatus.NONE, FinalizationStatus.USER_FINISH);
+ res = FINALIZING_UPD.compareAndSet(this, FinalizationStatus.NONE, status);
break;
case RECOVERY_FINISH:
- FinalizationStatus old = finalizing;
-
- res = old != FinalizationStatus.USER_FINISH && FINALIZING_UPD.compareAndSet(this, old, status);
+ res = FINALIZING_UPD.compareAndSet(this, FinalizationStatus.NONE, status) || finalizing == status;
break;
diff --git a/modules/core/src/test/java/org/apache/ignite/internal/processors/cache/transactions/TxRecoveryConcurrentTest.java b/modules/core/src/test/java/org/apache/ignite/internal/processors/cache/transactions/TxRecoveryConcurrentTest.java
new file mode 100644
index 00000000000..970c56c43d2
--- /dev/null
+++ b/modules/core/src/test/java/org/apache/ignite/internal/processors/cache/transactions/TxRecoveryConcurrentTest.java
@@ -0,0 +1,177 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.ignite.internal.processors.cache.transactions;
+
+import java.util.Collection;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ThreadPoolExecutor;
+import org.apache.ignite.Ignite;
+import org.apache.ignite.IgniteCache;
+import org.apache.ignite.cache.affinity.rendezvous.RendezvousAffinityFunction;
+import org.apache.ignite.configuration.CacheConfiguration;
+import org.apache.ignite.configuration.IgniteConfiguration;
+import org.apache.ignite.failure.StopNodeFailureHandler;
+import org.apache.ignite.internal.IgniteEx;
+import org.apache.ignite.internal.IgniteInterruptedCheckedException;
+import org.apache.ignite.internal.TestRecordingCommunicationSpi;
+import org.apache.ignite.internal.util.typedef.G;
+import org.apache.ignite.testframework.GridTestUtils;
+import org.apache.ignite.testframework.junits.common.GridCommonAbstractTest;
+import org.apache.ignite.transactions.TransactionState;
+import org.junit.Test;
+
+import static org.apache.ignite.cache.CacheAtomicityMode.TRANSACTIONAL;
+import static org.apache.ignite.cache.CacheMode.PARTITIONED;
+import static org.apache.ignite.testframework.GridTestUtils.runAsync;
+
+/**
+ * Tests concurrent execution of the tx recovery.
+ */
+public class TxRecoveryConcurrentTest extends GridCommonAbstractTest {
+ /** {@inheritDoc} */
+ @Override protected IgniteConfiguration getConfiguration(String name) throws Exception {
+ final IgniteConfiguration cfg = super.getConfiguration(name);
+
+ cfg.setConsistentId(name);
+
+ cfg.setCommunicationSpi(new TestRecordingCommunicationSpi());
+
+ cfg.setFailureHandler(new StopNodeFailureHandler());
+
+ cfg.setSystemThreadPoolSize(1); // Single thread, so the one blocker task submitted by the test stalls the whole system pool.
+ cfg.setStripedPoolSize(1); // Single stripe: the test blocks stripe 0 and polls its queue size.
+
+ cfg.setCacheConfiguration(new CacheConfiguration<>(DEFAULT_CACHE_NAME)
+ .setCacheMode(PARTITIONED)
+ .setBackups(2) // The test scenario needs two backup nodes for the key.
+ .setAtomicityMode(TRANSACTIONAL)
+ .setReadFromBackup(true)
+ .setAffinity(new RendezvousAffinityFunction(false, 1))); // Presumably (exclNeighbors, partitions): a single partition.
+
+ return cfg;
+ }
+
+ /**
+ * The test enforces concurrent processing of the same prepared transaction
+ * both in the tx recovery procedure started because the near and primary node left and in the
+ * tx recovery request handler invoked by a message from another backup node.
+ * <p>
+ * The idea is to have a 3-node cluster and a cache with 2 backups, so that there
+ * are 2 backup nodes to execute the tx recovery in parallel if the primary one
+ * fails. These backup nodes send tx recovery requests to each
+ * other, so the tx recovery request handler is invoked as well.
+ * <p>
+ * Blocking is used to force concurrent processing on one of the backup nodes. The other
+ * backup works unconstrained to provide the right environment for the blocked one.
+ * In particular, it should send a tx recovery request to the blocked backup.
+ * <p>
+ * Several attempts are made to reproduce the race condition in the blocked backup node.
+ * <p>
+ * Expected result: the transaction is committed on both backup nodes and the partition
+ * map exchange completes as well.
+ */
+ @Test
+ public void testRecoveryNotDeadLockOnNearAndPrimaryFail() throws Exception {
+ startGrids(2);
+
+ final int key = 0;
+
+ for (int iter = 0; iter < 100; iter++) {
+ startGrid(iter + 2); // The primary is closed on every iteration, so a fresh node restores the 3-node topology.
+
+ awaitPartitionMapExchange();
+
+ final Ignite primary = primaryNode(key, DEFAULT_CACHE_NAME);
+ final List<Ignite> backups = backupNodes(key, DEFAULT_CACHE_NAME);
+
+ final IgniteCache<Object, Object> cache = primary.cache(DEFAULT_CACHE_NAME);
+
+ final TransactionProxyImpl<?, ?> tx = (TransactionProxyImpl<?, ?>)primary.transactions().txStart();
+
+ cache.put(key, key + iter);
+
+ tx.tx().prepare(true); // The test only prepares the transaction; it never commits it explicitly.
+
+ for (Ignite grid : G.allGrids())
+ assertTrue(((IgniteEx)grid).context().cache().context().tm().activeTransactions().size() == 1);
+
+ final Collection<IgniteInternalTx> backupTransactions = new LinkedList<>();
+
+ for (Ignite backup : backups)
+ backupTransactions.addAll(((IgniteEx)backup).context().cache().context().tm().activeTransactions());
+
+ assertTrue(backupTransactions.size() == 2);
+
+ final CountDownLatch ensureBothPoolsAreBlockedLatch = new CountDownLatch(2);
+ final CountDownLatch unblockBothPoolsLatch = new CountDownLatch(1);
+
+ final Runnable poolBlockerTask = () -> {
+ try {
+ ensureBothPoolsAreBlockedLatch.countDown();
+ unblockBothPoolsLatch.await();
+ }
+ catch (InterruptedException ignored) {
+ // No-op.
+ }
+ };
+
+ final IgniteEx blockedBackup = (IgniteEx)backups.get(0);
+
+ blockedBackup.context().pools().getSystemExecutorService().execute(poolBlockerTask);
+ blockedBackup.context().pools().getStripedExecutorService().execute(0, poolBlockerTask);
+
+ ensureBothPoolsAreBlockedLatch.await();
+
+ runAsync(primary::close); // Closes the node that is both the primary and the one that started the transaction.
+
+ waitForTxRecoveryRequestEnqueuedOn(blockedBackup);
+ waitForTxRecoveryTaskEnqueuedOn(blockedBackup);
+
+ // Unblock processing on the blocked backup node simultaneously in the striped and system pools, so that
+ // the recovery procedure and the tx recovery request processing start at the "same" moment (for the same
+ // transaction). This should increase the chances of a race condition occurring in IgniteTxAdapter#markFinalizing.
+ unblockBothPoolsLatch.countDown();
+
+ waitForTopology(2);
+
+ awaitPartitionMapExchange();
+
+ for (IgniteInternalTx transaction : backupTransactions) {
+ assertTrue(transaction.finishFuture().isDone());
+ assertTrue(transaction.state() == TransactionState.COMMITTED);
+ }
+
+ for (Ignite backup : backups)
+ assertEquals(key + iter, backup.cache(DEFAULT_CACHE_NAME).get(key));
+ }
+ }
+
+ /** Asserts that stripe 0 of the node's striped pool gets a queued task (expected: the tx recovery request) before the wait times out. */
+ private void waitForTxRecoveryRequestEnqueuedOn(IgniteEx grid) throws IgniteInterruptedCheckedException {
+ assertTrue(GridTestUtils.waitForCondition(() ->
+ grid.context().pools().getStripedExecutorService().queueStripeSize(0) > 0, 5_000, 10));
+ }
+
+ /** Asserts that the node's system pool queue becomes non-empty (expected: the tx recovery task) before the wait times out. */
+ private void waitForTxRecoveryTaskEnqueuedOn(IgniteEx grid) throws IgniteInterruptedCheckedException {
+ assertTrue(GridTestUtils.waitForCondition(() ->
+ !((ThreadPoolExecutor)grid.context().pools().getSystemExecutorService()).getQueue().isEmpty(), 5_000, 10));
+ }
+}
diff --git a/modules/core/src/test/java/org/apache/ignite/testsuites/IgniteCacheTestSuite12.java b/modules/core/src/test/java/org/apache/ignite/testsuites/IgniteCacheTestSuite12.java
index 8b0fe38f269..222161fac11 100755
--- a/modules/core/src/test/java/org/apache/ignite/testsuites/IgniteCacheTestSuite12.java
+++ b/modules/core/src/test/java/org/apache/ignite/testsuites/IgniteCacheTestSuite12.java
@@ -44,6 +44,7 @@ import org.apache.ignite.internal.processors.cache.transactions.AtomicVolatilePa
import org.apache.ignite.internal.processors.cache.transactions.TransactionIntegrityWithPrimaryIndexCorruptionTest;
import org.apache.ignite.internal.processors.cache.transactions.TxCrossCacheMapOnInvalidTopologyTest;
import org.apache.ignite.internal.processors.cache.transactions.TxCrossCacheRemoteMultiplePartitionReservationTest;
+import org.apache.ignite.internal.processors.cache.transactions.TxRecoveryConcurrentTest;
import org.apache.ignite.internal.processors.cache.transactions.TxRecoveryWithConcurrentRollbackTest;
import org.apache.ignite.internal.processors.cache.transactions.TxWithKeyContentionSelfTest;
import org.apache.ignite.testframework.GridTestUtils;
@@ -86,6 +87,7 @@ public class IgniteCacheTestSuite12 {
GridTestUtils.addTestIfNeeded(suite, SafeLogTxFinishErrorTest.class, ignoredTests);
+ GridTestUtils.addTestIfNeeded(suite, TxRecoveryConcurrentTest.class, ignoredTests);
GridTestUtils.addTestIfNeeded(suite, TxRecoveryWithConcurrentRollbackTest.class, ignoredTests);
GridTestUtils.addTestIfNeeded(suite, AtomicPartitionCounterStateConsistencyTest.class, ignoredTests);