You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@activemq.apache.org by ni...@apache.org on 2019/06/10 11:37:15 UTC
[activemq-artemis] branch master updated: ARTEMIS-2069 Backup
doesn't activate after shared store is reconnected
This is an automated email from the ASF dual-hosted git repository.
nigrofranz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/activemq-artemis.git
The following commit(s) were added to refs/heads/master by this push:
new 097ef28 ARTEMIS-2069 Backup doesn't activate after shared store is reconnected
new 42f0157 This closes #2287
097ef28 is described below
commit 097ef281fd987579c9a92a50fb1906729b07d5f5
Author: Tomas Hofman <th...@redhat.com>
AuthorDate: Mon Sep 3 15:47:03 2018 +0200
ARTEMIS-2069 Backup doesn't activate after shared store is reconnected
---
.../ActiveMQLockAcquisitionTimeoutException.java | 29 ++++
.../core/server/impl/FileLockNodeManager.java | 69 ++++++----
.../extras/byteman/FileLockNodeManagerTest.java | 86 ++++++++++++
.../byteman/SharedStoreBackupActivationTest.java | 150 +++++++++++++++++++++
.../cluster/failover/FailoverTestBase.java | 4 +
5 files changed, 309 insertions(+), 29 deletions(-)
diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQLockAcquisitionTimeoutException.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQLockAcquisitionTimeoutException.java
new file mode 100644
index 0000000..af6d5c2
--- /dev/null
+++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQLockAcquisitionTimeoutException.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.activemq.artemis.core.server;
+
+import org.apache.activemq.artemis.api.core.ActiveMQException;
+
+public class ActiveMQLockAcquisitionTimeoutException extends ActiveMQException {
+
+ public ActiveMQLockAcquisitionTimeoutException() {
+ }
+
+ public ActiveMQLockAcquisitionTimeoutException(String msg) {
+ super(msg);
+ }
+}
diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileLockNodeManager.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileLockNodeManager.java
index 73156ee..2044189 100644
--- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileLockNodeManager.java
+++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileLockNodeManager.java
@@ -24,6 +24,7 @@ import java.nio.channels.FileLock;
import org.apache.activemq.artemis.api.core.ActiveMQIllegalStateException;
import org.apache.activemq.artemis.api.core.SimpleString;
import org.apache.activemq.artemis.core.server.ActivateCallback;
+import org.apache.activemq.artemis.core.server.ActiveMQLockAcquisitionTimeoutException;
import org.apache.activemq.artemis.core.server.ActiveMQServerLogger;
import org.apache.activemq.artemis.core.server.NodeManager;
import org.apache.activemq.artemis.utils.UUID;
@@ -49,6 +50,8 @@ public class FileLockNodeManager extends NodeManager {
private static final byte NOT_STARTED = 'N';
+ private static final long LOCK_ACCESS_FAILURE_WAIT_TIME = 2000;
+
private FileLock liveLock;
private FileLock backupLock;
@@ -299,44 +302,52 @@ public class FileLockNodeManager extends NodeManager {
protected FileLock lock(final long lockPosition) throws Exception {
long start = System.currentTimeMillis();
+ boolean isRecurringFailure = false;
while (!interrupted) {
- FileLock lock = tryLock(lockPosition);
-
- if (lock == null) {
- try {
- Thread.sleep(500);
- } catch (InterruptedException e) {
- return null;
+ try {
+ FileLock lock = tryLock(lockPosition);
+ isRecurringFailure = false;
+
+ if (lock == null) {
+ try {
+ Thread.sleep(500);
+ } catch (InterruptedException e) {
+ return null;
+ }
+
+ if (lockAcquisitionTimeout != -1 && (System.currentTimeMillis() - start) > lockAcquisitionTimeout) {
+ throw new ActiveMQLockAcquisitionTimeoutException("timed out waiting for lock");
+ }
+ } else {
+ return lock;
}
-
- if (lockAcquisitionTimeout != -1 && (System.currentTimeMillis() - start) > lockAcquisitionTimeout) {
- throw new Exception("timed out waiting for lock");
+ } catch (IOException e) {
+ // IOException during trylock() may be a temporary issue, e.g. NFS volume not being accessible
+
+ logger.log(isRecurringFailure ? Logger.Level.DEBUG : Logger.Level.WARN,
+ "Failure when accessing a lock file", e);
+ isRecurringFailure = true;
+
+ long waitTime = LOCK_ACCESS_FAILURE_WAIT_TIME;
+ if (lockAcquisitionTimeout != -1) {
+ final long remainingTime = lockAcquisitionTimeout - (System.currentTimeMillis() - start);
+ if (remainingTime <= 0) {
+ throw new ActiveMQLockAcquisitionTimeoutException("timed out waiting for lock");
+ }
+ waitTime = Math.min(waitTime, remainingTime);
}
- } else {
- return lock;
- }
- }
- // todo this is here because sometimes channel.lock throws a resource deadlock exception but trylock works,
- // need to investigate further and review
- FileLock lock;
- do {
- lock = tryLock(lockPosition);
- if (lock == null) {
try {
- Thread.sleep(500);
- } catch (InterruptedException e1) {
- //
+ Thread.sleep(waitTime);
+ } catch (InterruptedException interrupt) {
+ return null;
}
}
- if (interrupted) {
- interrupted = false;
- throw new IOException("Lock was interrupted");
- }
}
- while (lock == null);
- return lock;
+
+ // presumed interrupted
+ return null;
}
}
diff --git a/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/FileLockNodeManagerTest.java b/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/FileLockNodeManagerTest.java
new file mode 100644
index 0000000..4a74018
--- /dev/null
+++ b/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/FileLockNodeManagerTest.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.activemq.artemis.tests.extras.byteman;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.activemq.artemis.core.server.impl.FileLockNodeManager;
+import org.jboss.byteman.contrib.bmunit.BMRule;
+import org.jboss.byteman.contrib.bmunit.BMRules;
+import org.jboss.byteman.contrib.bmunit.BMUnitRunner;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+@RunWith(BMUnitRunner.class)
+public class FileLockNodeManagerTest {
+
+ private static final int TIMEOUT_TOLERANCE = 50;
+
+ private File sharedDir;
+
+ public FileLockNodeManagerTest() throws IOException {
+ sharedDir = File.createTempFile("shared-dir", "");
+ sharedDir.delete();
+ Assert.assertTrue(sharedDir.mkdir());
+ }
+
+ @Test
+ @BMRules(
+ rules = {@BMRule(
+ name = "throw IOException during activation",
+ targetClass = "org.apache.activemq.artemis.core.server.impl.FileLockNodeManager",
+ targetMethod = "tryLock",
+ targetLocation = "AT ENTRY",
+ action = "THROW new IOException(\"IO Error\");")
+ })
+ public void test() throws Exception {
+ measureLockAcquisisionTimeout(100); // warm-up
+
+ assertMeasuredTimeoutFor(100);
+ assertMeasuredTimeoutFor(200);
+ }
+
+ protected void assertMeasuredTimeoutFor(long lockAcquisitionTimeout) throws Exception {
+ long realTimeout = measureLockAcquisisionTimeout(lockAcquisitionTimeout);
+ System.out.println(String.format("lockAcquisisionTimeout: %d ms, measured timeout: %d ms", lockAcquisitionTimeout, realTimeout));
+ Assert.assertTrue(String.format("Timeout %d ms was larger than expected %d ms", realTimeout, lockAcquisitionTimeout + TIMEOUT_TOLERANCE),
+ lockAcquisitionTimeout + TIMEOUT_TOLERANCE >= realTimeout);
+ Assert.assertTrue(String.format("Timeout %d ms was lower than expected %d ms", realTimeout, lockAcquisitionTimeout),
+ lockAcquisitionTimeout + TIMEOUT_TOLERANCE >= realTimeout);
+ }
+
+ private long measureLockAcquisisionTimeout(long lockAcquisitionTimeout) throws Exception {
+ FileLockNodeManager manager = new FileLockNodeManager(sharedDir, false, lockAcquisitionTimeout);
+ manager.start();
+
+ // try to lock and measure real timeout
+ long start = System.currentTimeMillis();
+ try {
+ manager.awaitLiveNode();
+ } catch (Exception e) {
+ long stop = System.currentTimeMillis();
+ if (!"timed out waiting for lock".equals(e.getMessage())) {
+ throw e;
+ }
+ return stop - start;
+ }
+ Assert.fail("Exception expected");
+ return 0;
+ }
+}
diff --git a/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/SharedStoreBackupActivationTest.java b/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/SharedStoreBackupActivationTest.java
new file mode 100644
index 0000000..2df895d
--- /dev/null
+++ b/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/SharedStoreBackupActivationTest.java
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.activemq.artemis.tests.extras.byteman;
+
+import java.io.File;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.activemq.artemis.api.core.TransportConfiguration;
+import org.apache.activemq.artemis.core.config.ScaleDownConfiguration;
+import org.apache.activemq.artemis.core.config.ha.SharedStoreMasterPolicyConfiguration;
+import org.apache.activemq.artemis.core.config.ha.SharedStoreSlavePolicyConfiguration;
+import org.apache.activemq.artemis.core.server.NodeManager;
+import org.apache.activemq.artemis.core.server.impl.FileLockNodeManager;
+import org.apache.activemq.artemis.tests.integration.cluster.failover.FailoverTestBase;
+import org.apache.activemq.artemis.tests.util.TransportConfigurationUtils;
+import org.jboss.byteman.contrib.bmunit.BMRule;
+import org.jboss.byteman.contrib.bmunit.BMRules;
+import org.jboss.byteman.contrib.bmunit.BMUnitRunner;
+import org.jboss.logging.Logger;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+@RunWith(BMUnitRunner.class)
+public class SharedStoreBackupActivationTest extends FailoverTestBase {
+
+ private static Logger logger = Logger.getLogger(SharedStoreBackupActivationTest.class);
+
+ private static volatile boolean throwException = false;
+ private static CountDownLatch exceptionThrownLatch;
+
+ public static synchronized boolean isThrowException() {
+ logger.debugf("Throwing IOException during FileLockNodeManager.tryLock(): %s", throwException);
+ if (exceptionThrownLatch != null) {
+ exceptionThrownLatch.countDown();
+ }
+ return throwException;
+ }
+
+ /**
+ * Waits for the backup server to call FileLockNodeManager.tryLock().
+ */
+ public static void awaitTryLock(boolean throwException) throws InterruptedException {
+ synchronized (SharedStoreBackupActivationTest.class) {
+ SharedStoreBackupActivationTest.throwException = throwException;
+ exceptionThrownLatch = new CountDownLatch(1);
+ }
+ logger.debugf("Awaiting backup to perform FileLockNodeManager.tryLock()");
+ boolean ret = exceptionThrownLatch.await(10, TimeUnit.SECONDS);
+ SharedStoreBackupActivationTest.throwException = false;
+
+ Assert.assertTrue("FileLockNodeManager.tryLock() was not called during specified timeout", ret);
+ logger.debugf("Awaiting FileLockNodeManager.tryLock() done");
+ }
+
+ @Test
+ @BMRules(
+ rules = {@BMRule(
+ name = "throw IOException during activation",
+ targetClass = "org.apache.activemq.artemis.core.server.impl.FileLockNodeManager",
+ targetMethod = "tryLock",
+ targetLocation = "AT ENTRY",
+ condition = "org.apache.activemq.artemis.tests.extras.byteman.SharedStoreBackupActivationTest.isThrowException()",
+ action = "THROW new IOException(\"IO Error\");")
+ })
+ public void testFailOverAfterTryLockException() throws Exception {
+ Assert.assertTrue(liveServer.isActive());
+ Assert.assertFalse(backupServer.isActive());
+
+ // wait for backup to try to acquire lock, once without exception (acquiring will not succeed because live is
+ // still active)
+ awaitTryLock(false);
+
+ // wait for backup to try to acquire lock, this time throw an IOException
+ logger.debug("Causing exception");
+ awaitTryLock(true);
+
+ // stop live server
+ logger.debugf("Stopping live server");
+ liveServer.stop();
+ waitForServerToStop(liveServer.getServer());
+ logger.debugf("Live server stopped, waiting for backup activation");
+ backupServer.getServer().waitForActivation(10, TimeUnit.SECONDS);
+
+ // backup should be activated by now
+ Assert.assertFalse(liveServer.isActive());
+ Assert.assertTrue("Backup server didn't activate", backupServer.isActive());
+ }
+
+ @Override
+ protected TransportConfiguration getAcceptorTransportConfiguration(final boolean live) {
+ return TransportConfigurationUtils.getInVMAcceptor(live);
+ }
+
+ @Override
+ protected TransportConfiguration getConnectorTransportConfiguration(final boolean live) {
+ return TransportConfigurationUtils.getInVMConnector(live);
+ }
+
+ @Override
+ protected void createConfigs() throws Exception {
+ File sharedDir = File.createTempFile("shared-dir", "");
+ sharedDir.delete();
+ Assert.assertTrue(sharedDir.mkdir());
+ logger.debugf("Created shared store directory %s", sharedDir.getCanonicalPath());
+
+ TransportConfiguration liveConnector = getConnectorTransportConfiguration(true);
+ TransportConfiguration backupConnector = getConnectorTransportConfiguration(false);
+
+ // nodes must use separate FileLockNodeManager instances!
+ NodeManager liveNodeManager = new FileLockNodeManager(sharedDir, false);
+ NodeManager backupNodeManager = new FileLockNodeManager(sharedDir, false);
+
+ backupConfig = super.createDefaultConfig(false)
+ .clearAcceptorConfigurations()
+ .addAcceptorConfiguration(getAcceptorTransportConfiguration(false))
+ .setHAPolicyConfiguration(
+ new SharedStoreSlavePolicyConfiguration()
+ .setScaleDownConfiguration(new ScaleDownConfiguration().setEnabled(false))
+ .setRestartBackup(false))
+ .addConnectorConfiguration(liveConnector.getName(), liveConnector)
+ .addConnectorConfiguration(backupConnector.getName(), backupConnector)
+ .addClusterConfiguration(basicClusterConnectionConfig(backupConnector.getName(), liveConnector.getName()));
+ backupServer = createTestableServer(backupConfig, backupNodeManager);
+
+ liveConfig = super.createDefaultConfig(false)
+ .clearAcceptorConfigurations()
+ .addAcceptorConfiguration(getAcceptorTransportConfiguration(true))
+ .setHAPolicyConfiguration(new SharedStoreMasterPolicyConfiguration().setFailoverOnServerShutdown(true))
+ .addClusterConfiguration(basicClusterConnectionConfig(liveConnector.getName()))
+ .addConnectorConfiguration(liveConnector.getName(), liveConnector);
+ liveServer = createTestableServer(liveConfig, liveNodeManager);
+ }
+
+}
diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java
index 2a75f94..e4d4711 100644
--- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java
+++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java
@@ -113,6 +113,10 @@ public abstract class FailoverTestBase extends ActiveMQTestBase {
}
protected TestableServer createTestableServer(Configuration config) throws Exception {
+ return createTestableServer(config, nodeManager);
+ }
+
+ protected TestableServer createTestableServer(Configuration config, NodeManager nodeManager) throws Exception {
boolean isBackup = config.getHAPolicyConfiguration() instanceof ReplicaPolicyConfiguration || config.getHAPolicyConfiguration() instanceof SharedStoreSlavePolicyConfiguration;
return new SameProcessActiveMQServer(createInVMFailoverServer(true, config, nodeManager, isBackup ? 2 : 1));
}