You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@activemq.apache.org by ni...@apache.org on 2019/06/10 11:37:15 UTC

[activemq-artemis] branch master updated: ARTEMIS-2069 Backup doesn't activate after shared store is reconnected

This is an automated email from the ASF dual-hosted git repository.

nigrofranz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/activemq-artemis.git


The following commit(s) were added to refs/heads/master by this push:
     new 097ef28  ARTEMIS-2069 Backup doesn't activate after shared store is reconnected
     new 42f0157  This closes #2287
097ef28 is described below

commit 097ef281fd987579c9a92a50fb1906729b07d5f5
Author: Tomas Hofman <th...@redhat.com>
AuthorDate: Mon Sep 3 15:47:03 2018 +0200

    ARTEMIS-2069 Backup doesn't activate after shared store is reconnected
---
 .../ActiveMQLockAcquisitionTimeoutException.java   |  29 ++++
 .../core/server/impl/FileLockNodeManager.java      |  69 ++++++----
 .../extras/byteman/FileLockNodeManagerTest.java    |  86 ++++++++++++
 .../byteman/SharedStoreBackupActivationTest.java   | 150 +++++++++++++++++++++
 .../cluster/failover/FailoverTestBase.java         |   4 +
 5 files changed, 309 insertions(+), 29 deletions(-)

diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQLockAcquisitionTimeoutException.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQLockAcquisitionTimeoutException.java
new file mode 100644
index 0000000..af6d5c2
--- /dev/null
+++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQLockAcquisitionTimeoutException.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.activemq.artemis.core.server;
+
+import org.apache.activemq.artemis.api.core.ActiveMQException;
+
+public class ActiveMQLockAcquisitionTimeoutException extends ActiveMQException {
+
+   public ActiveMQLockAcquisitionTimeoutException() {
+   }
+
+   public ActiveMQLockAcquisitionTimeoutException(String msg) {
+      super(msg);
+   }
+}
diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileLockNodeManager.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileLockNodeManager.java
index 73156ee..2044189 100644
--- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileLockNodeManager.java
+++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileLockNodeManager.java
@@ -24,6 +24,7 @@ import java.nio.channels.FileLock;
 import org.apache.activemq.artemis.api.core.ActiveMQIllegalStateException;
 import org.apache.activemq.artemis.api.core.SimpleString;
 import org.apache.activemq.artemis.core.server.ActivateCallback;
+import org.apache.activemq.artemis.core.server.ActiveMQLockAcquisitionTimeoutException;
 import org.apache.activemq.artemis.core.server.ActiveMQServerLogger;
 import org.apache.activemq.artemis.core.server.NodeManager;
 import org.apache.activemq.artemis.utils.UUID;
@@ -49,6 +50,8 @@ public class FileLockNodeManager extends NodeManager {
 
    private static final byte NOT_STARTED = 'N';
 
+   private static final long LOCK_ACCESS_FAILURE_WAIT_TIME = 2000;
+
    private FileLock liveLock;
 
    private FileLock backupLock;
@@ -299,44 +302,52 @@ public class FileLockNodeManager extends NodeManager {
 
    protected FileLock lock(final long lockPosition) throws Exception {
       long start = System.currentTimeMillis();
+      boolean isRecurringFailure = false;
 
       while (!interrupted) {
-         FileLock lock = tryLock(lockPosition);
-
-         if (lock == null) {
-            try {
-               Thread.sleep(500);
-            } catch (InterruptedException e) {
-               return null;
+         try {
+            FileLock lock = tryLock(lockPosition);
+            isRecurringFailure = false;
+
+            if (lock == null) {
+               try {
+                  Thread.sleep(500);
+               } catch (InterruptedException e) {
+                  return null;
+               }
+
+               if (lockAcquisitionTimeout != -1 && (System.currentTimeMillis() - start) > lockAcquisitionTimeout) {
+                  throw new ActiveMQLockAcquisitionTimeoutException("timed out waiting for lock");
+               }
+            } else {
+               return lock;
             }
-
-            if (lockAcquisitionTimeout != -1 && (System.currentTimeMillis() - start) > lockAcquisitionTimeout) {
-               throw new Exception("timed out waiting for lock");
+         } catch (IOException e) {
+            // IOException during trylock() may be a temporary issue, e.g. NFS volume not being accessible
+
+            logger.log(isRecurringFailure ? Logger.Level.DEBUG : Logger.Level.WARN,
+                    "Failure when accessing a lock file", e);
+            isRecurringFailure = true;
+
+            long waitTime = LOCK_ACCESS_FAILURE_WAIT_TIME;
+            if (lockAcquisitionTimeout != -1) {
+               final long remainingTime = lockAcquisitionTimeout - (System.currentTimeMillis() - start);
+               if (remainingTime <= 0) {
+                  throw new ActiveMQLockAcquisitionTimeoutException("timed out waiting for lock");
+               }
+               waitTime = Math.min(waitTime, remainingTime);
             }
-         } else {
-            return lock;
-         }
-      }
 
-      // todo this is here because sometimes channel.lock throws a resource deadlock exception but trylock works,
-      // need to investigate further and review
-      FileLock lock;
-      do {
-         lock = tryLock(lockPosition);
-         if (lock == null) {
             try {
-               Thread.sleep(500);
-            } catch (InterruptedException e1) {
-               //
+               Thread.sleep(waitTime);
+            } catch (InterruptedException interrupt) {
+               return null;
             }
          }
-         if (interrupted) {
-            interrupted = false;
-            throw new IOException("Lock was interrupted");
-         }
       }
-      while (lock == null);
-      return lock;
+
+      // presumed interrupted
+      return null;
    }
 
 }
diff --git a/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/FileLockNodeManagerTest.java b/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/FileLockNodeManagerTest.java
new file mode 100644
index 0000000..4a74018
--- /dev/null
+++ b/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/FileLockNodeManagerTest.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.activemq.artemis.tests.extras.byteman;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.activemq.artemis.core.server.impl.FileLockNodeManager;
+import org.jboss.byteman.contrib.bmunit.BMRule;
+import org.jboss.byteman.contrib.bmunit.BMRules;
+import org.jboss.byteman.contrib.bmunit.BMUnitRunner;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+@RunWith(BMUnitRunner.class)
+public class FileLockNodeManagerTest {
+
+   private static final int TIMEOUT_TOLERANCE = 50;
+
+   private File sharedDir;
+
+   public FileLockNodeManagerTest() throws IOException {
+      sharedDir = File.createTempFile("shared-dir", "");
+      sharedDir.delete();
+      Assert.assertTrue(sharedDir.mkdir());
+   }
+
+   @Test
+   @BMRules(
+         rules = {@BMRule(
+               name = "throw IOException during activation",
+               targetClass = "org.apache.activemq.artemis.core.server.impl.FileLockNodeManager",
+               targetMethod = "tryLock",
+               targetLocation = "AT ENTRY",
+               action = "THROW new IOException(\"IO Error\");")
+         })
+   public void test() throws Exception {
+      measureLockAcquisisionTimeout(100); // warm-up
+
+      assertMeasuredTimeoutFor(100);
+      assertMeasuredTimeoutFor(200);
+   }
+
+   protected void assertMeasuredTimeoutFor(long lockAcquisitionTimeout) throws Exception {
+      long realTimeout = measureLockAcquisisionTimeout(lockAcquisitionTimeout);
+      System.out.println(String.format("lockAcquisisionTimeout: %d ms, measured timeout: %d ms", lockAcquisitionTimeout, realTimeout));
+      Assert.assertTrue(String.format("Timeout %d ms was larger than expected %d ms", realTimeout, lockAcquisitionTimeout + TIMEOUT_TOLERANCE),
+            lockAcquisitionTimeout + TIMEOUT_TOLERANCE >= realTimeout);
+      Assert.assertTrue(String.format("Timeout %d ms was lower than expected %d ms", realTimeout, lockAcquisitionTimeout),
+            lockAcquisitionTimeout + TIMEOUT_TOLERANCE >= realTimeout);
+   }
+
+   private long measureLockAcquisisionTimeout(long lockAcquisitionTimeout) throws Exception {
+      FileLockNodeManager manager = new FileLockNodeManager(sharedDir, false, lockAcquisitionTimeout);
+      manager.start();
+
+      // try to lock and measure real timeout
+      long start = System.currentTimeMillis();
+      try {
+         manager.awaitLiveNode();
+      } catch (Exception e) {
+         long stop = System.currentTimeMillis();
+         if (!"timed out waiting for lock".equals(e.getMessage())) {
+            throw e;
+         }
+         return stop - start;
+      }
+      Assert.fail("Exception expected");
+      return 0;
+   }
+}
diff --git a/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/SharedStoreBackupActivationTest.java b/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/SharedStoreBackupActivationTest.java
new file mode 100644
index 0000000..2df895d
--- /dev/null
+++ b/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/SharedStoreBackupActivationTest.java
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.activemq.artemis.tests.extras.byteman;
+
+import java.io.File;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.activemq.artemis.api.core.TransportConfiguration;
+import org.apache.activemq.artemis.core.config.ScaleDownConfiguration;
+import org.apache.activemq.artemis.core.config.ha.SharedStoreMasterPolicyConfiguration;
+import org.apache.activemq.artemis.core.config.ha.SharedStoreSlavePolicyConfiguration;
+import org.apache.activemq.artemis.core.server.NodeManager;
+import org.apache.activemq.artemis.core.server.impl.FileLockNodeManager;
+import org.apache.activemq.artemis.tests.integration.cluster.failover.FailoverTestBase;
+import org.apache.activemq.artemis.tests.util.TransportConfigurationUtils;
+import org.jboss.byteman.contrib.bmunit.BMRule;
+import org.jboss.byteman.contrib.bmunit.BMRules;
+import org.jboss.byteman.contrib.bmunit.BMUnitRunner;
+import org.jboss.logging.Logger;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+@RunWith(BMUnitRunner.class)
+public class SharedStoreBackupActivationTest extends FailoverTestBase {
+
+   private static Logger logger = Logger.getLogger(SharedStoreBackupActivationTest.class);
+
+   private static volatile boolean throwException = false;
+   private static CountDownLatch exceptionThrownLatch;
+
+   public static synchronized boolean isThrowException() {
+      logger.debugf("Throwing IOException during FileLockNodeManager.tryLock(): %s", throwException);
+      if (exceptionThrownLatch != null) {
+         exceptionThrownLatch.countDown();
+      }
+      return throwException;
+   }
+
+   /**
+    * Waits for the backup server to call FileLockNodeManager.tryLock().
+    */
+   public static void awaitTryLock(boolean throwException) throws InterruptedException {
+      synchronized (SharedStoreBackupActivationTest.class) {
+         SharedStoreBackupActivationTest.throwException = throwException;
+         exceptionThrownLatch = new CountDownLatch(1);
+      }
+      logger.debugf("Awaiting backup to perform FileLockNodeManager.tryLock()");
+      boolean ret = exceptionThrownLatch.await(10, TimeUnit.SECONDS);
+      SharedStoreBackupActivationTest.throwException = false;
+
+      Assert.assertTrue("FileLockNodeManager.tryLock() was not called during specified timeout", ret);
+      logger.debugf("Awaiting FileLockNodeManager.tryLock() done");
+   }
+
+   @Test
+   @BMRules(
+         rules = {@BMRule(
+               name = "throw IOException during activation",
+               targetClass = "org.apache.activemq.artemis.core.server.impl.FileLockNodeManager",
+               targetMethod = "tryLock",
+               targetLocation = "AT ENTRY",
+               condition = "org.apache.activemq.artemis.tests.extras.byteman.SharedStoreBackupActivationTest.isThrowException()",
+               action = "THROW new IOException(\"IO Error\");")
+         })
+   public void testFailOverAfterTryLockException() throws Exception {
+      Assert.assertTrue(liveServer.isActive());
+      Assert.assertFalse(backupServer.isActive());
+
+      // wait for backup to try to acquire lock, once without exception (acquiring will not succeed because live is
+      // still active)
+      awaitTryLock(false);
+
+      // wait for backup to try to acquire lock, this time throw an IOException
+      logger.debug("Causing exception");
+      awaitTryLock(true);
+
+      // stop live server
+      logger.debugf("Stopping live server");
+      liveServer.stop();
+      waitForServerToStop(liveServer.getServer());
+      logger.debugf("Live server stopped, waiting for backup activation");
+      backupServer.getServer().waitForActivation(10, TimeUnit.SECONDS);
+
+      // backup should be activated by now
+      Assert.assertFalse(liveServer.isActive());
+      Assert.assertTrue("Backup server didn't activate", backupServer.isActive());
+   }
+
+   @Override
+   protected TransportConfiguration getAcceptorTransportConfiguration(final boolean live) {
+      return TransportConfigurationUtils.getInVMAcceptor(live);
+   }
+
+   @Override
+   protected TransportConfiguration getConnectorTransportConfiguration(final boolean live) {
+      return TransportConfigurationUtils.getInVMConnector(live);
+   }
+
+   @Override
+   protected void createConfigs() throws Exception {
+      File sharedDir = File.createTempFile("shared-dir", "");
+      sharedDir.delete();
+      Assert.assertTrue(sharedDir.mkdir());
+      logger.debugf("Created shared store directory %s", sharedDir.getCanonicalPath());
+
+      TransportConfiguration liveConnector = getConnectorTransportConfiguration(true);
+      TransportConfiguration backupConnector = getConnectorTransportConfiguration(false);
+
+      // nodes must use separate FileLockNodeManager instances!
+      NodeManager liveNodeManager = new FileLockNodeManager(sharedDir, false);
+      NodeManager backupNodeManager = new FileLockNodeManager(sharedDir, false);
+
+      backupConfig = super.createDefaultConfig(false)
+            .clearAcceptorConfigurations()
+            .addAcceptorConfiguration(getAcceptorTransportConfiguration(false))
+            .setHAPolicyConfiguration(
+                  new SharedStoreSlavePolicyConfiguration()
+                        .setScaleDownConfiguration(new ScaleDownConfiguration().setEnabled(false))
+                        .setRestartBackup(false))
+            .addConnectorConfiguration(liveConnector.getName(), liveConnector)
+            .addConnectorConfiguration(backupConnector.getName(), backupConnector)
+            .addClusterConfiguration(basicClusterConnectionConfig(backupConnector.getName(), liveConnector.getName()));
+      backupServer = createTestableServer(backupConfig, backupNodeManager);
+
+      liveConfig = super.createDefaultConfig(false)
+            .clearAcceptorConfigurations()
+            .addAcceptorConfiguration(getAcceptorTransportConfiguration(true))
+            .setHAPolicyConfiguration(new SharedStoreMasterPolicyConfiguration().setFailoverOnServerShutdown(true))
+            .addClusterConfiguration(basicClusterConnectionConfig(liveConnector.getName()))
+            .addConnectorConfiguration(liveConnector.getName(), liveConnector);
+      liveServer = createTestableServer(liveConfig, liveNodeManager);
+   }
+
+}
diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java
index 2a75f94..e4d4711 100644
--- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java
+++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java
@@ -113,6 +113,10 @@ public abstract class FailoverTestBase extends ActiveMQTestBase {
    }
 
    protected TestableServer createTestableServer(Configuration config) throws Exception {
+      return createTestableServer(config, nodeManager);
+   }
+
+   protected TestableServer createTestableServer(Configuration config, NodeManager nodeManager) throws Exception {
       boolean isBackup = config.getHAPolicyConfiguration() instanceof ReplicaPolicyConfiguration || config.getHAPolicyConfiguration() instanceof SharedStoreSlavePolicyConfiguration;
       return new SameProcessActiveMQServer(createInVMFailoverServer(true, config, nodeManager, isBackup ? 2 : 1));
    }