You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2019/04/17 17:30:54 UTC

[geode] branch develop updated: GEODE-6646 - CI failure in serverRestarsAfterLocatorReconnects

This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/geode.git


The following commit(s) were added to refs/heads/develop by this push:
     new 0e90b5a  GEODE-6646 - CI failure in serverRestarsAfterLocatorReconnects
0e90b5a is described below

commit 0e90b5ac2f3ada0e2813d6b518c7c70cf1986b45
Author: Bruce Schuchardt <bs...@pivotal.io>
AuthorDate: Wed Apr 17 10:29:44 2019 -0700

    GEODE-6646 - CI failure in serverRestarsAfterLocatorReconnects
    
    In this test there is a locator and there are two servers.  The
    locator and second servers are forced out of the cluster and
    auto-reconnect.  In the failure the first server becomes membership
    coordinator but the other processes fail to join its cluster when
    they restart.  They are misconfigured to have a max-wait-time-reconnect
    that is too short (5 seconds instead of 60) to allow the first server to detect failures
    before the failed server and locator start trying to reconnect.
    
    The fix is in InternalDistributedSystem and ensures that the
    waiting period before attempting to reconnect is a sufficient
    multiple of the member-timeout setting.
---
 .../management/JMXMBeanReconnectDUnitTest.java     | 11 +++++++---
 .../ClusterConfigLocatorRestartDUnitTest.java      | 25 +++++++++++++++-------
 .../internal/InternalDistributedSystem.java        |  8 +++++++
 .../geode/test/junit/rules/MemberStarterRule.java  |  4 ----
 4 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/geode-core/src/distributedTest/java/org/apache/geode/management/JMXMBeanReconnectDUnitTest.java b/geode-core/src/distributedTest/java/org/apache/geode/management/JMXMBeanReconnectDUnitTest.java
index b7344ae..f83797f 100644
--- a/geode-core/src/distributedTest/java/org/apache/geode/management/JMXMBeanReconnectDUnitTest.java
+++ b/geode-core/src/distributedTest/java/org/apache/geode/management/JMXMBeanReconnectDUnitTest.java
@@ -16,6 +16,7 @@
 package org.apache.geode.management;
 
 import static java.util.stream.Collectors.toList;
+import static org.apache.geode.distributed.ConfigurationProperties.MAX_WAIT_TIME_RECONNECT;
 import static org.apache.geode.management.ManagementService.getExistingManagementService;
 import static org.apache.geode.test.awaitility.GeodeAwaitility.await;
 import static org.apache.geode.test.dunit.internal.JUnit4DistributedTestCase.getBlackboard;
@@ -85,15 +86,18 @@ public class JMXMBeanReconnectDUnitTest {
 
   @Before
   public void before() throws Exception {
+    Properties properties = new Properties();
+    properties.setProperty(MAX_WAIT_TIME_RECONNECT, "5000");
+
     locator1 = lsRule.startLocatorVM(LOCATOR_1_VM_INDEX, locator1Properties());
     locator1.waitTilLocatorFullyStarted();
 
     locator2 = lsRule.startLocatorVM(LOCATOR_2_VM_INDEX, locator2Properties(), locator1.getPort());
     locator2.waitTilLocatorFullyStarted();
 
-    server1 = lsRule.startServerVM(SERVER_1_VM_INDEX, locator1.getPort());
+    server1 = lsRule.startServerVM(SERVER_1_VM_INDEX, properties, locator1.getPort());
     // start an extra server to have more MBeans, but we don't need to use it in these tests
-    lsRule.startServerVM(SERVER_2_VM_INDEX, locator1.getPort());
+    lsRule.startServerVM(SERVER_2_VM_INDEX, properties, locator1.getPort());
 
     gfsh.connectAndVerify(locator1);
     gfsh.executeAndAssertThat("create region --type=REPLICATE --name=" + REGION_PATH
@@ -317,7 +321,7 @@ public class JMXMBeanReconnectDUnitTest {
     Properties props = new Properties();
     props.setProperty(ConfigurationProperties.JMX_MANAGER_HOSTNAME_FOR_CLIENTS, "localhost");
     props.setProperty(ConfigurationProperties.NAME, LOCATOR_1_NAME);
-    props.setProperty(ConfigurationProperties.MAX_WAIT_TIME_RECONNECT, "5000");
+    props.setProperty(MAX_WAIT_TIME_RECONNECT, "5000");
     return props;
   }
 
@@ -326,6 +330,7 @@ public class JMXMBeanReconnectDUnitTest {
     props.setProperty(ConfigurationProperties.JMX_MANAGER_HOSTNAME_FOR_CLIENTS, "localhost");
     props.setProperty(ConfigurationProperties.NAME, LOCATOR_2_NAME);
     props.setProperty(ConfigurationProperties.LOCATORS, "localhost[" + locator1.getPort() + "]");
+    props.setProperty(MAX_WAIT_TIME_RECONNECT, "5000");
     return props;
   }
 }
diff --git a/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java b/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
index 0ddb364..81c3d0c 100644
--- a/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
+++ b/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
@@ -17,8 +17,11 @@ package org.apache.geode.management.internal.configuration;
 
 
 
+import static org.apache.geode.distributed.ConfigurationProperties.MAX_WAIT_TIME_RECONNECT;
 import static org.apache.geode.test.awaitility.GeodeAwaitility.await;
 
+import java.util.Properties;
+
 import org.junit.Rule;
 import org.junit.Test;
 
@@ -61,10 +64,13 @@ public class ClusterConfigLocatorRestartDUnitTest {
         .addIgnoredException("member unexpectedly shut down shared, unordered connection");
     IgnoredException.addIgnoredException("Connection refused");
 
-    MemberVM locator0 = rule.startLocatorVM(0);
+    Properties properties = new Properties();
+    properties.setProperty(MAX_WAIT_TIME_RECONNECT, "15000");
+
+    MemberVM locator0 = rule.startLocatorVM(0, properties);
 
-    rule.startServerVM(1, locator0.getPort());
-    MemberVM server2 = rule.startServerVM(2, locator0.getPort());
+    rule.startServerVM(1, properties, locator0.getPort());
+    MemberVM server2 = rule.startServerVM(2, properties, locator0.getPort());
 
     addDisconnectListener(locator0);
 
@@ -90,18 +96,21 @@ public class ClusterConfigLocatorRestartDUnitTest {
     IgnoredException.addIgnoredException("Connection refused");
 
 
-    MemberVM locator0 = rule.startLocatorVM(0);
-    MemberVM locator1 = rule.startLocatorVM(1, locator0.getPort());
+    Properties properties = new Properties();
+    properties.setProperty(MAX_WAIT_TIME_RECONNECT, "30000");
+
+    MemberVM locator0 = rule.startLocatorVM(0, properties);
+    MemberVM locator1 = rule.startLocatorVM(1, properties, locator0.getPort());
 
-    MemberVM server2 = rule.startServerVM(2, locator0.getPort(), locator1.getPort());
-    MemberVM server3 = rule.startServerVM(3, locator0.getPort(), locator1.getPort());
+    rule.startServerVM(2, properties, locator0.getPort(), locator1.getPort());
+    MemberVM server3 = rule.startServerVM(3, properties, locator0.getPort(), locator1.getPort());
 
     // Shut down hard
     rule.crashVM(0);
 
     server3.forceDisconnect();
 
-    rule.startServerVM(4, locator1.getPort(), locator0.getPort());
+    rule.startServerVM(4, properties, locator1.getPort(), locator0.getPort());
 
     gfsh.connectAndVerify(locator1);
 
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
index 1b539e9..39e7101 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
@@ -2464,6 +2464,14 @@ public class InternalDistributedSystem extends DistributedSystem
     configProps.putAll(this.config.toSecurityProperties());
 
     int timeOut = oldConfig.getMaxWaitTimeForReconnect();
+    int memberTimeout = oldConfig.getMemberTimeout();
+    // we need to make sure that a surviving member is able
+    // to take over coordination before trying to auto-reconnect.
+    // failure detection can take 4 member-timeout intervals
+    // so we set that as a minimum. (suspect, check suspect, final check, send new view)
+    final int intervalsAllowedForFailureDetection = 4;
+    timeOut = Math.max(timeOut, memberTimeout * intervalsAllowedForFailureDetection);
+
     int maxTries = oldConfig.getMaxNumReconnectTries();
 
     final boolean isDebugEnabled = logger.isDebugEnabled();
diff --git a/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/MemberStarterRule.java b/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/MemberStarterRule.java
index 4ca92b6..1448392 100644
--- a/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/MemberStarterRule.java
+++ b/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/MemberStarterRule.java
@@ -21,7 +21,6 @@ import static org.apache.geode.distributed.ConfigurationProperties.JMX_MANAGER_P
 import static org.apache.geode.distributed.ConfigurationProperties.JMX_MANAGER_START;
 import static org.apache.geode.distributed.ConfigurationProperties.LOCATORS;
 import static org.apache.geode.distributed.ConfigurationProperties.LOG_FILE;
-import static org.apache.geode.distributed.ConfigurationProperties.MAX_WAIT_TIME_RECONNECT;
 import static org.apache.geode.distributed.ConfigurationProperties.MCAST_PORT;
 import static org.apache.geode.distributed.ConfigurationProperties.NAME;
 import static org.apache.geode.distributed.ConfigurationProperties.SECURITY_MANAGER;
@@ -120,9 +119,6 @@ public abstract class MemberStarterRule<T> extends SerializableExternalResource
     // initial values
     properties.setProperty(MCAST_PORT, "0");
     properties.setProperty(LOCATORS, "");
-    // set the reconnect wait time to 5 seconds in case some tests needs to reconnect in a timely
-    // manner.
-    properties.setProperty(MAX_WAIT_TIME_RECONNECT, "5000");
     systemProperties.setProperty(ClusterManagementService.FEATURE_FLAG, "true");
   }