You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2019/04/15 19:07:08 UTC

[geode] branch feature/GEODE-6646 created (now 38d40fe)

This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a change to branch feature/GEODE-6646
in repository https://gitbox.apache.org/repos/asf/geode.git.


      at 38d40fe  GEODE-6646 - CI failure in serverRestarsAfterLocatorReconnects

This branch includes the following new commits:

     new 38d40fe  GEODE-6646 - CI failure in serverRestarsAfterLocatorReconnects

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[geode] 01/01: GEODE-6646 - CI failure in serverRestarsAfterLocatorReconnects

Posted by bs...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch feature/GEODE-6646
in repository https://gitbox.apache.org/repos/asf/geode.git

commit 38d40fe41ded5486ec019b2e7b5633fa8f829ee6
Author: Bruce Schuchardt <bs...@pivotal.io>
AuthorDate: Mon Apr 15 11:55:05 2019 -0700

    GEODE-6646 - CI failure in serverRestarsAfterLocatorReconnects
    
    In this test there is a locator and there are two servers.  The
    locator and second servers are forced out of the cluster and
    auto-reconnect.  In the failure the first server becomes membership
    coordinator but the other processes fail to join its cluster when
    they restart.  They are misconfigured to have a max-wait-time-reconnect
    that is too short (5 seconds instead of 60) to allow the first server to detect failures
    before the failed server and locator start trying to reconnect.
    
    The fix is in InternalDistributedSystem and ensures that the
    waiting period before attempting to reconnect is a sufficient
    multiple of the member-timeout setting.
---
 .../management/JMXMBeanReconnectDUnitTest.java     | 11 +++++++---
 .../ClusterConfigLocatorRestartDUnitTest.java      | 25 +++++++++++++++-------
 .../internal/InternalDistributedSystem.java        |  4 ++++
 .../geode/test/junit/rules/MemberStarterRule.java  |  4 ----
 4 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/geode-core/src/distributedTest/java/org/apache/geode/management/JMXMBeanReconnectDUnitTest.java b/geode-core/src/distributedTest/java/org/apache/geode/management/JMXMBeanReconnectDUnitTest.java
index b7344ae..f83797f 100644
--- a/geode-core/src/distributedTest/java/org/apache/geode/management/JMXMBeanReconnectDUnitTest.java
+++ b/geode-core/src/distributedTest/java/org/apache/geode/management/JMXMBeanReconnectDUnitTest.java
@@ -16,6 +16,7 @@
 package org.apache.geode.management;
 
 import static java.util.stream.Collectors.toList;
+import static org.apache.geode.distributed.ConfigurationProperties.MAX_WAIT_TIME_RECONNECT;
 import static org.apache.geode.management.ManagementService.getExistingManagementService;
 import static org.apache.geode.test.awaitility.GeodeAwaitility.await;
 import static org.apache.geode.test.dunit.internal.JUnit4DistributedTestCase.getBlackboard;
@@ -85,15 +86,18 @@ public class JMXMBeanReconnectDUnitTest {
 
   @Before
   public void before() throws Exception {
+    Properties properties = new Properties();
+    properties.setProperty(MAX_WAIT_TIME_RECONNECT, "5000");
+
     locator1 = lsRule.startLocatorVM(LOCATOR_1_VM_INDEX, locator1Properties());
     locator1.waitTilLocatorFullyStarted();
 
     locator2 = lsRule.startLocatorVM(LOCATOR_2_VM_INDEX, locator2Properties(), locator1.getPort());
     locator2.waitTilLocatorFullyStarted();
 
-    server1 = lsRule.startServerVM(SERVER_1_VM_INDEX, locator1.getPort());
+    server1 = lsRule.startServerVM(SERVER_1_VM_INDEX, properties, locator1.getPort());
     // start an extra server to have more MBeans, but we don't need to use it in these tests
-    lsRule.startServerVM(SERVER_2_VM_INDEX, locator1.getPort());
+    lsRule.startServerVM(SERVER_2_VM_INDEX, properties, locator1.getPort());
 
     gfsh.connectAndVerify(locator1);
     gfsh.executeAndAssertThat("create region --type=REPLICATE --name=" + REGION_PATH
@@ -317,7 +321,7 @@ public class JMXMBeanReconnectDUnitTest {
     Properties props = new Properties();
     props.setProperty(ConfigurationProperties.JMX_MANAGER_HOSTNAME_FOR_CLIENTS, "localhost");
     props.setProperty(ConfigurationProperties.NAME, LOCATOR_1_NAME);
-    props.setProperty(ConfigurationProperties.MAX_WAIT_TIME_RECONNECT, "5000");
+    props.setProperty(MAX_WAIT_TIME_RECONNECT, "5000");
     return props;
   }
 
@@ -326,6 +330,7 @@ public class JMXMBeanReconnectDUnitTest {
     props.setProperty(ConfigurationProperties.JMX_MANAGER_HOSTNAME_FOR_CLIENTS, "localhost");
     props.setProperty(ConfigurationProperties.NAME, LOCATOR_2_NAME);
     props.setProperty(ConfigurationProperties.LOCATORS, "localhost[" + locator1.getPort() + "]");
+    props.setProperty(MAX_WAIT_TIME_RECONNECT, "5000");
     return props;
   }
 }
diff --git a/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java b/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
index 0ddb364..419d6c0 100644
--- a/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
+++ b/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
@@ -17,8 +17,11 @@ package org.apache.geode.management.internal.configuration;
 
 
 
+import static org.apache.geode.distributed.ConfigurationProperties.MAX_WAIT_TIME_RECONNECT;
 import static org.apache.geode.test.awaitility.GeodeAwaitility.await;
 
+import java.util.Properties;
+
 import org.junit.Rule;
 import org.junit.Test;
 
@@ -61,10 +64,13 @@ public class ClusterConfigLocatorRestartDUnitTest {
         .addIgnoredException("member unexpectedly shut down shared, unordered connection");
     IgnoredException.addIgnoredException("Connection refused");
 
-    MemberVM locator0 = rule.startLocatorVM(0);
+    Properties properties = new Properties();
+    properties.setProperty(MAX_WAIT_TIME_RECONNECT, "15000");
+
+    MemberVM locator0 = rule.startLocatorVM(0, properties);
 
-    rule.startServerVM(1, locator0.getPort());
-    MemberVM server2 = rule.startServerVM(2, locator0.getPort());
+    rule.startServerVM(1, properties, locator0.getPort());
+    MemberVM server2 = rule.startServerVM(2, properties, locator0.getPort());
 
     addDisconnectListener(locator0);
 
@@ -90,18 +96,21 @@ public class ClusterConfigLocatorRestartDUnitTest {
     IgnoredException.addIgnoredException("Connection refused");
 
 
-    MemberVM locator0 = rule.startLocatorVM(0);
-    MemberVM locator1 = rule.startLocatorVM(1, locator0.getPort());
+    Properties properties = new Properties();
+    properties.setProperty(MAX_WAIT_TIME_RECONNECT, "5000");
+
+    MemberVM locator0 = rule.startLocatorVM(0, properties);
+    MemberVM locator1 = rule.startLocatorVM(1, properties, locator0.getPort());
 
-    MemberVM server2 = rule.startServerVM(2, locator0.getPort(), locator1.getPort());
-    MemberVM server3 = rule.startServerVM(3, locator0.getPort(), locator1.getPort());
+    MemberVM server2 = rule.startServerVM(2, properties, locator0.getPort(), locator1.getPort());
+    MemberVM server3 = rule.startServerVM(3, properties, locator0.getPort(), locator1.getPort());
 
     // Shut down hard
     rule.crashVM(0);
 
     server3.forceDisconnect();
 
-    rule.startServerVM(4, locator1.getPort(), locator0.getPort());
+    rule.startServerVM(4, properties, locator1.getPort(), locator0.getPort());
 
     gfsh.connectAndVerify(locator1);
 
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
index 1b539e9..3f724b7 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
@@ -2464,6 +2464,10 @@ public class InternalDistributedSystem extends DistributedSystem
     configProps.putAll(this.config.toSecurityProperties());
 
     int timeOut = oldConfig.getMaxWaitTimeForReconnect();
+    int memberTimeout = oldConfig.getMemberTimeout();
+    final int intervalsAllowedForFailureDetection = 4;
+    timeOut = Math.max(timeOut, memberTimeout * intervalsAllowedForFailureDetection);
+
     int maxTries = oldConfig.getMaxNumReconnectTries();
 
     final boolean isDebugEnabled = logger.isDebugEnabled();
diff --git a/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/MemberStarterRule.java b/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/MemberStarterRule.java
index 4ca92b6..1448392 100644
--- a/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/MemberStarterRule.java
+++ b/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/MemberStarterRule.java
@@ -21,7 +21,6 @@ import static org.apache.geode.distributed.ConfigurationProperties.JMX_MANAGER_P
 import static org.apache.geode.distributed.ConfigurationProperties.JMX_MANAGER_START;
 import static org.apache.geode.distributed.ConfigurationProperties.LOCATORS;
 import static org.apache.geode.distributed.ConfigurationProperties.LOG_FILE;
-import static org.apache.geode.distributed.ConfigurationProperties.MAX_WAIT_TIME_RECONNECT;
 import static org.apache.geode.distributed.ConfigurationProperties.MCAST_PORT;
 import static org.apache.geode.distributed.ConfigurationProperties.NAME;
 import static org.apache.geode.distributed.ConfigurationProperties.SECURITY_MANAGER;
@@ -120,9 +119,6 @@ public abstract class MemberStarterRule<T> extends SerializableExternalResource
     // initial values
     properties.setProperty(MCAST_PORT, "0");
     properties.setProperty(LOCATORS, "");
-    // set the reconnect wait time to 5 seconds in case some tests needs to reconnect in a timely
-    // manner.
-    properties.setProperty(MAX_WAIT_TIME_RECONNECT, "5000");
     systemProperties.setProperty(ClusterManagementService.FEATURE_FLAG, "true");
   }