You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2019/04/17 17:30:54 UTC
[geode] branch develop updated: GEODE-6646 - CI failure in
serverRestarsAfterLocatorReconnects
This is an automated email from the ASF dual-hosted git repository.
bschuchardt pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/geode.git
The following commit(s) were added to refs/heads/develop by this push:
new 0e90b5a GEODE-6646 - CI failure in serverRestarsAfterLocatorReconnects
0e90b5a is described below
commit 0e90b5ac2f3ada0e2813d6b518c7c70cf1986b45
Author: Bruce Schuchardt <bs...@pivotal.io>
AuthorDate: Wed Apr 17 10:29:44 2019 -0700
GEODE-6646 - CI failure in serverRestarsAfterLocatorReconnects
In this test there is a locator and there are two servers. The
locator and second servers are forced out of the cluster and
auto-reconnect. In the failure the first server becomes membership
coordinator but the other processes fail to join its cluster when
they restart. They are misconfigured to have a max-wait-time-reconnect
that is too short (5 seconds instead of 60) to allow the first server to detect failures
before the failed server and locator start trying to reconnect.
The fix is in InternalDistributedSystem and ensures that the
waiting period before attempting to reconnect is a sufficient
multiple of the member-timeout setting.
---
.../management/JMXMBeanReconnectDUnitTest.java | 11 +++++++---
.../ClusterConfigLocatorRestartDUnitTest.java | 25 +++++++++++++++-------
.../internal/InternalDistributedSystem.java | 8 +++++++
.../geode/test/junit/rules/MemberStarterRule.java | 4 ----
4 files changed, 33 insertions(+), 15 deletions(-)
diff --git a/geode-core/src/distributedTest/java/org/apache/geode/management/JMXMBeanReconnectDUnitTest.java b/geode-core/src/distributedTest/java/org/apache/geode/management/JMXMBeanReconnectDUnitTest.java
index b7344ae..f83797f 100644
--- a/geode-core/src/distributedTest/java/org/apache/geode/management/JMXMBeanReconnectDUnitTest.java
+++ b/geode-core/src/distributedTest/java/org/apache/geode/management/JMXMBeanReconnectDUnitTest.java
@@ -16,6 +16,7 @@
package org.apache.geode.management;
import static java.util.stream.Collectors.toList;
+import static org.apache.geode.distributed.ConfigurationProperties.MAX_WAIT_TIME_RECONNECT;
import static org.apache.geode.management.ManagementService.getExistingManagementService;
import static org.apache.geode.test.awaitility.GeodeAwaitility.await;
import static org.apache.geode.test.dunit.internal.JUnit4DistributedTestCase.getBlackboard;
@@ -85,15 +86,18 @@ public class JMXMBeanReconnectDUnitTest {
@Before
public void before() throws Exception {
+ Properties properties = new Properties();
+ properties.setProperty(MAX_WAIT_TIME_RECONNECT, "5000");
+
locator1 = lsRule.startLocatorVM(LOCATOR_1_VM_INDEX, locator1Properties());
locator1.waitTilLocatorFullyStarted();
locator2 = lsRule.startLocatorVM(LOCATOR_2_VM_INDEX, locator2Properties(), locator1.getPort());
locator2.waitTilLocatorFullyStarted();
- server1 = lsRule.startServerVM(SERVER_1_VM_INDEX, locator1.getPort());
+ server1 = lsRule.startServerVM(SERVER_1_VM_INDEX, properties, locator1.getPort());
// start an extra server to have more MBeans, but we don't need to use it in these tests
- lsRule.startServerVM(SERVER_2_VM_INDEX, locator1.getPort());
+ lsRule.startServerVM(SERVER_2_VM_INDEX, properties, locator1.getPort());
gfsh.connectAndVerify(locator1);
gfsh.executeAndAssertThat("create region --type=REPLICATE --name=" + REGION_PATH
@@ -317,7 +321,7 @@ public class JMXMBeanReconnectDUnitTest {
Properties props = new Properties();
props.setProperty(ConfigurationProperties.JMX_MANAGER_HOSTNAME_FOR_CLIENTS, "localhost");
props.setProperty(ConfigurationProperties.NAME, LOCATOR_1_NAME);
- props.setProperty(ConfigurationProperties.MAX_WAIT_TIME_RECONNECT, "5000");
+ props.setProperty(MAX_WAIT_TIME_RECONNECT, "5000");
return props;
}
@@ -326,6 +330,7 @@ public class JMXMBeanReconnectDUnitTest {
props.setProperty(ConfigurationProperties.JMX_MANAGER_HOSTNAME_FOR_CLIENTS, "localhost");
props.setProperty(ConfigurationProperties.NAME, LOCATOR_2_NAME);
props.setProperty(ConfigurationProperties.LOCATORS, "localhost[" + locator1.getPort() + "]");
+ props.setProperty(MAX_WAIT_TIME_RECONNECT, "5000");
return props;
}
}
diff --git a/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java b/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
index 0ddb364..81c3d0c 100644
--- a/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
+++ b/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
@@ -17,8 +17,11 @@ package org.apache.geode.management.internal.configuration;
+import static org.apache.geode.distributed.ConfigurationProperties.MAX_WAIT_TIME_RECONNECT;
import static org.apache.geode.test.awaitility.GeodeAwaitility.await;
+import java.util.Properties;
+
import org.junit.Rule;
import org.junit.Test;
@@ -61,10 +64,13 @@ public class ClusterConfigLocatorRestartDUnitTest {
.addIgnoredException("member unexpectedly shut down shared, unordered connection");
IgnoredException.addIgnoredException("Connection refused");
- MemberVM locator0 = rule.startLocatorVM(0);
+ Properties properties = new Properties();
+ properties.setProperty(MAX_WAIT_TIME_RECONNECT, "15000");
+
+ MemberVM locator0 = rule.startLocatorVM(0, properties);
- rule.startServerVM(1, locator0.getPort());
- MemberVM server2 = rule.startServerVM(2, locator0.getPort());
+ rule.startServerVM(1, properties, locator0.getPort());
+ MemberVM server2 = rule.startServerVM(2, properties, locator0.getPort());
addDisconnectListener(locator0);
@@ -90,18 +96,21 @@ public class ClusterConfigLocatorRestartDUnitTest {
IgnoredException.addIgnoredException("Connection refused");
- MemberVM locator0 = rule.startLocatorVM(0);
- MemberVM locator1 = rule.startLocatorVM(1, locator0.getPort());
+ Properties properties = new Properties();
+ properties.setProperty(MAX_WAIT_TIME_RECONNECT, "30000");
+
+ MemberVM locator0 = rule.startLocatorVM(0, properties);
+ MemberVM locator1 = rule.startLocatorVM(1, properties, locator0.getPort());
- MemberVM server2 = rule.startServerVM(2, locator0.getPort(), locator1.getPort());
- MemberVM server3 = rule.startServerVM(3, locator0.getPort(), locator1.getPort());
+ rule.startServerVM(2, properties, locator0.getPort(), locator1.getPort());
+ MemberVM server3 = rule.startServerVM(3, properties, locator0.getPort(), locator1.getPort());
// Shut down hard
rule.crashVM(0);
server3.forceDisconnect();
- rule.startServerVM(4, locator1.getPort(), locator0.getPort());
+ rule.startServerVM(4, properties, locator1.getPort(), locator0.getPort());
gfsh.connectAndVerify(locator1);
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
index 1b539e9..39e7101 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
@@ -2464,6 +2464,14 @@ public class InternalDistributedSystem extends DistributedSystem
configProps.putAll(this.config.toSecurityProperties());
int timeOut = oldConfig.getMaxWaitTimeForReconnect();
+ int memberTimeout = oldConfig.getMemberTimeout();
+ // we need to make sure that a surviving member is able
+ // to take over coordination before trying to auto-reconnect.
+ // failure detection can take 4 member-timeout intervals
+ // so we set that as a minimum. (suspect, check suspect, final check, send new view)
+ final int intervalsAllowedForFailureDetection = 4;
+ timeOut = Math.max(timeOut, memberTimeout * intervalsAllowedForFailureDetection);
+
int maxTries = oldConfig.getMaxNumReconnectTries();
final boolean isDebugEnabled = logger.isDebugEnabled();
diff --git a/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/MemberStarterRule.java b/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/MemberStarterRule.java
index 4ca92b6..1448392 100644
--- a/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/MemberStarterRule.java
+++ b/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/MemberStarterRule.java
@@ -21,7 +21,6 @@ import static org.apache.geode.distributed.ConfigurationProperties.JMX_MANAGER_P
import static org.apache.geode.distributed.ConfigurationProperties.JMX_MANAGER_START;
import static org.apache.geode.distributed.ConfigurationProperties.LOCATORS;
import static org.apache.geode.distributed.ConfigurationProperties.LOG_FILE;
-import static org.apache.geode.distributed.ConfigurationProperties.MAX_WAIT_TIME_RECONNECT;
import static org.apache.geode.distributed.ConfigurationProperties.MCAST_PORT;
import static org.apache.geode.distributed.ConfigurationProperties.NAME;
import static org.apache.geode.distributed.ConfigurationProperties.SECURITY_MANAGER;
@@ -120,9 +119,6 @@ public abstract class MemberStarterRule<T> extends SerializableExternalResource
// initial values
properties.setProperty(MCAST_PORT, "0");
properties.setProperty(LOCATORS, "");
- // set the reconnect wait time to 5 seconds in case some tests needs to reconnect in a timely
- // manner.
- properties.setProperty(MAX_WAIT_TIME_RECONNECT, "5000");
systemProperties.setProperty(ClusterManagementService.FEATURE_FLAG, "true");
}