You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2019/04/30 21:26:29 UTC
[geode] 01/01: GEODE-6724 split brain formed on concurrent locator startup
This is an automated email from the ASF dual-hosted git repository.
bschuchardt pushed a commit to branch feature/GEODE-6724
in repository https://gitbox.apache.org/repos/asf/geode.git
commit 0f41c5f46fa731423f7b4d895cccbf8418b40da8
Author: Bruce Schuchardt <bs...@pivotal.io>
AuthorDate: Tue Apr 30 14:24:04 2019 -0700
GEODE-6724 split brain formed on concurrent locator startup
Ensure that either all locators have been contacted or a minimum
number of join attempts (twice the number of configured locators) has
been made before allowing a member to start its own cluster.
If all locators have been contacted we ought to have a sufficient
registration pool to choose a membership coordinator during concurrent
startup.
---
.../apache/geode/distributed/LocatorDUnitTest.java | 59 +++++++++++++++++++++-
.../membership/gms/membership/GMSJoinLeave.java | 13 +++--
2 files changed, 66 insertions(+), 6 deletions(-)
diff --git a/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java b/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
index 4268596..e07f572 100644
--- a/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
+++ b/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
@@ -14,6 +14,7 @@
*/
package org.apache.geode.distributed;
+import static java.util.concurrent.TimeUnit.MINUTES;
import static java.util.concurrent.TimeUnit.SECONDS;
import static org.apache.geode.distributed.ConfigurationProperties.DISABLE_AUTO_RECONNECT;
import static org.apache.geode.distributed.ConfigurationProperties.ENABLE_CLUSTER_CONFIGURATION;
@@ -1265,11 +1266,64 @@ public class LocatorDUnitTest implements java.io.Serializable {
}
}
+ @Test
+ public void testConcurrentLocatorStartup() throws Exception {
+ List<AvailablePort.Keeper> portKeepers =
+ AvailablePortHelper.getRandomAvailableTCPPortKeepers(4);
+ List<AsyncInvocation<Object>> asyncInvocations = new ArrayList(portKeepers.size());
+ StringBuilder sb = new StringBuilder(100);
+ for (int i = 0; i < portKeepers.size(); i++) {
+ AvailablePort.Keeper keeper = portKeepers.get(i);
+ sb.append("localhost[").append(keeper.getPort()).append("]");
+ if (i < portKeepers.size() - 1) {
+ sb.append(',');
+ }
+ }
+ String locators = sb.toString();
+ Properties dsProps = getClusterProperties(locators, "false");
+ for (int i = 0; i < portKeepers.size(); i++) {
+ AvailablePort.Keeper keeper = portKeepers.get(i);
+ final int port = keeper.getPort();
+ DistributedTestUtils.deleteLocatorStateFile(port);
+ keeper.release();
+ AsyncInvocation<Object> startLocator = VM.getVM(i).invokeAsync("start locator " + i, () -> {
+ DUnitBlackboard blackboard = getBlackboard();
+ blackboard.signalGate("" + port);
+ try {
+ blackboard.waitForGate("startLocators", 5, MINUTES);
+ } catch (InterruptedException e) {
+ throw new RuntimeException("test was interrupted");
+ }
+ startLocatorBase(dsProps, port);
+ assertTrue(isSystemConnected());
+ System.out.println("Locator startup completed");
+ });
+ asyncInvocations.add(startLocator);
+ getBlackboard().waitForGate("" + port, 5, MINUTES);
+ }
+
+ getBlackboard().signalGate("startLocators");
+ int expectedCount = asyncInvocations.size() - 1;
+ for (int i = 0; i < asyncInvocations.size(); i++) {
+ asyncInvocations.get(i).await();
+ }
+ for (int i = 0; i < asyncInvocations.size(); i++) {
+ assertTrue(VM.getVM(i).invoke("assert all in same cluster", () -> CacheFactory
+ .getAnyInstance().getDistributedSystem().getAllOtherMembers().size() == expectedCount));
+ }
+ for (int i = 0; i < asyncInvocations.size(); i++) {
+ VM.getVM(i).invoke(() -> {
+ Locator.getLocator().stop();
+ system = null;
+ });
+ }
+ }
+
/**
- * Tests starting multiple locators in multiple VMs.
+ * Tests starting two locators and two servers in different JVMs
*/
@Test
- public void testMultipleLocators() {
+ public void testTwoLocatorsTwoServers() {
VM vm0 = VM.getVM(0);
VM vm1 = VM.getVM(1);
VM vm2 = VM.getVM(2);
@@ -1904,6 +1958,7 @@ public class LocatorDUnitTest implements java.io.Serializable {
properties.put(MEMBER_TIMEOUT, "2000");
properties.put(LOG_LEVEL, logger.getLevel().name());
properties.put(ENABLE_CLUSTER_CONFIGURATION, "false");
+ properties.put(USE_CLUSTER_CONFIGURATION, "false");
properties.put(LOCATOR_WAIT_TIME, "10"); // seconds
return properties;
}
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index e529c9e..d7712c1 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -279,13 +279,13 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
public String toString() {
StringBuffer sb = new StringBuffer(200);
- sb.append("SearchState(locatorsContacted=").append(locatorsContacted)
+ sb.append("locatorsContacted=").append(locatorsContacted)
.append("; findInViewResponses=").append(joinedMembersContacted)
.append("; alreadyTried=").append(alreadyTried).append("; registrants=")
.append(registrants).append("; possibleCoordinator=").append(possibleCoordinator)
.append("; viewId=").append(viewId).append("; hasContactedAJoinedLocator=")
.append(hasContactedAJoinedLocator).append("; view=").append(view).append("; responses=")
- .append(responses).append(")");
+ .append(responses);
return sb.toString();
}
}
@@ -323,6 +323,7 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
long startTime = System.currentTimeMillis();
long locatorGiveUpTime = startTime + locatorWaitTime;
long giveupTime = startTime + timeout;
+ int minimumRetriesBeforeBecomingCoordinator = locators.size() * 2;
for (int tries = 0; !this.isJoined && !this.isStopping; tries++) {
logger.debug("searching for the membership coordinator");
@@ -333,8 +334,11 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
logger.info("found possible coordinator {}", state.possibleCoordinator);
if (localAddress.getNetMember().preferredForCoordinator()
&& state.possibleCoordinator.equals(this.localAddress)) {
+ // if we haven't contacted a member of a cluster maybe this node should
+ // become the coordinator.
if (state.joinedMembersContacted <= 0 &&
- (tries > 2 || System.currentTimeMillis() < giveupTime)) {
+ (tries >= minimumRetriesBeforeBecomingCoordinator ||
+ state.locatorsContacted >= locators.size())) {
synchronized (viewInstallationLock) {
becomeCoordinator();
}
@@ -376,10 +380,11 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
logger.debug("sleeping for {} before making another attempt to find the coordinator",
retrySleep);
Thread.sleep(retrySleep);
- }
+ } /* else { */
// since we were given a coordinator that couldn't be used we should keep trying
tries = 0;
giveupTime = System.currentTimeMillis() + timeout;
+ // }
}
} catch (InterruptedException e) {
logger.debug("retry sleep interrupted - giving up on joining the distributed system");