You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2019/04/30 21:26:29 UTC

[geode] 01/01: GEODE-6724 split brain formed on concurrent locator startup

This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch feature/GEODE-6724
in repository https://gitbox.apache.org/repos/asf/geode.git

commit 0f41c5f46fa731423f7b4d895cccbf8418b40da8
Author: Bruce Schuchardt <bs...@pivotal.io>
AuthorDate: Tue Apr 30 14:24:04 2019 -0700

    GEODE-6724 split brain formed on concurrent locator startup
    
    Ensure that either all locators have been contacted or a decent
    number of attempts to join have occurred before allowing a member to
    start its own cluster.
    
    If all locators have been contacted we ought to have a sufficient
    registration pool to choose a membership coordinator during concurrent
    startup.
---
 .../apache/geode/distributed/LocatorDUnitTest.java | 59 +++++++++++++++++++++-
 .../membership/gms/membership/GMSJoinLeave.java    | 13 +++--
 2 files changed, 66 insertions(+), 6 deletions(-)

diff --git a/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java b/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
index 4268596..e07f572 100644
--- a/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
+++ b/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
@@ -14,6 +14,7 @@
  */
 package org.apache.geode.distributed;
 
+import static java.util.concurrent.TimeUnit.MINUTES;
 import static java.util.concurrent.TimeUnit.SECONDS;
 import static org.apache.geode.distributed.ConfigurationProperties.DISABLE_AUTO_RECONNECT;
 import static org.apache.geode.distributed.ConfigurationProperties.ENABLE_CLUSTER_CONFIGURATION;
@@ -1265,11 +1266,64 @@ public class LocatorDUnitTest implements java.io.Serializable {
     }
   }
 
+  @Test
+  public void testConcurrentLocatorStartup() throws Exception {
+    List<AvailablePort.Keeper> portKeepers =
+        AvailablePortHelper.getRandomAvailableTCPPortKeepers(4);
+    List<AsyncInvocation<Object>> asyncInvocations = new ArrayList<>(portKeepers.size());
+    StringBuilder sb = new StringBuilder(100);
+    for (int i = 0; i < portKeepers.size(); i++) {
+      AvailablePort.Keeper keeper = portKeepers.get(i);
+      sb.append("localhost[").append(keeper.getPort()).append("]");
+      if (i < portKeepers.size() - 1) {
+        sb.append(',');
+      }
+    }
+    String locators = sb.toString();
+    Properties dsProps = getClusterProperties(locators, "false");
+    for (int i = 0; i < portKeepers.size(); i++) {
+      AvailablePort.Keeper keeper = portKeepers.get(i);
+      final int port = keeper.getPort();
+      DistributedTestUtils.deleteLocatorStateFile(port);
+      keeper.release(); // give up the reserved port so the locator started below can use it
+      AsyncInvocation<Object> startLocator = VM.getVM(i).invokeAsync("start locator " + i, () -> {
+        DUnitBlackboard blackboard = getBlackboard();
+        blackboard.signalGate("" + port); // tell the controller this VM is ready
+        try {
+          blackboard.waitForGate("startLocators", 5, MINUTES); // signalled by the controller
+        } catch (InterruptedException e) {
+          throw new RuntimeException("test was interrupted", e);
+        }
+        startLocatorBase(dsProps, port);
+        assertTrue(isSystemConnected());
+        System.out.println("Locator startup completed");
+      });
+      asyncInvocations.add(startLocator);
+      getBlackboard().waitForGate("" + port, 5, MINUTES);
+    }
+
+    getBlackboard().signalGate("startLocators"); // let every waiting VM start its locator
+    int expectedCount = asyncInvocations.size() - 1; // each member must see all the others
+    for (int i = 0; i < asyncInvocations.size(); i++) {
+      asyncInvocations.get(i).await();
+    }
+    for (int i = 0; i < asyncInvocations.size(); i++) {
+      assertTrue(VM.getVM(i).invoke("assert all in same cluster", () -> CacheFactory
+          .getAnyInstance().getDistributedSystem().getAllOtherMembers().size() == expectedCount));
+    }
+    for (int i = 0; i < asyncInvocations.size(); i++) {
+      VM.getVM(i).invoke(() -> {
+        Locator.getLocator().stop();
+        system = null;
+      });
+    }
+  }
+
   /**
-   * Tests starting multiple locators in multiple VMs.
+   * Tests starting two locators and two servers in different JVMs
    */
   @Test
-  public void testMultipleLocators() {
+  public void testTwoLocatorsTwoServers() {
     VM vm0 = VM.getVM(0);
     VM vm1 = VM.getVM(1);
     VM vm2 = VM.getVM(2);
@@ -1904,6 +1958,7 @@ public class LocatorDUnitTest implements java.io.Serializable {
     properties.put(MEMBER_TIMEOUT, "2000");
     properties.put(LOG_LEVEL, logger.getLevel().name());
     properties.put(ENABLE_CLUSTER_CONFIGURATION, "false");
+    properties.put(USE_CLUSTER_CONFIGURATION, "false");
     properties.put(LOCATOR_WAIT_TIME, "10"); // seconds
     return properties;
   }
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index e529c9e..d7712c1 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -279,13 +279,13 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
 
     public String toString() {
       StringBuffer sb = new StringBuffer(200);
-      sb.append("SearchState(locatorsContacted=").append(locatorsContacted)
+      sb.append("locatorsContacted=").append(locatorsContacted)
           .append("; findInViewResponses=").append(joinedMembersContacted)
           .append("; alreadyTried=").append(alreadyTried).append("; registrants=")
           .append(registrants).append("; possibleCoordinator=").append(possibleCoordinator)
           .append("; viewId=").append(viewId).append("; hasContactedAJoinedLocator=")
           .append(hasContactedAJoinedLocator).append("; view=").append(view).append("; responses=")
-          .append(responses).append(")");
+          .append(responses);
       return sb.toString();
     }
   }
@@ -323,6 +323,7 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
       long startTime = System.currentTimeMillis();
       long locatorGiveUpTime = startTime + locatorWaitTime;
       long giveupTime = startTime + timeout;
+      int minimumRetriesBeforeBecomingCoordinator = locators.size() * 2;
 
       for (int tries = 0; !this.isJoined && !this.isStopping; tries++) {
         logger.debug("searching for the membership coordinator");
@@ -333,8 +334,11 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
           logger.info("found possible coordinator {}", state.possibleCoordinator);
           if (localAddress.getNetMember().preferredForCoordinator()
               && state.possibleCoordinator.equals(this.localAddress)) {
+            // if we haven't contacted a member of a cluster maybe this node should
+            // become the coordinator.
             if (state.joinedMembersContacted <= 0 &&
-                (tries > 2 || System.currentTimeMillis() < giveupTime)) {
+                (tries >= minimumRetriesBeforeBecomingCoordinator ||
+                    state.locatorsContacted >= locators.size())) {
               synchronized (viewInstallationLock) {
                 becomeCoordinator();
               }
@@ -376,10 +380,11 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
               logger.debug("sleeping for {} before making another attempt to find the coordinator",
                   retrySleep);
               Thread.sleep(retrySleep);
-            }
+            } /* else { */
             // since we were given a coordinator that couldn't be used we should keep trying
             tries = 0;
             giveupTime = System.currentTimeMillis() + timeout;
+            // }
           }
         } catch (InterruptedException e) {
           logger.debug("retry sleep interrupted - giving up on joining the distributed system");