You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2019/04/30 21:26:28 UTC

[geode] branch feature/GEODE-6724 created (now 0f41c5f)

This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a change to branch feature/GEODE-6724
in repository https://gitbox.apache.org/repos/asf/geode.git.


      at 0f41c5f  GEODE-6724 split brain formed on concurrent locator startup

This branch includes the following new commits:

     new 0f41c5f  GEODE-6724 split brain formed on concurrent locator startup

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[geode] 01/01: GEODE-6724 split brain formed on concurrent locator startup

Posted by bs...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch feature/GEODE-6724
in repository https://gitbox.apache.org/repos/asf/geode.git

commit 0f41c5f46fa731423f7b4d895cccbf8418b40da8
Author: Bruce Schuchardt <bs...@pivotal.io>
AuthorDate: Tue Apr 30 14:24:04 2019 -0700

    GEODE-6724 split brain formed on concurrent locator startup
    
    Ensure that either all locators have been contacted or a minimum
    number of join attempts (twice the number of locators) has occurred
    before allowing a member to start its own cluster.
    
    If all locators have been contacted we ought to have a sufficient
    registration pool to choose a membership coordinator during concurrent
    startup.
---
 .../apache/geode/distributed/LocatorDUnitTest.java | 59 +++++++++++++++++++++-
 .../membership/gms/membership/GMSJoinLeave.java    | 13 +++--
 2 files changed, 66 insertions(+), 6 deletions(-)

diff --git a/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java b/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
index 4268596..e07f572 100644
--- a/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
+++ b/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
@@ -14,6 +14,7 @@
  */
 package org.apache.geode.distributed;
 
+import static java.util.concurrent.TimeUnit.MINUTES;
 import static java.util.concurrent.TimeUnit.SECONDS;
 import static org.apache.geode.distributed.ConfigurationProperties.DISABLE_AUTO_RECONNECT;
 import static org.apache.geode.distributed.ConfigurationProperties.ENABLE_CLUSTER_CONFIGURATION;
@@ -1265,11 +1266,64 @@ public class LocatorDUnitTest implements java.io.Serializable {
     }
   }
 
+  @Test
+  public void testConcurrentLocatorStartup() throws Exception {
+    List<AvailablePort.Keeper> portKeepers =
+        AvailablePortHelper.getRandomAvailableTCPPortKeepers(4);
+    List<AsyncInvocation<Object>> asyncInvocations = new ArrayList(portKeepers.size());
+    StringBuilder sb = new StringBuilder(100);
+    for (int i = 0; i < portKeepers.size(); i++) {
+      AvailablePort.Keeper keeper = portKeepers.get(i);
+      sb.append("localhost[").append(keeper.getPort()).append("]");
+      if (i < portKeepers.size() - 1) {
+        sb.append(',');
+      }
+    }
+    String locators = sb.toString();
+    Properties dsProps = getClusterProperties(locators, "false");
+    for (int i = 0; i < portKeepers.size(); i++) {
+      AvailablePort.Keeper keeper = portKeepers.get(i);
+      final int port = keeper.getPort();
+      DistributedTestUtils.deleteLocatorStateFile(port);
+      keeper.release();
+      AsyncInvocation<Object> startLocator = VM.getVM(i).invokeAsync("start locator " + i, () -> {
+        DUnitBlackboard blackboard = getBlackboard();
+        blackboard.signalGate("" + port);
+        try {
+          blackboard.waitForGate("startLocators", 5, MINUTES);
+        } catch (InterruptedException e) {
+          throw new RuntimeException("test was interrupted");
+        }
+        startLocatorBase(dsProps, port);
+        assertTrue(isSystemConnected());
+        System.out.println("Locator startup completed");
+      });
+      asyncInvocations.add(startLocator);
+      getBlackboard().waitForGate("" + port, 5, MINUTES);
+    }
+
+    getBlackboard().signalGate("startLocators");
+    int expectedCount = asyncInvocations.size() - 1;
+    for (int i = 0; i < asyncInvocations.size(); i++) {
+      asyncInvocations.get(i).await();
+    }
+    for (int i = 0; i < asyncInvocations.size(); i++) {
+      assertTrue(VM.getVM(i).invoke("assert all in same cluster", () -> CacheFactory
+          .getAnyInstance().getDistributedSystem().getAllOtherMembers().size() == expectedCount));
+    }
+    for (int i = 0; i < asyncInvocations.size(); i++) {
+      VM.getVM(i).invoke(() -> {
+        Locator.getLocator().stop();
+        system = null;
+      });
+    }
+  }
+
   /**
-   * Tests starting multiple locators in multiple VMs.
+   * Tests starting two locators and two servers in different JVMs
    */
   @Test
-  public void testMultipleLocators() {
+  public void testTwoLocatorsTwoServers() {
     VM vm0 = VM.getVM(0);
     VM vm1 = VM.getVM(1);
     VM vm2 = VM.getVM(2);
@@ -1904,6 +1958,7 @@ public class LocatorDUnitTest implements java.io.Serializable {
     properties.put(MEMBER_TIMEOUT, "2000");
     properties.put(LOG_LEVEL, logger.getLevel().name());
     properties.put(ENABLE_CLUSTER_CONFIGURATION, "false");
+    properties.put(USE_CLUSTER_CONFIGURATION, "false");
     properties.put(LOCATOR_WAIT_TIME, "10"); // seconds
     return properties;
   }
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index e529c9e..d7712c1 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -279,13 +279,13 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
 
     public String toString() {
       StringBuffer sb = new StringBuffer(200);
-      sb.append("SearchState(locatorsContacted=").append(locatorsContacted)
+      sb.append("locatorsContacted=").append(locatorsContacted)
           .append("; findInViewResponses=").append(joinedMembersContacted)
           .append("; alreadyTried=").append(alreadyTried).append("; registrants=")
           .append(registrants).append("; possibleCoordinator=").append(possibleCoordinator)
           .append("; viewId=").append(viewId).append("; hasContactedAJoinedLocator=")
           .append(hasContactedAJoinedLocator).append("; view=").append(view).append("; responses=")
-          .append(responses).append(")");
+          .append(responses);
       return sb.toString();
     }
   }
@@ -323,6 +323,7 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
       long startTime = System.currentTimeMillis();
       long locatorGiveUpTime = startTime + locatorWaitTime;
       long giveupTime = startTime + timeout;
+      int minimumRetriesBeforeBecomingCoordinator = locators.size() * 2;
 
       for (int tries = 0; !this.isJoined && !this.isStopping; tries++) {
         logger.debug("searching for the membership coordinator");
@@ -333,8 +334,11 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
           logger.info("found possible coordinator {}", state.possibleCoordinator);
           if (localAddress.getNetMember().preferredForCoordinator()
               && state.possibleCoordinator.equals(this.localAddress)) {
+            // if we haven't contacted a member of a cluster maybe this node should
+            // become the coordinator.
             if (state.joinedMembersContacted <= 0 &&
-                (tries > 2 || System.currentTimeMillis() < giveupTime)) {
+                (tries >= minimumRetriesBeforeBecomingCoordinator ||
+                    state.locatorsContacted >= locators.size())) {
               synchronized (viewInstallationLock) {
                 becomeCoordinator();
               }
@@ -376,10 +380,11 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
               logger.debug("sleeping for {} before making another attempt to find the coordinator",
                   retrySleep);
               Thread.sleep(retrySleep);
-            }
+            } /* else { */
             // since we were given a coordinator that couldn't be used we should keep trying
             tries = 0;
             giveupTime = System.currentTimeMillis() + timeout;
+            // }
           }
         } catch (InterruptedException e) {
           logger.debug("retry sleep interrupted - giving up on joining the distributed system");