You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2018/01/09 00:00:55 UTC

[geode] 01/01: GEODE-3588 2 restarts of Locator results in split brain

This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch feature/GEODE-3588
in repository https://gitbox.apache.org/repos/asf/geode.git

commit 89bf34c39f3df4ed7b16d6c9a256e2d26b9d2267
Author: Bruce Schuchardt <bs...@pivotal.io>
AuthorDate: Mon Jan 8 15:57:09 2018 -0800

    GEODE-3588 2 restarts of Locator results in split brain
    
    Udo's fix for GEODE-870 added a new boolean instance variable to
    GMSJoinLeave to tell its ViewCreator thread to shut down.  This works
    but the state was never being reset after its first use.  This caused
    Subsequent ViewCreator threads to shut down immediately.  The only
    way to fix this condition without a patch is to restart the coordinator node.
    
    The patch moves this boolean variable to the ViewCreator thread so that
    it is automatically reset when a new ViewCreator is instantiated.
    
    I also did a little code cleanup, moving GMSJoinLeave methods from the
    end of the file to where its other methods are located and adding
    a setShutdownFlag() method during debugging so I could isolate what
    was happening.
---
 .../membership/gms/membership/GMSJoinLeave.java    | 140 +++++++++++----------
 .../membership/gms/messenger/JGroupsMessenger.java |   2 +-
 .../apache/geode/distributed/LocatorDUnitTest.java |  53 ++++++++
 .../gms/membership/GMSJoinLeaveJUnitTest.java      |   4 +-
 4 files changed, 133 insertions(+), 66 deletions(-)

diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index 3378000..0db5cd6 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -255,11 +255,6 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
    */
   NetView quorumLostView;
 
-  /**
-   * a flag to mark a coordinator's viewCreator for shutdown
-   */
-  private boolean markViewCreatorForShutdown = false;
-
   static class SearchState {
     Set<InternalDistributedMember> alreadyTried = new HashSet<>();
     Set<InternalDistributedMember> registrants = new HashSet<>();
@@ -533,7 +528,7 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
    */
   private void processJoinRequest(JoinRequestMessage incomingRequest) {
 
-    logger.info("received join request from {}", incomingRequest.getMemberID());
+    logger.info("Received a join request from {}", incomingRequest.getMemberID());
 
     if (!ALLOW_OLD_VERSION_FOR_TESTING
         && incomingRequest.getMemberID().getVersionObject().compareTo(Version.CURRENT) < 0) {
@@ -570,7 +565,6 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
     // services.getMessenger().send(joinResponseMessage);
     // return;
     // }
-    logger.info("Received a join request from " + incomingRequest.getSender());
 
     recordViewRequest(incomingRequest);
   }
@@ -930,7 +924,7 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
       // if another server is the coordinator - GEODE-870
       if (isCoordinator && !localAddress.equals(view.getCoordinator())
           && getViewCreator() != null) {
-        markViewCreatorForShutdown = true;
+        getViewCreator().markViewCreatorForShutdown();
         this.isCoordinator = false;
       }
       installView(view);
@@ -1686,6 +1680,50 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
     processLeaveRequest(msg);
   }
 
+  boolean checkIfAvailable(InternalDistributedMember fmbr) {
+    // return the member id if it fails health checks
+    logger.info("checking state of member " + fmbr);
+    if (services.getHealthMonitor().checkIfAvailable(fmbr,
+        "Member failed to acknowledge a membership view", false)) {
+      logger.info("member " + fmbr + " passed availability check");
+      return true;
+    }
+    logger.info("member " + fmbr + " failed availability check");
+    return false;
+  }
+
+  private InternalDistributedMember getMemId(NetMember jgId,
+      List<InternalDistributedMember> members) {
+    for (InternalDistributedMember m : members) {
+      if (((GMSMember) m.getNetMember()).equals(jgId)) {
+        return m;
+      }
+    }
+    return null;
+  }
+
+  @Override
+  public InternalDistributedMember getMemberID(NetMember jgId) {
+    NetView v = currentView;
+    InternalDistributedMember ret = null;
+    if (v != null) {
+      ret = getMemId(jgId, v.getMembers());
+    }
+
+    if (ret == null) {
+      v = preparedView;
+      if (v != null) {
+        ret = getMemId(jgId, v.getMembers());
+      }
+    }
+
+    if (ret == null) {
+      return new InternalDistributedMember(jgId);
+    }
+
+    return ret;
+  }
+
   @Override
   public void disableDisconnectOnQuorumLossForTesting() {
     this.quorumRequired = false;
@@ -2004,6 +2042,8 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
     volatile boolean testFlagForRemovalRequest = false;
     // count of number of views abandoned due to conflicts
     volatile int abandonedViews = 0;
+    private boolean markViewCreatorForShutdown = false; // see GEODE-870
+
 
     /**
      * initial view to install. guarded by synch on ViewCreator
@@ -2027,7 +2067,7 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
     }
 
     void shutdown() {
-      shutdown = true;
+      setShutdownFlag();
       synchronized (viewRequests) {
         viewRequests.notifyAll();
         interrupt();
@@ -2097,18 +2137,34 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
           try {
             sleep(services.getConfig().getMemberTimeout());
           } catch (InterruptedException e2) {
-            shutdown = true;
+            setShutdownFlag();
             retry = false;
           }
         } catch (InterruptedException e) {
-          shutdown = true;
+          setShutdownFlag();
         } catch (DistributedSystemDisconnectedException e) {
-          shutdown = true;
+          setShutdownFlag();
         }
       } while (retry);
     }
 
     /**
+     * marks this ViewCreator as being shut down. It may be some short amount of time before the
+     * ViewCreator thread exits.
+     */
+    private void setShutdownFlag() {
+      shutdown = true;
+    }
+
+    /**
+     * This allows GMSJoinLeave to tell the ViewCreator to shut down after finishing its current
+     * task. See GEODE-870.
+     */
+    private void markViewCreatorForShutdown() {
+      this.markViewCreatorForShutdown = true;
+    }
+
+    /**
      * During initial view processing a prepared view was discovered. This method will extract its
      * new members and create a new initial view containing them.
      *
@@ -2214,19 +2270,19 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
               try {
                 sleep(services.getConfig().getMemberTimeout());
               } catch (InterruptedException e2) {
-                shutdown = true;
+                setShutdownFlag();
               }
             } catch (DistributedSystemDisconnectedException e) {
-              shutdown = true;
+              setShutdownFlag();
             } catch (InterruptedException e) {
               logger.info("View Creator thread interrupted");
-              shutdown = true;
+              setShutdownFlag();
             }
             requests = null;
           }
         }
       } finally {
-        shutdown = true;
+        setShutdownFlag();
         informToPendingJoinRequests();
         org.apache.geode.distributed.internal.membership.gms.interfaces.Locator locator =
             services.getLocator();
@@ -2443,7 +2499,7 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
           Set<InternalDistributedMember> crashes = newView.getActualCrashedMembers(currentView);
           forceDisconnect(LocalizedStrings.Network_partition_detected
               .toLocalizedString(crashes.size(), crashes));
-          shutdown = true;
+          setShutdownFlag();
           return;
         }
 
@@ -2498,7 +2554,7 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
             // this member may have been kicked out of the conflicting view
             if (failures.contains(localAddress)) {
               forceDisconnect("I am no longer a member of the distributed system");
-              shutdown = true;
+              setShutdownFlag();
               return;
             }
             List<InternalDistributedMember> newMembers = conflictingView.getNewMembers();
@@ -2568,13 +2624,13 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
       // can be transmitted to the new members w/o including it in the view message
 
       if (markViewCreatorForShutdown && getViewCreator() != null) {
-        shutdown = true;
+        setShutdownFlag();
       }
 
       // after sending a final view we need to stop this thread if
       // the GMS is shutting down
       if (isStopping()) {
-        shutdown = true;
+        setShutdownFlag();
       }
     }
 
@@ -2705,54 +2761,12 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
       return result;
     }
 
-    boolean getTestFlageForRemovalRequest() {
+    boolean getTestFlagForRemovalRequest() {
       return testFlagForRemovalRequest;
     }
-  }
 
-  boolean checkIfAvailable(InternalDistributedMember fmbr) {
-    // return the member id if it fails health checks
-    logger.info("checking state of member " + fmbr);
-    if (services.getHealthMonitor().checkIfAvailable(fmbr,
-        "Member failed to acknowledge a membership view", false)) {
-      logger.info("member " + fmbr + " passed availability check");
-      return true;
-    }
-    logger.info("member " + fmbr + " failed availability check");
-    return false;
-  }
-
-  private InternalDistributedMember getMemId(NetMember jgId,
-      List<InternalDistributedMember> members) {
-    for (InternalDistributedMember m : members) {
-      if (((GMSMember) m.getNetMember()).equals(jgId)) {
-        return m;
-      }
-    }
-    return null;
   }
 
-  @Override
-  public InternalDistributedMember getMemberID(NetMember jgId) {
-    NetView v = currentView;
-    InternalDistributedMember ret = null;
-    if (v != null) {
-      ret = getMemId(jgId, v.getMembers());
-    }
-
-    if (ret == null) {
-      v = preparedView;
-      if (v != null) {
-        ret = getMemId(jgId, v.getMembers());
-      }
-    }
-
-    if (ret == null) {
-      return new InternalDistributedMember(jgId);
-    }
-
-    return ret;
-  }
 
   static class ViewAbandonedException extends Exception {
   }
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
index f524f68..962572a 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
@@ -409,7 +409,7 @@ public class JGroupsMessenger implements Messenger {
     mbrs.addAll(v.getMembers().stream().map(JGAddress::new).collect(Collectors.toList()));
     ViewId vid = new ViewId(new JGAddress(v.getCoordinator()), v.getViewId());
     View jgv = new View(vid, new ArrayList<>(mbrs));
-    logger.trace("installing JGroups view: {}", jgv);
+    logger.trace("installing view into JGroups stack: {}", jgv);
     this.myChannel.down(new Event(Event.VIEW_CHANGE, jgv));
 
     addressesWithIoExceptionsProcessed.clear();
diff --git a/geode-core/src/test/java/org/apache/geode/distributed/LocatorDUnitTest.java b/geode-core/src/test/java/org/apache/geode/distributed/LocatorDUnitTest.java
index 94c0fd8..883194b 100644
--- a/geode-core/src/test/java/org/apache/geode/distributed/LocatorDUnitTest.java
+++ b/geode-core/src/test/java/org/apache/geode/distributed/LocatorDUnitTest.java
@@ -79,6 +79,7 @@ import org.apache.geode.distributed.internal.membership.gms.membership.GMSJoinLe
 import org.apache.geode.internal.Assert;
 import org.apache.geode.internal.AvailablePort;
 import org.apache.geode.internal.AvailablePortHelper;
+import org.apache.geode.internal.OSProcess;
 import org.apache.geode.internal.cache.GemFireCacheImpl;
 import org.apache.geode.internal.logging.InternalLogWriter;
 import org.apache.geode.internal.logging.LocalLogWriter;
@@ -2022,6 +2023,58 @@ public class LocatorDUnitTest extends JUnit4DistributedTestCase {
   }
 
   /**
+   * See GEODE-3588 - a locator is restarted twice with a server and ends up in a split-brain
+   */
+  @Test
+  public void testRestartLocatorMultipleTimes() throws Exception {
+    disconnectAllFromDS();
+    port1 = AvailablePort.getRandomAvailablePort(AvailablePort.SOCKET);
+    DistributedTestUtils.deleteLocatorStateFile(port1);
+    File logFile = new File("");
+    File stateFile = new File("locator" + port1 + "state.dat");
+    VM vm0 = Host.getHost(0).getVM(0);
+    final Properties p = new Properties();
+    p.setProperty(LOCATORS, Host.getHost(0).getHostName() + "[" + port1 + "]");
+    p.setProperty(MCAST_PORT, "0");
+    p.setProperty(ENABLE_CLUSTER_CONFIGURATION, "false");
+    p.setProperty(LOG_LEVEL, "finest");
+    addDSProps(p);
+    if (stateFile.exists()) {
+      stateFile.delete();
+    }
+
+    Locator locator = Locator.startLocatorAndDS(port1, logFile, p);
+
+    vm0.invoke(() -> {
+      DistributedSystem.connect(p);
+      return null;
+    });
+
+    try {
+      locator.stop();
+      locator = Locator.startLocatorAndDS(port1, logFile, p);
+      assertEquals(2, ((InternalDistributedSystem) locator.getDistributedSystem()).getDM()
+          .getViewMembers().size());
+
+      locator.stop();
+      locator = Locator.startLocatorAndDS(port1, logFile, p);
+      vm0.invoke("dump stack", () -> {
+        OSProcess.printStacks(0);
+      });
+      assertEquals(2, ((InternalDistributedSystem) locator.getDistributedSystem()).getDM()
+          .getViewMembers().size());
+
+    } finally {
+      vm0.invoke("disconnect", () -> {
+        DistributedSystem.connect(p).disconnect();
+        return null;
+      });
+      locator.stop();
+    }
+
+  }
+
+  /**
    * return the distributed member id for the ds on this vm
    */
   public static DistributedMember getDistributedMember(Properties props) {
diff --git a/geode-core/src/test/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java b/geode-core/src/test/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
index 1883777..9e7b8a2 100644
--- a/geode-core/src/test/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
+++ b/geode-core/src/test/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
@@ -1330,7 +1330,7 @@ public class GMSJoinLeaveJUnitTest {
     this.removeMember = null;
 
     assertTrue("testFlagForRemovalRequest should be true",
-        gmsJoinLeave.getViewCreator().getTestFlageForRemovalRequest());
+        gmsJoinLeave.getViewCreator().getTestFlagForRemovalRequest());
   }
 
   @Test
@@ -1353,7 +1353,7 @@ public class GMSJoinLeaveJUnitTest {
     this.leaveMember = null;
 
     assertTrue("testFlagForRemovalRequest should be true",
-        gmsJoinLeave.getViewCreator().getTestFlageForRemovalRequest());
+        gmsJoinLeave.getViewCreator().getTestFlagForRemovalRequest());
   }
 
   private void installView() throws Exception {

-- 
To stop receiving notification emails like this one, please contact
"commits@geode.apache.org" <co...@geode.apache.org>.