You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2019/02/26 22:01:57 UTC

[geode] branch feature/GEODE-6451 updated: fixes for hung dunit runs The unicast receiver thread was becoming blocked if a forced disconnect occurred during reconnect because InternalDistributedSystem.disconnect sychronizes, for some reason, on GemFireCacheImpl.class. This reworks that logic to have the reconnect thread get a SystemConnectException forcing cleanup of the reconnecting InternalDistributedSystem in that thread.

This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch feature/GEODE-6451
in repository https://gitbox.apache.org/repos/asf/geode.git


The following commit(s) were added to refs/heads/feature/GEODE-6451 by this push:
     new 771969e  fixes for hung dunit runs The unicast receiver thread was becoming blocked if a forced disconnect occurred during reconnect because InternalDistributedSystem.disconnect sychronizes, for some reason, on GemFireCacheImpl.class.  This reworks that logic to have the reconnect thread get a SystemConnectException forcing cleanup of the reconnecting InternalDistributedSystem in that thread.
771969e is described below

commit 771969ea3d60e3522783c5a55b0efb42107d2d3d
Author: Bruce Schuchardt <bs...@pivotal.io>
AuthorDate: Tue Feb 26 14:01:14 2019 -0800

    fixes for hung dunit runs
    The unicast receiver thread was becoming blocked if a
    forced disconnect occurred during reconnect because
    InternalDistributedSystem.disconnect sychronizes, for some
    reason, on GemFireCacheImpl.class.  This reworks that
    logic to have the reconnect thread get a
    SystemConnectException forcing cleanup of the reconnecting
    InternalDistributedSystem in that thread.
---
 .../ClusterConfigLocatorRestartDUnitTest.java      |  2 ++
 .../internal/InternalDistributedSystem.java        |  2 ++
 .../distributed/internal/InternalLocator.java      |  8 +++++++-
 .../internal/membership/gms/ServiceConfig.java     |  5 ++++-
 .../internal/membership/gms/Services.java          |  1 +
 .../membership/gms/membership/GMSJoinLeave.java    | 22 +++++++++++++++++++---
 .../geode/internal/cache/GemFireCacheImpl.java     |  3 +++
 .../apache/geode/test/junit/rules/VMProvider.java  |  8 ++++----
 8 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java b/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
index d750969..fe5fb75 100644
--- a/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
+++ b/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
@@ -55,6 +55,8 @@ public class ClusterConfigLocatorRestartDUnitTest {
   @Test
   public void serverRestartsAfterLocatorReconnects() throws Exception {
     IgnoredException.addIgnoredException("org.apache.geode.ForcedDisconnectException: for testing");
+    IgnoredException.addIgnoredException("cluster configuration service not available");
+    IgnoredException.addIgnoredException("This thread has been stalled");
 
     MemberVM locator0 = rule.startLocatorVM(0);
 
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
index 4011669..cb4a36f 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
@@ -2354,6 +2354,8 @@ public class InternalDistributedSystem extends DistributedSystem
   @Override
   public boolean isReconnecting() {
     InternalDistributedSystem rds = this.reconnectDS;
+    logger.info("BRUCE: attemptingToReconnect={} reconnectCancelled={} rds={} isconnected={}",
+        attemptingToReconnect, reconnectCancelled, rds, (rds == null ? "" : rds.isConnected()));
     if (!attemptingToReconnect) {
       return false;
     }
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
index 28bd0bd..75200bf 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
@@ -1032,7 +1032,13 @@ public class InternalLocator extends Locator implements ConnectListener, LogConf
             setLocator(this);
           }
         }
-        ds.waitUntilReconnected(waitTime, TimeUnit.MILLISECONDS);
+        try {
+          ds.waitUntilReconnected(waitTime, TimeUnit.MILLISECONDS);
+        } catch (CancelException e) {
+          logger.info("Attempt to reconnect failed and further attempts have been terminated");
+          this.stoppedForReconnect = false;
+          return false;
+        }
       }
       InternalDistributedSystem newSystem = (InternalDistributedSystem) ds.getReconnectedSystem();
       if (newSystem != null) {
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/ServiceConfig.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/ServiceConfig.java
index faf8ebc..0fe55f0 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/ServiceConfig.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/ServiceConfig.java
@@ -35,7 +35,7 @@ public class ServiceConfig {
   private final int[] membershipPortRange;
   private final long memberTimeout;
 
-  private final boolean isReconnecting;
+  private boolean isReconnecting;
   private Integer lossThreshold;
   private final Integer memberWeight;
   private boolean networkPartitionDetectionEnabled;
@@ -157,4 +157,7 @@ public class ServiceConfig {
     networkPartitionDetectionEnabled = theConfig.getEnableNetworkPartitionDetection();
   }
 
+  public void setIsReconnecting(boolean b) {
+    this.isReconnecting = false;
+  }
 }
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/Services.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/Services.java
index e8bc0b9..7d403ab 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/Services.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/Services.java
@@ -225,6 +225,7 @@ public class Services {
     }
     logger.info("Stopping membership services");
     this.stopping = true;
+    config.setIsReconnecting(false);
     try {
       this.timer.cancel();
     } finally {
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index 8d6751e..a816e78 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -344,6 +344,9 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
             if (attemptToJoin()) {
               return true;
             }
+            if (this.isStopping) {
+              break;
+            }
             if (!state.possibleCoordinator.equals(localAddress)) {
               state.alreadyTried.add(state.possibleCoordinator);
             }
@@ -447,12 +450,13 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
       return isJoined;
     }
 
-    logger.debug("received join response {}", response);
+    logger.info("received join response {}", response);
     joinResponse[0] = null;
     String failReason = response.getRejectionMessage();
     if (failReason != null) {
       if (failReason.contains("Rejecting the attempt of a member using an older version")
-          || failReason.contains("15806")) {
+          || failReason.contains("15806")
+          || failReason.contains("ForcedDisconnectException")) {
         throw new SystemConnectException(failReason);
       } else if (failReason.contains("Failed to find credentials")) {
         throw new AuthenticationRequiredException(failReason);
@@ -1073,7 +1077,19 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
 
   private void forceDisconnect(String reason) {
     this.isStopping = true;
-    services.getManager().forceDisconnect(reason);
+    if (!isJoined) {
+      logger.fatal("BRUCE: forcedDisconnect invoked.  isReconnecting={} isJoined={}",
+          services.getConfig().isReconnecting(), isJoined);
+      joinResponse[0] =
+          new JoinResponseMessage(
+              "Stopping due to ForcedDisconnectException caused by '" + reason + "'", -1);
+      isJoined = false;
+      synchronized (joinResponse) {
+        joinResponse.notifyAll();
+      }
+    } else {
+      services.getManager().forceDisconnect(reason);
+    }
   }
 
   private void ackView(InstallViewMessage m) {
diff --git a/geode-core/src/main/java/org/apache/geode/internal/cache/GemFireCacheImpl.java b/geode-core/src/main/java/org/apache/geode/internal/cache/GemFireCacheImpl.java
index 2fa37ae..19f1e12 100755
--- a/geode-core/src/main/java/org/apache/geode/internal/cache/GemFireCacheImpl.java
+++ b/geode-core/src/main/java/org/apache/geode/internal/cache/GemFireCacheImpl.java
@@ -2142,6 +2142,9 @@ public class GemFireCacheImpl implements InternalCache, InternalClientCache, Has
     if (isClosed()) {
       return;
     }
+
+    // stopReconnecting();
+
     final boolean isDebugEnabled = logger.isDebugEnabled();
 
     synchronized (GemFireCacheImpl.class) {
diff --git a/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/VMProvider.java b/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/VMProvider.java
index de52d9e..9b50736 100644
--- a/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/VMProvider.java
+++ b/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/VMProvider.java
@@ -53,7 +53,7 @@ public abstract class VMProvider {
   }
 
   public void stop(boolean cleanWorkingDir) {
-    getVM().invoke(() -> {
+    getVM().invoke("stop cluster elements", () -> {
       // this did not clean up the files
       ClusterStartupRule.stopElementInsideVM();
       MemberStarterRule.disconnectDSIfAny();
@@ -66,19 +66,19 @@ public abstract class VMProvider {
   }
 
   public boolean isClient() {
-    return getVM().invoke(() -> {
+    return getVM().invoke("isClient", () -> {
       return ClusterStartupRule.clientCacheRule != null;
     });
   }
 
   public boolean isLocator() {
-    return getVM().invoke(() -> ClusterStartupRule.getLocator() != null);
+    return getVM().invoke("isLocator", () -> ClusterStartupRule.getLocator() != null);
   }
 
   // a server can be started without a cache server, so as long as this member has no locator,
   // it's deemed as a server
   public boolean isServer() {
-    return getVM().invoke(() -> ClusterStartupRule.getLocator() == null);
+    return getVM().invoke("isServer", () -> ClusterStartupRule.getLocator() == null);
   }
 
   public void invoke(final SerializableRunnableIF runnable) {