You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2019/02/26 22:01:57 UTC
[geode] branch feature/GEODE-6451 updated: fixes for hung dunit
runs The unicast receiver thread was becoming blocked if a forced
disconnect occurred during reconnect because
InternalDistributedSystem.disconnect sychronizes, for some reason,
on GemFireCacheImpl.class. This reworks that logic to have the reconnect
thread get a SystemConnectException forcing cleanup of the reconnecting
InternalDistributedSystem in that thread.
This is an automated email from the ASF dual-hosted git repository.
bschuchardt pushed a commit to branch feature/GEODE-6451
in repository https://gitbox.apache.org/repos/asf/geode.git
The following commit(s) were added to refs/heads/feature/GEODE-6451 by this push:
new 771969e fixes for hung dunit runs The unicast receiver thread was becoming blocked if a forced disconnect occurred during reconnect because InternalDistributedSystem.disconnect sychronizes, for some reason, on GemFireCacheImpl.class. This reworks that logic to have the reconnect thread get a SystemConnectException forcing cleanup of the reconnecting InternalDistributedSystem in that thread.
771969e is described below
commit 771969ea3d60e3522783c5a55b0efb42107d2d3d
Author: Bruce Schuchardt <bs...@pivotal.io>
AuthorDate: Tue Feb 26 14:01:14 2019 -0800
fixes for hung dunit runs
The unicast receiver thread was becoming blocked if a
forced disconnect occurred during reconnect because
InternalDistributedSystem.disconnect sychronizes, for some
reason, on GemFireCacheImpl.class. This reworks that
logic to have the reconnect thread get a
SystemConnectException forcing cleanup of the reconnecting
InternalDistributedSystem in that thread.
---
.../ClusterConfigLocatorRestartDUnitTest.java | 2 ++
.../internal/InternalDistributedSystem.java | 2 ++
.../distributed/internal/InternalLocator.java | 8 +++++++-
.../internal/membership/gms/ServiceConfig.java | 5 ++++-
.../internal/membership/gms/Services.java | 1 +
.../membership/gms/membership/GMSJoinLeave.java | 22 +++++++++++++++++++---
.../geode/internal/cache/GemFireCacheImpl.java | 3 +++
.../apache/geode/test/junit/rules/VMProvider.java | 8 ++++----
8 files changed, 42 insertions(+), 9 deletions(-)
diff --git a/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java b/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
index d750969..fe5fb75 100644
--- a/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
+++ b/geode-core/src/distributedTest/java/org/apache/geode/management/internal/configuration/ClusterConfigLocatorRestartDUnitTest.java
@@ -55,6 +55,8 @@ public class ClusterConfigLocatorRestartDUnitTest {
@Test
public void serverRestartsAfterLocatorReconnects() throws Exception {
IgnoredException.addIgnoredException("org.apache.geode.ForcedDisconnectException: for testing");
+ IgnoredException.addIgnoredException("cluster configuration service not available");
+ IgnoredException.addIgnoredException("This thread has been stalled");
MemberVM locator0 = rule.startLocatorVM(0);
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
index 4011669..cb4a36f 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalDistributedSystem.java
@@ -2354,6 +2354,8 @@ public class InternalDistributedSystem extends DistributedSystem
@Override
public boolean isReconnecting() {
InternalDistributedSystem rds = this.reconnectDS;
+ logger.info("BRUCE: attemptingToReconnect={} reconnectCancelled={} rds={} isconnected={}",
+ attemptingToReconnect, reconnectCancelled, rds, (rds == null ? "" : rds.isConnected()));
if (!attemptingToReconnect) {
return false;
}
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
index 28bd0bd..75200bf 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/InternalLocator.java
@@ -1032,7 +1032,13 @@ public class InternalLocator extends Locator implements ConnectListener, LogConf
setLocator(this);
}
}
- ds.waitUntilReconnected(waitTime, TimeUnit.MILLISECONDS);
+ try {
+ ds.waitUntilReconnected(waitTime, TimeUnit.MILLISECONDS);
+ } catch (CancelException e) {
+ logger.info("Attempt to reconnect failed and further attempts have been terminated");
+ this.stoppedForReconnect = false;
+ return false;
+ }
}
InternalDistributedSystem newSystem = (InternalDistributedSystem) ds.getReconnectedSystem();
if (newSystem != null) {
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/ServiceConfig.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/ServiceConfig.java
index faf8ebc..0fe55f0 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/ServiceConfig.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/ServiceConfig.java
@@ -35,7 +35,7 @@ public class ServiceConfig {
private final int[] membershipPortRange;
private final long memberTimeout;
- private final boolean isReconnecting;
+ private boolean isReconnecting;
private Integer lossThreshold;
private final Integer memberWeight;
private boolean networkPartitionDetectionEnabled;
@@ -157,4 +157,7 @@ public class ServiceConfig {
networkPartitionDetectionEnabled = theConfig.getEnableNetworkPartitionDetection();
}
+ public void setIsReconnecting(boolean b) {
+ this.isReconnecting = false;
+ }
}
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/Services.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/Services.java
index e8bc0b9..7d403ab 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/Services.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/Services.java
@@ -225,6 +225,7 @@ public class Services {
}
logger.info("Stopping membership services");
this.stopping = true;
+ config.setIsReconnecting(false);
try {
this.timer.cancel();
} finally {
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index 8d6751e..a816e78 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -344,6 +344,9 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
if (attemptToJoin()) {
return true;
}
+ if (this.isStopping) {
+ break;
+ }
if (!state.possibleCoordinator.equals(localAddress)) {
state.alreadyTried.add(state.possibleCoordinator);
}
@@ -447,12 +450,13 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
return isJoined;
}
- logger.debug("received join response {}", response);
+ logger.info("received join response {}", response);
joinResponse[0] = null;
String failReason = response.getRejectionMessage();
if (failReason != null) {
if (failReason.contains("Rejecting the attempt of a member using an older version")
- || failReason.contains("15806")) {
+ || failReason.contains("15806")
+ || failReason.contains("ForcedDisconnectException")) {
throw new SystemConnectException(failReason);
} else if (failReason.contains("Failed to find credentials")) {
throw new AuthenticationRequiredException(failReason);
@@ -1073,7 +1077,19 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
private void forceDisconnect(String reason) {
this.isStopping = true;
- services.getManager().forceDisconnect(reason);
+ if (!isJoined) {
+ logger.fatal("BRUCE: forcedDisconnect invoked. isReconnecting={} isJoined={}",
+ services.getConfig().isReconnecting(), isJoined);
+ joinResponse[0] =
+ new JoinResponseMessage(
+ "Stopping due to ForcedDisconnectException caused by '" + reason + "'", -1);
+ isJoined = false;
+ synchronized (joinResponse) {
+ joinResponse.notifyAll();
+ }
+ } else {
+ services.getManager().forceDisconnect(reason);
+ }
}
private void ackView(InstallViewMessage m) {
diff --git a/geode-core/src/main/java/org/apache/geode/internal/cache/GemFireCacheImpl.java b/geode-core/src/main/java/org/apache/geode/internal/cache/GemFireCacheImpl.java
index 2fa37ae..19f1e12 100755
--- a/geode-core/src/main/java/org/apache/geode/internal/cache/GemFireCacheImpl.java
+++ b/geode-core/src/main/java/org/apache/geode/internal/cache/GemFireCacheImpl.java
@@ -2142,6 +2142,9 @@ public class GemFireCacheImpl implements InternalCache, InternalClientCache, Has
if (isClosed()) {
return;
}
+
+ // stopReconnecting();
+
final boolean isDebugEnabled = logger.isDebugEnabled();
synchronized (GemFireCacheImpl.class) {
diff --git a/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/VMProvider.java b/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/VMProvider.java
index de52d9e..9b50736 100644
--- a/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/VMProvider.java
+++ b/geode-dunit/src/main/java/org/apache/geode/test/junit/rules/VMProvider.java
@@ -53,7 +53,7 @@ public abstract class VMProvider {
}
public void stop(boolean cleanWorkingDir) {
- getVM().invoke(() -> {
+ getVM().invoke("stop cluster elements", () -> {
// this did not clean up the files
ClusterStartupRule.stopElementInsideVM();
MemberStarterRule.disconnectDSIfAny();
@@ -66,19 +66,19 @@ public abstract class VMProvider {
}
public boolean isClient() {
- return getVM().invoke(() -> {
+ return getVM().invoke("isClient", () -> {
return ClusterStartupRule.clientCacheRule != null;
});
}
public boolean isLocator() {
- return getVM().invoke(() -> ClusterStartupRule.getLocator() != null);
+ return getVM().invoke("isLocator", () -> ClusterStartupRule.getLocator() != null);
}
// a server can be started without a cache server, so as long as this member has no locator,
// it's deemed as a server
public boolean isServer() {
- return getVM().invoke(() -> ClusterStartupRule.getLocator() == null);
+ return getVM().invoke("isServer", () -> ClusterStartupRule.getLocator() == null);
}
public void invoke(final SerializableRunnableIF runnable) {