You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2015/11/20 22:01:58 UTC
[04/50] [abbrv] incubator-geode git commit: GEODE-77: deadlock in
GMSJoinLeave
GEODE-77: deadlock in GMSJoinLeave
This removes the stateLock read-write lock in favor of using a
sync on viewInstallationLock, eliminating the possibility of
inversion between the two.
Project: http://git-wip-us.apache.org/repos/asf/incubator-geode/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-geode/commit/f3b1f1b1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-geode/tree/f3b1f1b1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-geode/diff/f3b1f1b1
Branch: refs/heads/develop
Commit: f3b1f1b17f69dc9d21a846971fbe26d813e9df0a
Parents: 0f7daf2
Author: Bruce Schuchardt <bs...@pivotal.io>
Authored: Tue Oct 20 11:26:26 2015 -0700
Committer: Bruce Schuchardt <bs...@pivotal.io>
Committed: Tue Oct 20 11:26:26 2015 -0700
----------------------------------------------------------------------
.../membership/gms/fd/GMSHealthMonitor.java | 5 +
.../membership/gms/membership/GMSJoinLeave.java | 159 +++++++++++--------
.../gms/messenger/JGroupsMessenger.java | 22 ++-
.../gms/mgr/GMSMembershipManager.java | 3 +-
.../gms/membership/GMSJoinLeaveJUnitTest.java | 8 +-
.../management/ClientHealthStatsDUnitTest.java | 1 +
6 files changed, 118 insertions(+), 80 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/f3b1f1b1/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
----------------------------------------------------------------------
diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index 9d87014..6f7cb6b 100755
--- a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++ b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -284,6 +284,9 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
* @return
*/
private boolean doCheckMember(InternalDistributedMember pingMember) {
+ if (playingDead) {
+ return true;
+ }
//TODO: need to some tcp check
logger.trace("Checking member {}", pingMember);
final CheckRequestMessage prm = constructCheckRequestMessage(pingMember);
@@ -543,10 +546,12 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
@Override
public void beSick() {
this.beingSick = true;
+ sendSuspectMessage(localAddress, "beSick invoked on GMSHealthMonitor");
}
@Override
public void playDead() {
+ sendSuspectMessage(localAddress, "playDead invoked on GMSHealthMonitor");
this.playingDead = true;
}
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/f3b1f1b1/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeave.java
----------------------------------------------------------------------
diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeave.java b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index 4ebc20c..3f85cda 100755
--- a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++ b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -103,16 +103,13 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
/** have I connected to the distributed system? */
private volatile boolean isJoined;
- /** a lock governing GMS state */
- private ReadWriteLock stateLock = new ReentrantReadWriteLock();
-
- /** guarded by stateLock */
+ /** guarded by viewInstallationLock */
private boolean isCoordinator;
/** a synch object that guards view installation */
private final Object viewInstallationLock = new Object();
- /** the currently installed view */
+ /** the currently installed view. Guarded by viewInstallationLock */
private volatile NetView currentView;
/** the previous view **/
@@ -196,7 +193,9 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
try {
if (Boolean.getBoolean(BYPASS_DISCOVERY)) {
- becomeCoordinator();
+ synchronized(viewInstallationLock) {
+ becomeCoordinator();
+ }
return true;
}
@@ -216,7 +215,9 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
if (localAddress.getNetMember().preferredForCoordinator()
&& state.possibleCoordinator.equals(this.localAddress)) {
if (tries > 2 || System.currentTimeMillis() < giveupTime ) {
- becomeCoordinator();
+ synchronized(viewInstallationLock) {
+ becomeCoordinator();
+ }
return true;
}
} else {
@@ -315,8 +316,10 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
if (response.getCurrentView() != null) {
if (response.getBecomeCoordinator()) {
logger.info("I am being told to become the membership coordinator by {}", coord);
- this.currentView = response.getCurrentView();
- becomeCoordinator(null);
+ synchronized(viewInstallationLock) {
+ this.currentView = response.getCurrentView();
+ becomeCoordinator(null);
+ }
} else {
this.birthViewId = response.getMemberID().getVmViewId();
this.localAddress.setVmViewId(this.birthViewId);
@@ -420,7 +423,9 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
check.addCrashedMembers(removedMembers);
}
if (check.getCoordinator().equals(localAddress)) {
- becomeCoordinator(incomingRequest.getMemberID());
+ synchronized(viewInstallationLock) {
+ becomeCoordinator(incomingRequest.getMemberID());
+ }
}
}
else {
@@ -475,7 +480,9 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
check.removeAll(removedMembers);
}
if (check.getCoordinator().equals(localAddress)) {
- becomeCoordinator(mbr);
+ synchronized(viewInstallationLock) {
+ becomeCoordinator(mbr);
+ }
}
}
else {
@@ -515,65 +522,75 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
becomeCoordinator(null);
}
+
+ public void becomeCoordinatorForTest() {
+ synchronized(viewInstallationLock) {
+ becomeCoordinator();
+ }
+ }
+
/**
+ * Transitions this member into the coordinator role. This must
+ * be invoked under a synch on viewInstallationLock that was held
+ * at the time the decision was made to become coordinator so that
+ * the decision is atomic with actually becoming coordinator.
* @param oldCoordinator may be null
*/
private void becomeCoordinator(InternalDistributedMember oldCoordinator) {
boolean testing = unitTesting.contains("noRandomViewChange");
- stateLock.writeLock().lock();
- try {
- if (isCoordinator) {
- return;
+
+ assert Thread.holdsLock(viewInstallationLock);
+
+ if (isCoordinator) {
+ return;
+ }
+
+ logger.info("This member is becoming the membership coordinator with address {}", localAddress);
+ isCoordinator = true;
+ if (currentView == null) {
+ // create the initial membership view
+ NetView newView = new NetView(this.localAddress);
+ this.localAddress.setVmViewId(0);
+ installView(newView);
+ isJoined = true;
+ if (viewCreator == null || viewCreator.isShutdown()) {
+ viewCreator = new ViewCreator("GemFire Membership View Creator", Services.getThreadGroup());
+ viewCreator.setDaemon(true);
+ viewCreator.start();
}
- logger.info("This member is becoming the membership coordinator with address {}", localAddress);
- isCoordinator = true;
- if (currentView == null) {
- // create the initial membership view
- NetView newView = new NetView(this.localAddress);
- this.localAddress.setVmViewId(0);
- installView(newView);
- isJoined = true;
- if (viewCreator == null || viewCreator.isShutdown()) {
- viewCreator = new ViewCreator("GemFire Membership View Creator", Services.getThreadGroup());
- viewCreator.setDaemon(true);
- viewCreator.start();
+ } else {
+ // create and send out a new view
+ NetView newView;
+ Set<InternalDistributedMember> leaving = new HashSet<>();
+ Set<InternalDistributedMember> removals;
+ synchronized(viewInstallationLock) {
+ int rand = testing? 0 : NetView.RANDOM.nextInt(10);
+ int viewNumber = currentView.getViewId() + 5 + rand;
+ if (this.localAddress.getVmViewId() < 0) {
+ this.localAddress.setVmViewId(viewNumber);
}
- } else {
- // create and send out a new view
- NetView newView;
- Set<InternalDistributedMember> leaving = new HashSet<>();
- Set<InternalDistributedMember> removals;
- synchronized(viewInstallationLock) {
- int rand = testing? 0 : NetView.RANDOM.nextInt(10);
- int viewNumber = currentView.getViewId() + 5 + rand;
- if (this.localAddress.getVmViewId() < 0) {
- this.localAddress.setVmViewId(viewNumber);
- }
- List<InternalDistributedMember> mbrs = new ArrayList<>(currentView.getMembers());
- if (!mbrs.contains(localAddress)) {
- mbrs.add(localAddress);
- }
- synchronized(this.removedMembers) {
- removals = new HashSet<>(this.removedMembers);
- }
- if (oldCoordinator != null && !removals.contains(oldCoordinator)) {
- leaving.add(oldCoordinator);
- }
- mbrs.removeAll(removals);
- mbrs.removeAll(leaving);
- newView = new NetView(this.localAddress, viewNumber, mbrs, leaving,
- removals);
+ List<InternalDistributedMember> mbrs = new ArrayList<>(currentView.getMembers());
+ if (!mbrs.contains(localAddress)) {
+ mbrs.add(localAddress);
+ }
+ synchronized(this.removedMembers) {
+ removals = new HashSet<>(this.removedMembers);
}
- if (viewCreator == null || viewCreator.isShutdown()) {
- viewCreator = new ViewCreator("GemFire Membership View Creator", Services.getThreadGroup());
- viewCreator.setInitialView(newView, leaving, removals);
- viewCreator.setDaemon(true);
- viewCreator.start();
+ if (oldCoordinator != null && !removals.contains(oldCoordinator)) {
+ leaving.add(oldCoordinator);
}
+ mbrs.removeAll(removals);
+ mbrs.removeAll(leaving);
+ newView = new NetView(this.localAddress, viewNumber, mbrs, leaving,
+ removals);
+ }
+ if (viewCreator == null || viewCreator.isShutdown()) {
+ viewCreator = new ViewCreator("GemFire Membership View Creator", Services.getThreadGroup());
+ viewCreator.setInitialView(newView, leaving, removals);
+ viewCreator.setDaemon(true);
+ viewCreator.start();
}
- } finally {
- stateLock.writeLock().unlock();
}
}
@@ -1016,13 +1033,8 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
becomeCoordinator();
} else if (this.isCoordinator) {
// stop being coordinator
- stateLock.writeLock().lock();
- try {
- stopCoordinatorServices();
- this.isCoordinator = false;
- } finally {
- stateLock.writeLock().unlock();
- }
+ stopCoordinatorServices();
+ this.isCoordinator = false;
}
}
if (!this.isCoordinator) {
@@ -1899,15 +1911,24 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
try {
List<Future<InternalDistributedMember>> futures;
futures = svc.invokeAll(checkers);
-
+ long giveUpTime = System.currentTimeMillis() + viewAckTimeout;
for (Future<InternalDistributedMember> future: futures) {
+ long now = System.currentTimeMillis();
try {
- InternalDistributedMember mbr = future.get(viewAckTimeout, TimeUnit.MILLISECONDS);
+ InternalDistributedMember mbr = null;
+ long timeToWait = giveUpTime - now;
+ if (timeToWait <= 0) {
+ // TODO if timeToWait==0 is future.get() guaranteed to return immediately?
+ // It looks like some code paths invoke Object.wait(0), which waits forever.
+ timeToWait = 1;
+ }
+ mbr = future.get(timeToWait, TimeUnit.MILLISECONDS);
if (mbr != null) {
mbrs.remove(mbr);
}
} catch (java.util.concurrent.TimeoutException e) {
- // TODO should the member be removed if we can't verify it in time?
+ // timeout - member didn't pass the final check and will not be removed
+ // from the collection of members
} catch (ExecutionException e) {
logger.info("unexpected exception caught during member verification", e);
}
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/f3b1f1b1/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
----------------------------------------------------------------------
diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/messenger/JGroupsMessenger.java b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
index 7f6a40e..2afc1f8 100755
--- a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
+++ b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
@@ -2,6 +2,7 @@ package com.gemstone.gemfire.distributed.internal.membership.gms.messenger;
import static com.gemstone.gemfire.distributed.internal.membership.gms.GMSUtil.replaceStrings;
import static com.gemstone.gemfire.internal.DataSerializableFixedID.JOIN_RESPONSE;
+import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
@@ -63,7 +64,6 @@ import com.gemstone.gemfire.distributed.internal.membership.gms.Services;
import com.gemstone.gemfire.distributed.internal.membership.gms.interfaces.MessageHandler;
import com.gemstone.gemfire.distributed.internal.membership.gms.interfaces.Messenger;
import com.gemstone.gemfire.distributed.internal.membership.gms.messages.JoinResponseMessage;
-import com.gemstone.gemfire.distributed.internal.membership.gms.mgr.GMSMembershipManager;
import com.gemstone.gemfire.internal.ClassPathLoader;
import com.gemstone.gemfire.internal.HeapDataOutputStream;
import com.gemstone.gemfire.internal.OSProcess;
@@ -77,8 +77,6 @@ import com.gemstone.gemfire.internal.i18n.LocalizedStrings;
import com.gemstone.gemfire.internal.logging.log4j.LocalizedMessage;
import com.gemstone.gemfire.internal.tcp.MemberShunnedException;
-import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap;
-
public class JGroupsMessenger implements Messenger {
private static final Logger logger = Services.getLogger();
@@ -124,9 +122,9 @@ public class JGroupsMessenger implements Messenger {
private GMSPingPonger pingPonger = new GMSPingPonger();
- private volatile long pingsReceived;
-
private volatile long pongsReceived;
+
+ private boolean playingDead;
static {
@@ -431,10 +429,12 @@ public class JGroupsMessenger implements Messenger {
@Override
public void playDead() {
+ playingDead = true;
}
@Override
public void beHealthy() {
+ playingDead = false;
}
@Override
@@ -477,6 +477,17 @@ public class JGroupsMessenger implements Messenger {
throw new DistributedSystemDisconnectedException("Distributed System is shutting down");
}
+ if (playingDead) {
+ Set result = new HashSet<>();
+ InternalDistributedMember[] rec = msg.getRecipients();
+ if (rec != null) {
+ for (int i=0; i<rec.length; i++) {
+ result.add(rec[i]);
+ }
+ }
+ return result;
+ }
+
filterOutgoingMessage(msg);
InternalDistributedMember[] destinations = msg.getRecipients();
@@ -877,7 +888,6 @@ public class JGroupsMessenger implements Messenger {
//Respond to ping messages sent from other systems that are in a auto reconnect state
byte[] contents = jgmsg.getBuffer();
if (pingPonger.isPingMessage(contents)) {
- pingsReceived++;
try {
pingPonger.sendPongMessage(myChannel, jgAddress, jgmsg.getSrc());
}
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/f3b1f1b1/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/mgr/GMSMembershipManager.java
----------------------------------------------------------------------
diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/mgr/GMSMembershipManager.java b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/mgr/GMSMembershipManager.java
index 50dc99e..a102ac2 100755
--- a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/mgr/GMSMembershipManager.java
+++ b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/mgr/GMSMembershipManager.java
@@ -2894,6 +2894,8 @@ public class GMSMembershipManager implements MembershipManager, Manager
services.setShutdownCause(shutdownCause);
services.getCancelCriterion().cancel(reason);
+ AlertAppender.getInstance().shuttingDown();
+
if (!inhibitForceDisconnectLogging) {
logger.fatal(LocalizedMessage.create(
LocalizedStrings.GroupMembershipService_MEMBERSHIP_SERVICE_FAILURE_0, reason), shutdownCause);
@@ -2903,7 +2905,6 @@ public class GMSMembershipManager implements MembershipManager, Manager
saveCacheXmlForReconnect();
}
- AlertAppender.getInstance().shuttingDown();
Thread reconnectThread = new Thread (new Runnable() {
public void run() {
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/f3b1f1b1/gemfire-core/src/test/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
----------------------------------------------------------------------
diff --git a/gemfire-core/src/test/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java b/gemfire-core/src/test/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
index d29a6dd..42594cf 100644
--- a/gemfire-core/src/test/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
+++ b/gemfire-core/src/test/java/com/gemstone/gemfire/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
@@ -292,7 +292,7 @@ public class GMSJoinLeaveJUnitTest {
gmsJoinLeave.unitTesting.add("noRandomViewChange");
prepareAndInstallView();
gmsJoinLeave.getView().add(gmsJoinLeaveMemberId);
- gmsJoinLeave.becomeCoordinator();
+ gmsJoinLeave.becomeCoordinatorForTest();
LeaveRequestMessage msg = new LeaveRequestMessage(gmsJoinLeave.getMemberID(), mockMembers[0], reason);
msg.setSender(mockMembers[0]);
@@ -318,7 +318,7 @@ public class GMSJoinLeaveJUnitTest {
gmsJoinLeave.getView().add(gmsJoinLeaveMemberId);
gmsJoinLeave.getView().add(mockMembers[1]);
gmsJoinLeave.unitTesting.add("noRandomViewChange");
- gmsJoinLeave.becomeCoordinator();
+ gmsJoinLeave.becomeCoordinatorForTest();
RemoveMemberMessage msg = new RemoveMemberMessage(gmsJoinLeave.getMemberID(), mockMembers[0], reason);
msg.setSender(mockMembers[0]);
gmsJoinLeave.processMessage(msg);
@@ -345,7 +345,7 @@ public class GMSJoinLeaveJUnitTest {
prepareAndInstallView();
gmsJoinLeave.getView().add(gmsJoinLeaveMemberId);
gmsJoinLeave.getView().add(mockMembers[1]);
- gmsJoinLeave.becomeCoordinator();
+ gmsJoinLeave.becomeCoordinatorForTest();
JoinRequestMessage msg = new JoinRequestMessage(gmsJoinLeaveMemberId, mockMembers[2], null);
msg.setSender(mockMembers[2]);
gmsJoinLeave.processMessage(msg);
@@ -473,7 +473,7 @@ public class GMSJoinLeaveJUnitTest {
NetView oldView = gmsJoinLeave.getView();
oldView.add(gmsJoinLeaveMemberId);
InternalDistributedMember creator = oldView.getCreator();
- gmsJoinLeave.becomeCoordinator();
+ gmsJoinLeave.becomeCoordinatorForTest();
NetView view = new NetView(2, gmsJoinLeave.getView().getViewId()+1);
view.setCreator(creator);
view.add(creator);
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/f3b1f1b1/gemfire-core/src/test/java/com/gemstone/gemfire/management/ClientHealthStatsDUnitTest.java
----------------------------------------------------------------------
diff --git a/gemfire-core/src/test/java/com/gemstone/gemfire/management/ClientHealthStatsDUnitTest.java b/gemfire-core/src/test/java/com/gemstone/gemfire/management/ClientHealthStatsDUnitTest.java
index 891cb60..ab9d8ba 100644
--- a/gemfire-core/src/test/java/com/gemstone/gemfire/management/ClientHealthStatsDUnitTest.java
+++ b/gemfire-core/src/test/java/com/gemstone/gemfire/management/ClientHealthStatsDUnitTest.java
@@ -94,6 +94,7 @@ public class ClientHealthStatsDUnitTest extends DistributedTestCase {
server = host.getVM(1);
client = host.getVM(2);
client2 = host.getVM(3);
+ addExpectedException("Connection reset");
}
public void tearDown2() throws Exception {