You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@brooklyn.apache.org by he...@apache.org on 2014/08/25 08:37:49 UTC
[05/10] git commit: fix HA split-brain test problem, and better logging for HA
Fix the HA split-brain test problem, and add better logging for HA.
Project: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/commit/c4ecdb27
Tree: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/tree/c4ecdb27
Diff: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/diff/c4ecdb27
Branch: refs/heads/master
Commit: c4ecdb271a7dfe3b656644f7be53348d252c796b
Parents: 9f439dd
Author: Alex Heneveld <al...@cloudsoftcorp.com>
Authored: Thu Aug 7 19:55:48 2014 -0400
Committer: Alex Heneveld <al...@cloudsoftcorp.com>
Committed: Mon Aug 25 07:23:51 2014 +0100
----------------------------------------------------------------------
.../ha/HighAvailabilityManagerImpl.java | 26 ++++++++---
.../HighAvailabilityManagerSplitBrainTest.java | 49 +++++++++++++-------
2 files changed, 51 insertions(+), 24 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/c4ecdb27/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java b/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java
index c205ebe..e92a171 100644
--- a/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java
+++ b/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java
@@ -142,8 +142,10 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
public HighAvailabilityManagerImpl setPollPeriod(Duration val) {
this.pollPeriod = checkNotNull(val, "pollPeriod");
- if (running && pollingTask != null) {
- pollingTask.cancel(true);
+ if (running) {
+ if (pollingTask!=null) {
+ pollingTask.cancel(true);
+ }
registerPollTask();
}
return this;
@@ -244,6 +246,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
@Override
public void stop() {
+ LOG.debug("Stopping "+this);
boolean wasRunning = running; // ensure idempotent
running = false;
@@ -294,6 +297,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
}
};
+ LOG.debug("Registering poll task for "+this+", period "+pollPeriod);
if (pollPeriod==null || pollPeriod.equals(Duration.PRACTICALLY_FOREVER)) {
// don't schedule - used for tests
// (scheduling fires off one initial task in the background before the delay,
@@ -369,7 +373,11 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
}
protected boolean isHeartbeatOk(ManagementNodeSyncRecord masterNode, ManagementNodeSyncRecord meNode) {
- if (masterNode==null || meNode==null) return false;
+ if (masterNode==null) return false;
+ if (meNode==null) {
+ // we can't confirm it's healthy, but it appears so as far as we can tell
+ return true;
+ }
Long timestampMaster = masterNode.getRemoteTimestamp();
Long timestampMe = meNode.getRemoteTimestamp();
if (timestampMaster==null || timestampMe==null) return false;
@@ -382,11 +390,12 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
String nodeId = memento.getMasterNodeId();
ManagementNodeSyncRecord masterMemento = (nodeId == null) ? null : memento.getManagementNodes().get(nodeId);
+ ManagementNodeSyncRecord ourMemento = memento.getManagementNodes().get(ownNodeId);
boolean result = masterMemento != null && masterMemento.getStatus() == ManagementNodeState.MASTER
- && isHeartbeatOk(masterMemento, memento.getManagementNodes().get(ownNodeId));
+ && isHeartbeatOk(masterMemento, ourMemento);
- if (LOG.isDebugEnabled()) LOG.debug("Healthy-master check result={}; masterId={}; memento=",
- new Object[] {result, nodeId, (masterMemento == null ? "<none>" : masterMemento.toVerboseString())});
+ if (LOG.isDebugEnabled()) LOG.debug("Healthy-master check result={}; masterId={}; masterMemento={}; ourMemento={}",
+ new Object[] {result, nodeId, (masterMemento == null ? "<none>" : masterMemento.toVerboseString()), (ourMemento == null ? "<none>" : ourMemento.toVerboseString())});
return (result ? masterMemento : null);
}
@@ -663,4 +672,9 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
.build();
}
}
+
+ @Override
+ public String toString() {
+ return super.toString()+"[node:"+ownNodeId+";running="+running+"]";
+ }
}
http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/c4ecdb27/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java b/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java
index ddf6b73..a11e57f 100644
--- a/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java
+++ b/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java
@@ -48,10 +48,12 @@ import brooklyn.test.entity.LocalManagementContextForTests;
import brooklyn.test.entity.TestApplication;
import brooklyn.util.collections.MutableList;
import brooklyn.util.collections.MutableMap;
+import brooklyn.util.exceptions.Exceptions;
import brooklyn.util.repeat.Repeater;
import brooklyn.util.time.Duration;
import brooklyn.util.time.Time;
+import com.google.common.base.Stopwatch;
import com.google.common.base.Ticker;
import com.google.common.collect.ImmutableList;
@@ -303,31 +305,42 @@ public class HighAvailabilityManagerSplitBrainTest {
Thread t = new Thread() { public void run() {
if (staggerStart!=null) Time.sleep(staggerStart.multiply(Math.random()));
n.ha.start(HighAvailabilityMode.AUTO);
+ n.ha.setPollPeriod(Duration.millis(20));
} };
spawned.add(t);
t.start();
}
- Assert.assertTrue(Repeater.create().every(Duration.millis(1)).limitTimeTo(Duration.THIRTY_SECONDS).until(new Callable<Boolean>() {
- @Override public Boolean call() throws Exception {
- ManagementPlaneSyncRecord memento = nodes.get(0).ha.getManagementPlaneSyncState();
- int masters=0, standbys=0, savedMasters=0, savedStandbys=0;
- for (HaMgmtNode n: nodes) {
- if (n.ha.getNodeState()==ManagementNodeState.MASTER) masters++;
- if (n.ha.getNodeState()==ManagementNodeState.STANDBY) standbys++;
- ManagementNodeSyncRecord m = memento.getManagementNodes().get(n.ownNodeId);
- if (m!=null) {
- if (m.getStatus()==ManagementNodeState.MASTER) savedMasters++;
- if (m.getStatus()==ManagementNodeState.STANDBY) savedStandbys++;
+ try {
+ final Stopwatch timer = Stopwatch.createStarted();
+ Assert.assertTrue(Repeater.create().backoff(Duration.millis(1), 1.2, Duration.millis(50)).limitTimeTo(Duration.THIRTY_SECONDS).until(new Callable<Boolean>() {
+ @Override public Boolean call() throws Exception {
+ ManagementPlaneSyncRecord memento = nodes.get(0).ha.getManagementPlaneSyncState();
+ int masters=0, standbys=0, savedMasters=0, savedStandbys=0;
+ for (HaMgmtNode n: nodes) {
+ if (n.ha.getNodeState()==ManagementNodeState.MASTER) masters++;
+ if (n.ha.getNodeState()==ManagementNodeState.STANDBY) standbys++;
+ ManagementNodeSyncRecord m = memento.getManagementNodes().get(n.ownNodeId);
+ if (m!=null) {
+ if (m.getStatus()==ManagementNodeState.MASTER) savedMasters++;
+ if (m.getStatus()==ManagementNodeState.STANDBY) savedStandbys++;
+ }
}
- }
- log.info("starting "+nodes.size()+" nodes: "+masters+" M + "+standbys+" zzz; "
- + memento.getManagementNodes().size()+" saved, "
+ log.info("while starting "+nodes.size()+" nodes: "+masters+" M + "+standbys+" zzz; "
+ + memento.getManagementNodes().size()+" saved, "
+ memento.getMasterNodeId()+" master, "+savedMasters+" M + "+savedStandbys+" zzz");
-
- return masters==1 && standbys==nodes.size()-1 && savedMasters==1 && savedStandbys==nodes.size()-1;
- }
- }).run());
+
+ if (timer.isRunning() && Duration.of(timer).compareTo(Duration.TEN_SECONDS)>0) {
+ log.warn("we seem to have a problem stabilizing"); //handy place to set a suspend-VM breakpoint!
+ timer.stop();
+ }
+ return masters==1 && standbys==nodes.size()-1 && savedMasters==1 && savedStandbys==nodes.size()-1;
+ }
+ }).run());
+ } catch (Throwable t) {
+ log.warn("Failed to stabilize (rethrowing): "+t, t);
+ throw Exceptions.propagate(t);
+ }
for (Thread t: spawned)
t.join(Duration.THIRTY_SECONDS.toMilliseconds());