You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@brooklyn.apache.org by he...@apache.org on 2014/08/25 08:37:49 UTC

[05/10] git commit: fix ha split brain test problem, and better logging for ha

fix ha split brain test problem, and better logging for ha


Project: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/commit/c4ecdb27
Tree: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/tree/c4ecdb27
Diff: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/diff/c4ecdb27

Branch: refs/heads/master
Commit: c4ecdb271a7dfe3b656644f7be53348d252c796b
Parents: 9f439dd
Author: Alex Heneveld <al...@cloudsoftcorp.com>
Authored: Thu Aug 7 19:55:48 2014 -0400
Committer: Alex Heneveld <al...@cloudsoftcorp.com>
Committed: Mon Aug 25 07:23:51 2014 +0100

----------------------------------------------------------------------
 .../ha/HighAvailabilityManagerImpl.java         | 26 ++++++++---
 .../HighAvailabilityManagerSplitBrainTest.java  | 49 +++++++++++++-------
 2 files changed, 51 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/c4ecdb27/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java b/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java
index c205ebe..e92a171 100644
--- a/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java
+++ b/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java
@@ -142,8 +142,10 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
     
     public HighAvailabilityManagerImpl setPollPeriod(Duration val) {
         this.pollPeriod = checkNotNull(val, "pollPeriod");
-        if (running && pollingTask != null) {
-            pollingTask.cancel(true);
+        if (running) {
+            if (pollingTask!=null) {
+                pollingTask.cancel(true);
+            }
             registerPollTask();
         }
         return this;
@@ -244,6 +246,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
 
     @Override
     public void stop() {
+        LOG.debug("Stopping "+this);
         boolean wasRunning = running; // ensure idempotent
         
         running = false;
@@ -294,6 +297,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
             }
         };
         
+        LOG.debug("Registering poll task for "+this+", period "+pollPeriod);
         if (pollPeriod==null || pollPeriod.equals(Duration.PRACTICALLY_FOREVER)) {
             // don't schedule - used for tests
             // (scheduling fires off one initial task in the background before the delay, 
@@ -369,7 +373,11 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
     }
     
     protected boolean isHeartbeatOk(ManagementNodeSyncRecord masterNode, ManagementNodeSyncRecord meNode) {
-        if (masterNode==null || meNode==null) return false;
+        if (masterNode==null) return false;
+        if (meNode==null) {
+            // our own record is missing, so we cannot compare timestamps; nothing indicates the master is unhealthy, so treat it as ok
+            return true;
+        }
         Long timestampMaster = masterNode.getRemoteTimestamp();
         Long timestampMe = meNode.getRemoteTimestamp();
         if (timestampMaster==null || timestampMe==null) return false;
@@ -382,11 +390,12 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
         String nodeId = memento.getMasterNodeId();
         ManagementNodeSyncRecord masterMemento = (nodeId == null) ? null : memento.getManagementNodes().get(nodeId);
         
+        ManagementNodeSyncRecord ourMemento = memento.getManagementNodes().get(ownNodeId);
         boolean result = masterMemento != null && masterMemento.getStatus() == ManagementNodeState.MASTER
-                && isHeartbeatOk(masterMemento, memento.getManagementNodes().get(ownNodeId));
+                && isHeartbeatOk(masterMemento, ourMemento);
         
-        if (LOG.isDebugEnabled()) LOG.debug("Healthy-master check result={}; masterId={}; memento=",
-                new Object[] {result, nodeId, (masterMemento == null ? "<none>" : masterMemento.toVerboseString())});
+        if (LOG.isDebugEnabled()) LOG.debug("Healthy-master check result={}; masterId={}; masterMemento={}; ourMemento={}",
+                new Object[] {result, nodeId, (masterMemento == null ? "<none>" : masterMemento.toVerboseString()), (ourMemento == null ? "<none>" : ourMemento.toVerboseString())});
         
         return (result ? masterMemento : null);
     }
@@ -663,4 +672,9 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
                     .build();
         }
     }
+    
+    @Override
+    public String toString() {
+        return super.toString()+"[node:"+ownNodeId+";running="+running+"]";
+    }
 }

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/c4ecdb27/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java b/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java
index ddf6b73..a11e57f 100644
--- a/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java
+++ b/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java
@@ -48,10 +48,12 @@ import brooklyn.test.entity.LocalManagementContextForTests;
 import brooklyn.test.entity.TestApplication;
 import brooklyn.util.collections.MutableList;
 import brooklyn.util.collections.MutableMap;
+import brooklyn.util.exceptions.Exceptions;
 import brooklyn.util.repeat.Repeater;
 import brooklyn.util.time.Duration;
 import brooklyn.util.time.Time;
 
+import com.google.common.base.Stopwatch;
 import com.google.common.base.Ticker;
 import com.google.common.collect.ImmutableList;
 
@@ -303,31 +305,42 @@ public class HighAvailabilityManagerSplitBrainTest {
             Thread t = new Thread() { public void run() {
                 if (staggerStart!=null) Time.sleep(staggerStart.multiply(Math.random()));
                 n.ha.start(HighAvailabilityMode.AUTO);
+                n.ha.setPollPeriod(Duration.millis(20));
             } };
             spawned.add(t);
             t.start();
         }
 
-        Assert.assertTrue(Repeater.create().every(Duration.millis(1)).limitTimeTo(Duration.THIRTY_SECONDS).until(new Callable<Boolean>() {
-            @Override public Boolean call() throws Exception {
-                ManagementPlaneSyncRecord memento = nodes.get(0).ha.getManagementPlaneSyncState();
-                int masters=0, standbys=0, savedMasters=0, savedStandbys=0;
-                for (HaMgmtNode n: nodes) {
-                    if (n.ha.getNodeState()==ManagementNodeState.MASTER) masters++;
-                    if (n.ha.getNodeState()==ManagementNodeState.STANDBY) standbys++;
-                    ManagementNodeSyncRecord m = memento.getManagementNodes().get(n.ownNodeId);
-                    if (m!=null) {
-                        if (m.getStatus()==ManagementNodeState.MASTER) savedMasters++;
-                        if (m.getStatus()==ManagementNodeState.STANDBY) savedStandbys++;
+        try {
+            final Stopwatch timer = Stopwatch.createStarted();
+            Assert.assertTrue(Repeater.create().backoff(Duration.millis(1), 1.2, Duration.millis(50)).limitTimeTo(Duration.THIRTY_SECONDS).until(new Callable<Boolean>() {
+                @Override public Boolean call() throws Exception {
+                    ManagementPlaneSyncRecord memento = nodes.get(0).ha.getManagementPlaneSyncState();
+                    int masters=0, standbys=0, savedMasters=0, savedStandbys=0;
+                    for (HaMgmtNode n: nodes) {
+                        if (n.ha.getNodeState()==ManagementNodeState.MASTER) masters++;
+                        if (n.ha.getNodeState()==ManagementNodeState.STANDBY) standbys++;
+                        ManagementNodeSyncRecord m = memento.getManagementNodes().get(n.ownNodeId);
+                        if (m!=null) {
+                            if (m.getStatus()==ManagementNodeState.MASTER) savedMasters++;
+                            if (m.getStatus()==ManagementNodeState.STANDBY) savedStandbys++;
+                        }
                     }
-                }
-                log.info("starting "+nodes.size()+" nodes: "+masters+" M + "+standbys+" zzz; "
-                    + memento.getManagementNodes().size()+" saved, "
+                    log.info("while starting "+nodes.size()+" nodes: "+masters+" M + "+standbys+" zzz; "
+                        + memento.getManagementNodes().size()+" saved, "
                         + memento.getMasterNodeId()+" master, "+savedMasters+" M + "+savedStandbys+" zzz");
-                
-                return masters==1 && standbys==nodes.size()-1 && savedMasters==1 && savedStandbys==nodes.size()-1;
-            }
-        }).run());
+
+                    if (timer.isRunning() && Duration.of(timer).compareTo(Duration.TEN_SECONDS)>0) {
+                        log.warn("we seem to have a problem stabilizing");  //handy place to set a suspend-VM breakpoint!
+                        timer.stop();
+                    }
+                    return masters==1 && standbys==nodes.size()-1 && savedMasters==1 && savedStandbys==nodes.size()-1;
+                }
+            }).run());
+        } catch (Throwable t) {
+            log.warn("Failed to stabilize (rethrowing): "+t, t);
+            throw Exceptions.propagate(t);
+        }
         
         for (Thread t: spawned)
             t.join(Duration.THIRTY_SECONDS.toMilliseconds());