You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@brooklyn.apache.org by he...@apache.org on 2014/11/19 18:47:57 UTC

[02/13] incubator-brooklyn git commit: tidy think-we-are-master-when-we-are-not discrepancies

tidy think-we-are-master-when-we-are-not discrepancies

fix test which depended on promotion when master-becomes-master-again, and
when considering demotion look at whether we were master *locally* not globally,
on reassertion, rewrite the master file and the change.log;
and throw if we fail going in to hot backup


Project: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/commit/911fb372
Tree: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/tree/911fb372
Diff: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/diff/911fb372

Branch: refs/heads/master
Commit: 911fb37250dce88481deb0c2c8de01eae79ae2a3
Parents: 1241ee1
Author: Alex Heneveld <al...@cloudsoftcorp.com>
Authored: Tue Nov 18 12:18:37 2014 +0000
Committer: Alex Heneveld <al...@cloudsoftcorp.com>
Committed: Wed Nov 19 13:53:06 2014 +0000

----------------------------------------------------------------------
 .../ha/HighAvailabilityManagerImpl.java         | 40 ++++++++++++--------
 ...ntPlaneSyncRecordPersisterToObjectStore.java |  2 +-
 .../HighAvailabilityManagerSplitBrainTest.java  | 15 +++++++-
 3 files changed, 39 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/911fb372/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java b/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java
index a225c1a..77636d0 100644
--- a/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java
+++ b/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java
@@ -61,6 +61,7 @@ import brooklyn.management.internal.ManagementTransitionInfo.ManagementTransitio
 import brooklyn.util.collections.MutableList;
 import brooklyn.util.collections.MutableMap;
 import brooklyn.util.exceptions.Exceptions;
+import brooklyn.util.exceptions.ReferenceWithError;
 import brooklyn.util.task.ScheduledTask;
 import brooklyn.util.task.Tasks;
 import brooklyn.util.text.Strings;
@@ -272,13 +273,14 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
         // TODO Small race in that we first check, and then we'll do checkMaster() on first poll,
         // so another node could have already become master or terminated in that window.
         ManagementNodeSyncRecord existingMaster = hasHealthyMaster();
-        boolean weAreMaster = existingMaster!=null && ownNodeId.equals(existingMaster.getNodeId());
+        boolean weAreRecognisedAsMaster = existingMaster!=null && ownNodeId.equals(existingMaster.getNodeId());
+        boolean weAreMasterLocally = getInternalNodeState()==ManagementNodeState.MASTER;
         
         // catch error in some tests where mgmt context has a different mgmt context
         if (managementContext.getHighAvailabilityManager()!=this)
             throw new IllegalStateException("Cannot start an HA manager on a management context with a different HA manager!");
         
-        if (weAreMaster) {
+        if (weAreMasterLocally) {
             // demotion may be required; do this before triggering an election
             switch (startMode) {
             case MASTER:
@@ -330,7 +332,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
             if (existingMaster == null) {
                 promoteToMaster();
                 LOG.info("Management node "+ownNodeId+" running as HA MASTER explicitly");
-            } else if (!weAreMaster) {
+            } else if (!weAreRecognisedAsMaster) {
                 throw new IllegalStateException("Master already exists; cannot run as master (master "+existingMaster.toVerboseString()+"); "
                     + "to trigger a promotion, set a priority and demote the current master");
             } else {
@@ -409,7 +411,8 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
                 publishHealth();
             }
             try {
-                attemptHotProxy(ManagementNodeState.of(startMode).get());
+                activateHotProxy(ManagementNodeState.of(startMode).get()).get();
+                // error above now throws
                 nodeStateTransitionComplete = true;
                 publishHealth();
                 
@@ -417,11 +420,13 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
                     LOG.info("Management node "+ownNodeId+" now running as HA "+getNodeState()+"; "
                         + managementContext.getApplications().size()+" application"+Strings.s(managementContext.getApplications().size())+" loaded");
                 } else {
+                    // shouldn't come here, we should have gotten an error above
                     LOG.warn("Management node "+ownNodeId+" unable to promote to "+startMode+" (currently "+getNodeState()+"); "
                         + "(see log for further details)");
                 }
             } catch (Exception e) {
                 LOG.warn("Management node "+ownNodeId+" unable to promote to "+startMode+" (currently "+getNodeState()+"); rethrowing: "+Exceptions.collapseText(e));
+                nodeStateTransitionComplete = true;
                 throw Exceptions.propagate(e);
             }
         } else {
@@ -736,7 +741,9 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
         String message = "Management node "+ownNodeId+" detected ";
         String currMasterSummary = currMasterNodeId + "(" + (currMasterNodeRecord==null ? "<none>" : timestampString(currMasterNodeRecord.getRemoteTimestamp())) + ")";
         if (weAreNewMaster && (ownNodeRecord.getStatus() == ManagementNodeState.MASTER)) {
-            LOG.warn(message + "we must reassert master status, as "+currMasterSummary+" attempted to steal");
+            LOG.warn(message + "we must reassert master status, as was stolen and then failed at "+
+                (currMasterNodeRecord==null ? "a node which has gone away" : currMasterSummary));
+            publishPromotionToMaster();
             publishHealth();
             return;
         }
@@ -785,7 +792,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
         try {
             managementContext.getRebindManager().rebind(managementContext.getCatalog().getRootClassLoader(), null, getInternalNodeState());
         } catch (Exception e) {
-            LOG.error("Management node enountered problem during rebind when promoting self to master; demoting to FAILED and rethrowing: "+e);
+            LOG.error("Management node "+managementContext.getManagementNodeId()+" enountered problem during rebind when promoting self to master; demoting to FAILED and rethrowing: "+e);
             demoteTo(ManagementNodeState.FAILED);
             throw Exceptions.propagate(e);
         }
@@ -834,8 +841,11 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
         
         if (toState==ManagementNodeState.HOT_BACKUP || toState==ManagementNodeState.HOT_STANDBY) {
             nodeStateTransitionComplete = false;
-            attemptHotProxy(toState);
-            nodeStateTransitionComplete = true;
+            try {
+                activateHotProxy(toState).get();
+            } finally {
+                nodeStateTransitionComplete = true;
+            }
             publishHealth();
         }
     }
@@ -873,9 +883,9 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
         ((BasicBrooklynCatalog)managementContext.getCatalog()).reset(CatalogDto.newEmptyInstance("<reset-by-ha-status-change>"));
     }
     
-    /** @deprecated since 0.7.0, use {@link #attemptHotProxy(ManagementNodeState)} */ @Deprecated
+    /** @deprecated since 0.7.0, use {@link #activateHotProxy(ManagementNodeState)} */ @Deprecated
     protected boolean attemptHotStandby() {
-        return attemptHotProxy(ManagementNodeState.HOT_STANDBY);
+        return activateHotProxy(ManagementNodeState.HOT_STANDBY).getWithoutError();
     }
     
     /** Starts hot standby or hot backup, in foreground
@@ -884,19 +894,19 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
      * but if it fails, this method will {@link #demoteTo(ManagementNodeState)} {@link ManagementNodeState#FAILED}.
      * <p>
      * @return whether the requested {@link ManagementNodeState} was possible;
-     * (if not, errors should be stored elsewhere) */
-    protected boolean attemptHotProxy(ManagementNodeState toState) {
+     * (if not, errors should be stored elsewhere), callers may want to rethrow */
+    protected ReferenceWithError<Boolean> activateHotProxy(ManagementNodeState toState) {
         try {
             Preconditions.checkState(nodeStateTransitionComplete==false, "Must be in transitioning state to go into "+toState);
             setInternalNodeState(toState);
             managementContext.getRebindManager().startReadOnly(toState);
             
-            return true;
+            return ReferenceWithError.newInstanceWithoutError(true);
         } catch (Exception e) {
             Exceptions.propagateIfFatal(e);
-            LOG.warn("Unable to promote "+ownNodeId+" to "+toState+", switching to FAILED: "+e, e);
+            LOG.warn("Unable to change "+ownNodeId+" to "+toState+", switching to FAILED: "+e, e);
             demoteTo(ManagementNodeState.FAILED);
-            return false;
+            return ReferenceWithError.newInstanceThrowingError(false, e);
         }
     }
     

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/911fb372/core/src/main/java/brooklyn/management/ha/ManagementPlaneSyncRecordPersisterToObjectStore.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/brooklyn/management/ha/ManagementPlaneSyncRecordPersisterToObjectStore.java b/core/src/main/java/brooklyn/management/ha/ManagementPlaneSyncRecordPersisterToObjectStore.java
index c481cbe..07063ff 100644
--- a/core/src/main/java/brooklyn/management/ha/ManagementPlaneSyncRecordPersisterToObjectStore.java
+++ b/core/src/main/java/brooklyn/management/ha/ManagementPlaneSyncRecordPersisterToObjectStore.java
@@ -275,7 +275,7 @@ public class ManagementPlaneSyncRecordPersisterToObjectStore implements Manageme
             if (currentRemoteMaster==null) {
                 // okay to have nothing at remote
             } else if (!currentRemoteMaster.trim().equals(optionalExpectedId.trim())) {
-                LOG.warn("Master at server is "+currentRemoteMaster+"; expected "+optionalExpectedId+" "
+                LOG.warn("Master at server is "+(Strings.isBlank(currentRemoteMaster) ? "<none>" : currentRemoteMaster)+"; expected "+optionalExpectedId+" "
                     + (Strings.isNonBlank(nodeId) ? "and would set as "+nodeId : "and would clear") 
                     + ", so not applying (yet)");
                 return;

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/911fb372/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java b/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java
index 4ead25d..5a521cb 100644
--- a/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java
+++ b/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java
@@ -217,14 +217,25 @@ public class HighAvailabilityManagerSplitBrainTest {
             assertNestedRebindException(e);
         }
 
+        // re-check should re-assert successfully, no rebind expected as he was previously master
+        n1.ha.publishAndCheck(false);
+        ManagementPlaneSyncRecord memento;
+        memento = n1.ha.loadManagementPlaneSyncRecord(true);
+        assertEquals(memento.getManagementNodes().get(n1.ownNodeId).getStatus(), ManagementNodeState.MASTER);
+        assertEquals(memento.getManagementNodes().get(n2.ownNodeId).getStatus(), ManagementNodeState.FAILED);
+
+        // hot backup permitted by the TestEntityFailingRebind
+        n1.ha.changeMode(HighAvailabilityMode.HOT_BACKUP);
+        memento = n1.ha.loadManagementPlaneSyncRecord(true);
+        assertEquals(memento.getManagementNodes().get(n1.ownNodeId).getStatus(), ManagementNodeState.HOT_BACKUP);
         try {
-            n1.ha.publishAndCheck(false);
+            n1.ha.changeMode(HighAvailabilityMode.MASTER);
             fail("n1 rebind failure expected");
         } catch (Exception e) {
             assertNestedRebindException(e);
         }
 
-        ManagementPlaneSyncRecord memento = n1.ha.loadManagementPlaneSyncRecord(true);
+        memento = n1.ha.loadManagementPlaneSyncRecord(true);
         assertEquals(memento.getManagementNodes().get(n1.ownNodeId).getStatus(), ManagementNodeState.FAILED);
         assertEquals(memento.getManagementNodes().get(n2.ownNodeId).getStatus(), ManagementNodeState.FAILED);
     }