You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@brooklyn.apache.org by he...@apache.org on 2014/11/15 01:05:17 UTC

[08/21] incubator-brooklyn git commit: Make metrics on rebind, persistence, and HA available through REST API.

Make metrics on rebind, persistence, and HA available through REST API.

So that a management plane or human can determine server health and historic problems.


Project: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/commit/b9c1b6fc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/tree/b9c1b6fc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/diff/b9c1b6fc

Branch: refs/heads/master
Commit: b9c1b6fca99165f62b712ec7fd7967f7e4b4e283
Parents: 9dd1a95
Author: Alex Heneveld <al...@cloudsoftcorp.com>
Authored: Thu Nov 13 03:42:54 2014 +0000
Committer: Alex Heneveld <al...@cloudsoftcorp.com>
Committed: Thu Nov 13 23:38:53 2014 +0000

----------------------------------------------------------------------
 .../entity/rebind/RebindExceptionHandler.java   |   6 +
 .../brooklyn/entity/rebind/RebindManager.java   |   5 +
 .../management/ha/HighAvailabilityManager.java  |   6 +
 .../rebind/PeriodicDeltaChangeListener.java     |  23 +--
 .../rebind/RebindExceptionHandlerImpl.java      |  54 +++++--
 .../entity/rebind/RebindManagerImpl.java        |  53 ++++++-
 .../persister/BrooklynPersistenceUtils.java     |   1 -
 .../persister/PersistenceActivityMetrics.java   |  83 ++++++++++
 .../ha/HighAvailabilityManagerImpl.java         | 155 ++++++++++++++-----
 .../NonDeploymentManagementContext.java         |   9 ++
 .../main/java/brooklyn/rest/api/ServerApi.java  |   7 +
 .../brooklyn/rest/resources/ServerResource.java |   7 +
 12 files changed, 346 insertions(+), 63 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/b9c1b6fc/api/src/main/java/brooklyn/entity/rebind/RebindExceptionHandler.java
----------------------------------------------------------------------
diff --git a/api/src/main/java/brooklyn/entity/rebind/RebindExceptionHandler.java b/api/src/main/java/brooklyn/entity/rebind/RebindExceptionHandler.java
index 61bcdaa..ce0af83 100644
--- a/api/src/main/java/brooklyn/entity/rebind/RebindExceptionHandler.java
+++ b/api/src/main/java/brooklyn/entity/rebind/RebindExceptionHandler.java
@@ -18,6 +18,8 @@
  */
 package brooklyn.entity.rebind;
 
+import java.util.List;
+
 import brooklyn.basic.BrooklynObject;
 import brooklyn.catalog.CatalogItem;
 import brooklyn.entity.Entity;
@@ -100,4 +102,8 @@ public interface RebindExceptionHandler {
     
     /** invoked after the complete rebind pass, always on success and possibly on failure */
     void onDone();
+    
+    List<Exception> getExceptions();
+    List<String> getWarnings();
+
 }

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/b9c1b6fc/api/src/main/java/brooklyn/entity/rebind/RebindManager.java
----------------------------------------------------------------------
diff --git a/api/src/main/java/brooklyn/entity/rebind/RebindManager.java b/api/src/main/java/brooklyn/entity/rebind/RebindManager.java
index fcd71bc..8f34c5a 100644
--- a/api/src/main/java/brooklyn/entity/rebind/RebindManager.java
+++ b/api/src/main/java/brooklyn/entity/rebind/RebindManager.java
@@ -19,6 +19,7 @@
 package brooklyn.entity.rebind;
 
 import java.util.List;
+import java.util.Map;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 
@@ -124,4 +125,8 @@ public interface RebindManager {
      * setting if the process fails after the clear!) */
     @VisibleForTesting
     public void forcePersistNow(boolean full, @Nullable PersistenceExceptionHandler exceptionHandler);
+
+    /** Metrics about rebind, last success, etc. */
+    public Map<String,Object> getMetrics();
+    
 }

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/b9c1b6fc/api/src/main/java/brooklyn/management/ha/HighAvailabilityManager.java
----------------------------------------------------------------------
diff --git a/api/src/main/java/brooklyn/management/ha/HighAvailabilityManager.java b/api/src/main/java/brooklyn/management/ha/HighAvailabilityManager.java
index de37123..597e407 100644
--- a/api/src/main/java/brooklyn/management/ha/HighAvailabilityManager.java
+++ b/api/src/main/java/brooklyn/management/ha/HighAvailabilityManager.java
@@ -18,6 +18,8 @@
  */
 package brooklyn.management.ha;
 
+import java.util.Map;
+
 import com.google.common.annotations.Beta;
 import com.google.common.annotations.VisibleForTesting;
 
@@ -118,4 +120,8 @@ public interface HighAvailabilityManager {
     
     @VisibleForTesting
     ManagementPlaneSyncRecordPersister getPersister();
+
+    /** Returns a collection of metrics */
+    Map<String,Object> getMetrics();
+    
 }

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/b9c1b6fc/core/src/main/java/brooklyn/entity/rebind/PeriodicDeltaChangeListener.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/brooklyn/entity/rebind/PeriodicDeltaChangeListener.java b/core/src/main/java/brooklyn/entity/rebind/PeriodicDeltaChangeListener.java
index 08ca53c..da35459 100644
--- a/core/src/main/java/brooklyn/entity/rebind/PeriodicDeltaChangeListener.java
+++ b/core/src/main/java/brooklyn/entity/rebind/PeriodicDeltaChangeListener.java
@@ -38,10 +38,10 @@ import brooklyn.entity.Feed;
 import brooklyn.entity.basic.BrooklynTaskTags;
 import brooklyn.entity.basic.EntityInternal;
 import brooklyn.entity.rebind.persister.BrooklynPersistenceUtils;
+import brooklyn.entity.rebind.persister.PersistenceActivityMetrics;
 import brooklyn.internal.BrooklynFeatureEnablement;
 import brooklyn.location.Location;
 import brooklyn.management.ExecutionContext;
-import brooklyn.management.ExecutionManager;
 import brooklyn.management.Task;
 import brooklyn.mementos.BrooklynMementoPersister;
 import brooklyn.policy.Enricher;
@@ -50,7 +50,6 @@ import brooklyn.util.collections.MutableMap;
 import brooklyn.util.collections.MutableSet;
 import brooklyn.util.exceptions.Exceptions;
 import brooklyn.util.exceptions.RuntimeInterruptedException;
-import brooklyn.util.task.BasicExecutionContext;
 import brooklyn.util.task.ScheduledTask;
 import brooklyn.util.task.Tasks;
 import brooklyn.util.time.CountdownTimer;
@@ -59,6 +58,7 @@ import brooklyn.util.time.Time;
 
 import com.google.api.client.util.Lists;
 import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Stopwatch;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Sets;
 
@@ -178,17 +178,14 @@ public class PeriodicDeltaChangeListener implements ChangeListener {
     
     private final Semaphore persistingMutex = new Semaphore(1);
     private final Object startMutex = new Object();
+
+    private PersistenceActivityMetrics metrics;
     
-    /** @deprecated since 0.7.0 pass in an {@link ExecutionContext} and a {@link Duration} */
-    @Deprecated
-    public PeriodicDeltaChangeListener(ExecutionManager executionManager, BrooklynMementoPersister persister, PersistenceExceptionHandler exceptionHandler, long periodMillis) {
-        this(new BasicExecutionContext(executionManager), persister, exceptionHandler, Duration.of(periodMillis, TimeUnit.MILLISECONDS));
-    }
-    
-    public PeriodicDeltaChangeListener(ExecutionContext executionContext, BrooklynMementoPersister persister, PersistenceExceptionHandler exceptionHandler, Duration period) {
+    public PeriodicDeltaChangeListener(ExecutionContext executionContext, BrooklynMementoPersister persister, PersistenceExceptionHandler exceptionHandler, PersistenceActivityMetrics metrics, Duration period) {
         this.executionContext = executionContext;
         this.persister = persister;
         this.exceptionHandler = exceptionHandler;
+        this.metrics = metrics;
         this.period = period;
         
         this.persistPoliciesEnabled = BrooklynFeatureEnablement.isEnabled(BrooklynFeatureEnablement.FEATURE_POLICY_PERSISTENCE_PROPERTY);
@@ -210,20 +207,28 @@ public class PeriodicDeltaChangeListener implements ChangeListener {
                 @Override public Task<Void> call() {
                     return Tasks.<Void>builder().dynamic(false).name("periodic-persister").body(new Callable<Void>() {
                         public Void call() {
+                            Stopwatch timer = Stopwatch.createStarted();
                             try {
                                 persistNow();
+                                metrics.noteSuccess(Duration.of(timer));
                                 return null;
                             } catch (RuntimeInterruptedException e) {
                                 LOG.debug("Interrupted persisting change-delta (rethrowing)", e);
+                                metrics.noteFailure(Duration.of(timer));
+                                metrics.noteError(e.toString());
                                 Thread.currentThread().interrupt();
                                 return null;
                             } catch (Exception e) {
                                 // Don't rethrow: the behaviour of executionManager is different from a scheduledExecutorService,
                                 // if we throw an exception, then our task will never get executed again
                                 LOG.error("Problem persisting change-delta", e);
+                                metrics.noteFailure(Duration.of(timer));
+                                metrics.noteError(e.toString());
                                 return null;
                             } catch (Throwable t) {
                                 LOG.warn("Problem persisting change-delta (rethrowing)", t);
+                                metrics.noteFailure(Duration.of(timer));
+                                metrics.noteError(t.toString());
                                 throw Exceptions.propagate(t);
                             }
                         }}).build();

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/b9c1b6fc/core/src/main/java/brooklyn/entity/rebind/RebindExceptionHandlerImpl.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/brooklyn/entity/rebind/RebindExceptionHandlerImpl.java b/core/src/main/java/brooklyn/entity/rebind/RebindExceptionHandlerImpl.java
index ac41c8b..2b61173 100644
--- a/core/src/main/java/brooklyn/entity/rebind/RebindExceptionHandlerImpl.java
+++ b/core/src/main/java/brooklyn/entity/rebind/RebindExceptionHandlerImpl.java
@@ -41,6 +41,7 @@ import brooklyn.util.collections.QuorumCheck;
 import brooklyn.util.exceptions.Exceptions;
 import brooklyn.util.text.Strings;
 
+import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Sets;
 
@@ -62,9 +63,12 @@ public class RebindExceptionHandlerImpl implements RebindExceptionHandler {
     protected final Set<String> missingFeeds = Sets.newConcurrentHashSet();
     protected final Set<String> missingCatalogItems = Sets.newConcurrentHashSet();
     protected final Set<String> creationFailedIds = Sets.newConcurrentHashSet();
+    
     protected final Set<Exception> addPolicyFailures = Sets.newConcurrentHashSet();
     protected final Set<Exception> loadPolicyFailures = Sets.newConcurrentHashSet();
-    protected final List<Exception> exceptions = Collections.synchronizedList(Lists.<Exception>newArrayList());
+    
+    protected final Set<String> warnings = Collections.synchronizedSet(Sets.<String>newLinkedHashSet());
+    protected final Set<Exception> exceptions = Collections.synchronizedSet(Sets.<Exception>newLinkedHashSet());
     
     protected RebindContext context;
     protected boolean started = false;
@@ -113,6 +117,15 @@ public class RebindExceptionHandlerImpl implements RebindExceptionHandler {
         this.loadPolicyFailureMode = checkNotNull(builder.deserializePolicyFailureMode, "deserializePolicyFailureMode");
         this.danglingRefsQuorumRequiredHealthy = checkNotNull(builder.danglingRefsQuorumRequiredHealthy, "danglingRefsQuorumRequiredHealthy");
     }
+    
+    protected void warn(String message) {
+        warn(message, null);
+    }
+    protected void warn(String message, Throwable optionalError) {
+        if (optionalError==null) LOG.warn(message);
+        else LOG.warn(message, optionalError);
+        warnings.add(message);
+    }
 
     @Override
     public void onStart(RebindContext context) {
@@ -142,7 +155,7 @@ public class RebindExceptionHandlerImpl implements RebindExceptionHandler {
                         loadPolicyFailures.add(new IllegalStateException(errmsg, e));
                         break;
                     case CONTINUE:
-                        LOG.warn(errmsg+"; continuing", e);
+                        warn(errmsg+"; continuing: "+e, e);
                         break;
                     default:
                         throw new IllegalStateException("Unexpected state '"+loadPolicyFailureMode+"' for loadPolicyFailureMode");
@@ -160,7 +173,7 @@ public class RebindExceptionHandlerImpl implements RebindExceptionHandler {
         if (danglingRefFailureMode == RebindManager.RebindFailureMode.FAIL_FAST) {
             throw new IllegalStateException("No entity found with id "+id);
         } else {
-            LOG.warn("No entity found with id "+id+"; returning null");
+            warn("No entity found with id "+id+"; returning null");
             return null;
         }
     }
@@ -171,7 +184,7 @@ public class RebindExceptionHandlerImpl implements RebindExceptionHandler {
         if (danglingRefFailureMode == RebindManager.RebindFailureMode.FAIL_FAST) {
             throw new IllegalStateException("No location found with id "+id);
         } else {
-            LOG.warn("No location found with id "+id+"; returning null");
+            warn("No location found with id "+id+"; returning null");
             return null;
         }
     }
@@ -182,7 +195,7 @@ public class RebindExceptionHandlerImpl implements RebindExceptionHandler {
         if (danglingRefFailureMode == RebindManager.RebindFailureMode.FAIL_FAST) {
             throw new IllegalStateException("No policy found with id "+id);
         } else {
-            LOG.warn("No policy found with id "+id+"; returning null");
+            warn("No policy found with id "+id+"; returning null");
             return null;
         }
     }
@@ -193,7 +206,7 @@ public class RebindExceptionHandlerImpl implements RebindExceptionHandler {
         if (danglingRefFailureMode == RebindManager.RebindFailureMode.FAIL_FAST) {
             throw new IllegalStateException("No enricher found with id "+id);
         } else {
-            LOG.warn("No enricher found with id "+id+"; returning null");
+            warn("No enricher found with id "+id+"; returning null");
             return null;
         }
     }
@@ -204,7 +217,7 @@ public class RebindExceptionHandlerImpl implements RebindExceptionHandler {
         if (danglingRefFailureMode == RebindManager.RebindFailureMode.FAIL_FAST) {
             throw new IllegalStateException("No feed found with id "+id);
         } else {
-            LOG.warn("No feed found with id "+id+"; returning null");
+            warn("No feed found with id "+id+"; returning null");
             return null;
         }
     }
@@ -215,7 +228,7 @@ public class RebindExceptionHandlerImpl implements RebindExceptionHandler {
         if (danglingRefFailureMode == RebindManager.RebindFailureMode.FAIL_FAST) {
             throw new IllegalStateException("No catalog item found with id "+id);
         } else {
-            LOG.warn("No catalog item found with id "+id+"; returning null");
+            warn("No catalog item found with id "+id+"; returning null");
             return null;
         }
     }
@@ -256,7 +269,7 @@ public class RebindExceptionHandlerImpl implements RebindExceptionHandler {
                 addPolicyFailures.add(new IllegalStateException(errmsg, e));
                 break;
             case CONTINUE:
-                LOG.warn(errmsg+"; continuing", e);
+                warn(errmsg+"; continuing", e);
                 creationFailedIds.add(instance.getId());
                 break;
             default:
@@ -282,7 +295,7 @@ public class RebindExceptionHandlerImpl implements RebindExceptionHandler {
             addPolicyFailures.add(new IllegalStateException(errmsg, e));
             break;
         case CONTINUE:
-            LOG.warn(errmsg+"; continuing", e);
+            warn(errmsg+"; continuing", e);
             break;
         default:
             throw new IllegalStateException("Unexpected state '"+addPolicyFailureMode+"' for addPolicyFailureMode");
@@ -301,7 +314,7 @@ public class RebindExceptionHandlerImpl implements RebindExceptionHandler {
             addPolicyFailures.add(new IllegalStateException(errmsg, e));
             break;
         case CONTINUE:
-            LOG.warn(errmsg+"; continuing", e);
+            warn(errmsg+"; continuing", e);
             break;
         default:
             throw new IllegalStateException("Unexpected state '"+addPolicyFailureMode+"' for addPolicyFailureMode");
@@ -320,7 +333,7 @@ public class RebindExceptionHandlerImpl implements RebindExceptionHandler {
             addPolicyFailures.add(new IllegalStateException(errmsg, e));
             break;
         case CONTINUE:
-            LOG.warn(errmsg+"; continuing", e);
+            warn(errmsg+"; continuing", e);
             break;
         default:
             throw new IllegalStateException("Unexpected state '"+addPolicyFailureMode+"' for addPolicyFailureMode");
@@ -349,7 +362,7 @@ public class RebindExceptionHandlerImpl implements RebindExceptionHandler {
                     LOG.debug("Rebind: while interrupted, received "+errmsg+"/"+e+"; throwing interruption", e);
                 throw Exceptions.propagate(new InterruptedException("Detected interruptiong while not sleeping, due to secondary error rebinding: "+errmsg+"/"+e));
             }
-            LOG.warn("Rebind: continuing after "+errmsg, e);
+            warn("Rebind: continuing after "+errmsg, e);
         }
     }
     
@@ -364,6 +377,8 @@ public class RebindExceptionHandlerImpl implements RebindExceptionHandler {
             throw Exceptions.propagate(e);
         
         onDoneImpl(e);
+        exceptions.add(e);
+        
         throw new IllegalStateException("Rebind failed", e); // should have thrown exception above
     }
     
@@ -386,7 +401,7 @@ public class RebindExceptionHandlerImpl implements RebindExceptionHandler {
                 allExceptions.add(new IllegalStateException("Dangling references ("+totalDangling+" of "+totalItems+") present without rebind context"));
             } else {
                 if (!danglingRefsQuorumRequiredHealthy.isQuorate(totalFound, totalItems)) {
-                    LOG.warn("Dangling item"+Strings.s(totalDangling)+" ("+totalDangling+" of "+totalItems+") found on rebind exceeds quorum, assuming failed: "+danglingIds);
+                    warn("Dangling item"+Strings.s(totalDangling)+" ("+totalDangling+" of "+totalItems+") found on rebind exceeds quorum, assuming failed: "+danglingIds);
                     allExceptions.add(new IllegalStateException("Too many dangling references: "+totalDangling+" of "+totalItems));
                 } else {
                     LOG.info("Dangling item"+Strings.s(totalDangling)+" ("+totalDangling+" of "+totalItems+") found on rebind, assuming deleted: "+danglingIds);
@@ -439,4 +454,15 @@ public class RebindExceptionHandlerImpl implements RebindExceptionHandler {
             throw compoundException;
         }
     }
+    
+    @Override
+    public List<Exception> getExceptions() {
+        return ImmutableList.copyOf(exceptions);
+    }
+    
+    @Override
+    public List<String> getWarnings() {
+        return ImmutableList.copyOf(warnings);
+    }
+    
 }

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/b9c1b6fc/core/src/main/java/brooklyn/entity/rebind/RebindManagerImpl.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/brooklyn/entity/rebind/RebindManagerImpl.java b/core/src/main/java/brooklyn/entity/rebind/RebindManagerImpl.java
index 4252a25..e24a818 100644
--- a/core/src/main/java/brooklyn/entity/rebind/RebindManagerImpl.java
+++ b/core/src/main/java/brooklyn/entity/rebind/RebindManagerImpl.java
@@ -58,6 +58,7 @@ import brooklyn.entity.proxying.InternalLocationFactory;
 import brooklyn.entity.proxying.InternalPolicyFactory;
 import brooklyn.entity.rebind.persister.BrooklynMementoPersisterToObjectStore;
 import brooklyn.entity.rebind.persister.BrooklynPersistenceUtils;
+import brooklyn.entity.rebind.persister.PersistenceActivityMetrics;
 import brooklyn.event.feed.AbstractFeed;
 import brooklyn.internal.BrooklynFeatureEnablement;
 import brooklyn.location.Location;
@@ -90,6 +91,7 @@ import brooklyn.mementos.TreeNode;
 import brooklyn.policy.Enricher;
 import brooklyn.policy.Policy;
 import brooklyn.policy.basic.AbstractPolicy;
+import brooklyn.util.collections.MutableList;
 import brooklyn.util.collections.MutableMap;
 import brooklyn.util.collections.QuorumCheck;
 import brooklyn.util.collections.QuorumCheck.QuorumChecks;
@@ -174,7 +176,12 @@ public class RebindManagerImpl implements RebindManager {
     private RebindFailureMode addPolicyFailureMode;
     private RebindFailureMode loadPolicyFailureMode;
     private QuorumCheck danglingRefsQuorumRequiredHealthy;
+    
+    private PersistenceActivityMetrics rebindMetrics = new PersistenceActivityMetrics();
+    private PersistenceActivityMetrics persistMetrics = new PersistenceActivityMetrics();
 
+    Integer firstRebindAppCount, firstRebindEntityCount, firstRebindItemCount;
+    
     /**
      * For tracking if rebinding, for {@link AbstractEnricher#isRebinding()} etc.
      *  
@@ -256,7 +263,7 @@ public class RebindManagerImpl implements RebindManager {
         }
         this.persistenceStoreAccess = checkNotNull(val, "persister");
         
-        this.persistenceRealChangeListener = new PeriodicDeltaChangeListener(managementContext.getServerExecutionContext(), persistenceStoreAccess, exceptionHandler, periodicPersistPeriod);
+        this.persistenceRealChangeListener = new PeriodicDeltaChangeListener(managementContext.getServerExecutionContext(), persistenceStoreAccess, exceptionHandler, persistMetrics, periodicPersistPeriod);
         this.persistencePublicChangeListener = new SafeChangeListener(persistenceRealChangeListener);
         
         if (persistenceRunning) {
@@ -530,8 +537,8 @@ public class RebindManagerImpl implements RebindManager {
             rebindActive.acquire();
         } catch (InterruptedException e1) { Exceptions.propagate(e1); }
         RebindTracker.setRebinding();
+        Stopwatch timer = Stopwatch.createStarted();
         try {
-            Stopwatch timer = Stopwatch.createStarted();
             Reflections reflections = new Reflections(classLoader);
             RebindContextImpl rebindContext = new RebindContextImpl(exceptionHandler, classLoader);
             
@@ -970,6 +977,14 @@ public class RebindManagerImpl implements RebindManager {
             }
 
             exceptionHandler.onDone();
+            
+            rebindMetrics.noteSuccess(Duration.of(timer));
+            noteErrors(exceptionHandler, null);
+            if (firstRebindAppCount==null) {
+                firstRebindAppCount = apps.size();
+                firstRebindEntityCount = rebindContext.getEntities().size();
+                firstRebindItemCount = rebindContext.getAllBrooklynObjects().size();
+            }
 
             if (!isEmpty) {
                 BrooklynLogging.log(LOG, shouldLogRebinding() ? LoggingLevel.INFO : LoggingLevel.DEBUG, 
@@ -990,13 +1005,30 @@ public class RebindManagerImpl implements RebindManager {
             return apps;
 
         } catch (Exception e) {
+            rebindMetrics.noteFailure(Duration.of(timer));
+            
+            Exceptions.propagateIfFatal(e);
+            noteErrors(exceptionHandler, e);
             throw exceptionHandler.onFailed(e);
+            
         } finally {
             rebindActive.release();
             RebindTracker.reset();
         }
     }
 
+    private void noteErrors(final RebindExceptionHandler exceptionHandler, Exception primaryException) {
+        List<Exception> exceptions = exceptionHandler.getExceptions();
+        List<String> warnings = exceptionHandler.getWarnings();
+        if (primaryException!=null || !exceptions.isEmpty() || !warnings.isEmpty()) {
+            List<String> messages = MutableList.<String>of();
+            if (primaryException!=null) messages.add(primaryException.toString());
+            for (Exception e: exceptions) messages.add(e.toString());
+            for (String w: warnings) messages.add(w);
+            rebindMetrics.noteError(messages);
+        }
+    }
+    
     private String findCatalogItemId(ClassLoader cl, Map<String, EntityMementoManifest> entityIdToManifest, EntityMementoManifest entityManifest) {
         if (entityManifest.getCatalogItemId() != null) {
             return entityManifest.getCatalogItemId();
@@ -1418,7 +1450,24 @@ public class RebindManagerImpl implements RebindManager {
     }
 
     @Override
+    public Map<String, Object> getMetrics() {
+        Map<String,Object> result = MutableMap.of();
+
+        result.put("rebind", rebindMetrics.asMap());
+        result.put("persist", persistMetrics.asMap());
+        
+        // include first rebind counts, so we know whether we rebinded or not
+        result.put("firstRebindCounts", MutableMap.of(
+            "applications", firstRebindAppCount,
+            "entities", firstRebindEntityCount,
+            "allItems", firstRebindItemCount));
+        
+        return result;
+    }
+
+    @Override
     public String toString() {
         return super.toString()+"[mgmt="+managementContext.getManagementNodeId()+"]";
     }
+
 }

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/b9c1b6fc/core/src/main/java/brooklyn/entity/rebind/persister/BrooklynPersistenceUtils.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/brooklyn/entity/rebind/persister/BrooklynPersistenceUtils.java b/core/src/main/java/brooklyn/entity/rebind/persister/BrooklynPersistenceUtils.java
index 54990b3..8b5317d 100644
--- a/core/src/main/java/brooklyn/entity/rebind/persister/BrooklynPersistenceUtils.java
+++ b/core/src/main/java/brooklyn/entity/rebind/persister/BrooklynPersistenceUtils.java
@@ -29,7 +29,6 @@ import brooklyn.config.BrooklynServerConfig;
 import brooklyn.config.BrooklynServerPaths;
 import brooklyn.entity.Entity;
 import brooklyn.entity.Feed;
-import brooklyn.entity.basic.AbstractEntity;
 import brooklyn.entity.basic.Entities;
 import brooklyn.entity.basic.EntityInternal;
 import brooklyn.entity.rebind.BrooklynObjectType;

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/b9c1b6fc/core/src/main/java/brooklyn/entity/rebind/persister/PersistenceActivityMetrics.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/brooklyn/entity/rebind/persister/PersistenceActivityMetrics.java b/core/src/main/java/brooklyn/entity/rebind/persister/PersistenceActivityMetrics.java
new file mode 100644
index 0000000..8315156
--- /dev/null
+++ b/core/src/main/java/brooklyn/entity/rebind/persister/PersistenceActivityMetrics.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package brooklyn.entity.rebind.persister;
+
+import java.util.List;
+import java.util.Map;
+
+import brooklyn.util.collections.MutableList;
+import brooklyn.util.collections.MutableMap;
+import brooklyn.util.time.Duration;
+
+public class PersistenceActivityMetrics {
+    
+    final static int MAX_ERRORS = 200;
+    
+    long count=0, failureCount=0;
+    Long lastSuccessTime, lastDuration, lastFailureTime;
+    List<Map<String,Object>> errorMessages = MutableList.of();
+
+    public void noteSuccess(Duration duration) {
+        count++;
+        lastSuccessTime = System.currentTimeMillis();
+        lastDuration = duration.toMilliseconds();
+    }
+    
+    public void noteFailure(Duration duration) {
+        count++;
+        failureCount++;
+        lastFailureTime = System.currentTimeMillis();
+        lastDuration = duration.toMilliseconds();
+    }
+
+    public void noteError(String error) {
+        noteErrorObject(error);
+    }
+    
+    public void noteError(List<?> error) {
+        noteErrorObject(error);
+    }
+    
+    /** error should be json-serializable; exceptions can be problematic */
+    protected synchronized void noteErrorObject(Object error) {
+        errorMessages.add(0, MutableMap.<String,Object>of("error", error, "timestamp", System.currentTimeMillis()));
+        while (errorMessages.size() > MAX_ERRORS) {
+            errorMessages.remove(errorMessages.size()-1);
+        }
+    }
+    
+    public synchronized Map<String,Object> asMap() {
+        Map<String,Object> result = MutableMap.of();
+        result.put("count", count);
+        result.put("lastSuccessTimeUtc", lastSuccessTime);
+        result.put("lastSuccessTimeMillisSince", since(lastSuccessTime));
+        result.put("lastDuration", lastDuration);
+        result.put("failureCount", failureCount);
+        result.put("lastFailureTimeUtc", lastFailureTime);
+        result.put("lastFailureTimeMillisSince", since(lastFailureTime));
+        result.put("errorMessages", MutableList.copyOf(errorMessages));
+        return result;
+    }
+
+    private Long since(Long time) {
+        if (time==null) return null;
+        return System.currentTimeMillis() - time;
+    }
+    
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/b9c1b6fc/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java b/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java
index 62b06c0..73d431d 100644
--- a/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java
+++ b/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java
@@ -23,6 +23,8 @@ import static com.google.common.base.Preconditions.checkState;
 
 import java.io.IOException;
 import java.net.URI;
+import java.util.List;
+import java.util.Map;
 import java.util.concurrent.Callable;
 
 import javax.annotation.Nullable;
@@ -42,6 +44,7 @@ import brooklyn.entity.basic.ConfigKeys;
 import brooklyn.entity.basic.EntityInternal;
 import brooklyn.entity.rebind.RebindManager;
 import brooklyn.entity.rebind.persister.BrooklynPersistenceUtils;
+import brooklyn.entity.rebind.persister.PersistenceActivityMetrics;
 import brooklyn.entity.rebind.plane.dto.BasicManagementNodeSyncRecord;
 import brooklyn.entity.rebind.plane.dto.ManagementPlaneSyncRecordImpl;
 import brooklyn.entity.rebind.plane.dto.ManagementPlaneSyncRecordImpl.Builder;
@@ -54,6 +57,7 @@ import brooklyn.management.internal.LocalEntityManager;
 import brooklyn.management.internal.LocationManagerInternal;
 import brooklyn.management.internal.ManagementContextInternal;
 import brooklyn.management.internal.ManagementTransitionInfo.ManagementTransitionMode;
+import brooklyn.util.collections.MutableList;
 import brooklyn.util.collections.MutableMap;
 import brooklyn.util.exceptions.Exceptions;
 import brooklyn.util.task.ScheduledTask;
@@ -66,6 +70,7 @@ import com.google.common.annotations.Beta;
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Function;
 import com.google.common.base.Preconditions;
+import com.google.common.base.Stopwatch;
 import com.google.common.base.Ticker;
 import com.google.common.collect.Iterables;
 
@@ -132,13 +137,21 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
     private volatile boolean nodeStateTransitionComplete = false;
     private volatile long priority = 0;
     
+    private final static int MAX_NODE_STATE_HISTORY = 200;
+    private final List<Map<String,Object>> nodeStateHistory = MutableList.of();
+    
     private volatile transient Duration pollPeriodLocalOverride;
     private volatile transient Duration heartbeatTimeoutOverride;
 
     private volatile ManagementPlaneSyncRecord lastSyncRecord;
     
+    private volatile PersistenceActivityMetrics managementStateWritePersistenceMetrics = new PersistenceActivityMetrics();
+    private volatile PersistenceActivityMetrics managementStateReadPersistenceMetrics = new PersistenceActivityMetrics();
+    private final long startTimeUtc;
+    
     public HighAvailabilityManagerImpl(ManagementContextInternal managementContext) {
         this.managementContext = managementContext;
+        startTimeUtc = localTickerUtc.read(); // captured once here; getMetrics() reports it as "startTimeUtc" and derives "uptime" from it
     }
 
     @Override
@@ -228,9 +241,9 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
     @Override
     public void start(HighAvailabilityMode startMode) {
         nodeStateTransitionComplete = true;
-        // always start in standby; it may get promoted to master or hot_standby in this method
+        // always start in standby, unless hot backup; it may get promoted to master or hot_standby in this method
         // (depending on startMode; but for startMode STANDBY or HOT_STANDBY it will not promote until the next election)
-        nodeState = ManagementNodeState.STANDBY;
+        setInternalNodeState(startMode==HighAvailabilityMode.HOT_BACKUP ? ManagementNodeState.HOT_BACKUP : ManagementNodeState.STANDBY);
         disabled = false;
         running = true;
         changeMode(startMode, true, true);
@@ -246,13 +259,13 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
     public void changeMode(HighAvailabilityMode startMode, boolean preventElectionOnExplicitStandbyMode, boolean failOnExplicitStandbyModeIfNoMaster) {
         if (!running) {
             // if was not running then start as disabled mode, then proceed as normal
-            LOG.info("HA changing mode to "+startMode+" from "+nodeState+" when not running, forcing an intermediate start as DISABLED then will convert to "+startMode);
+            LOG.info("HA changing mode to "+startMode+" from "+getInternalNodeState()+" when not running, forcing an intermediate start as DISABLED then will convert to "+startMode);
             start(HighAvailabilityMode.DISABLED);
         }
         if (getNodeState()==ManagementNodeState.FAILED || getNodeState()==ManagementNodeState.INITIALIZING) {
             if (startMode!=HighAvailabilityMode.DISABLED) {
                 // if coming from FAILED (or INITIALIZING because we skipped start call) then treat as cold standby
-                nodeState = ManagementNodeState.STANDBY; 
+                setInternalNodeState(ManagementNodeState.STANDBY);
             }
         }
         
@@ -272,7 +285,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
             case MASTER:
             case AUTO:
             case DISABLED:
-                // no action needed, will do anything necessary below
+                // no action needed, will do anything necessary below (or above)
                 break;
             case HOT_STANDBY: 
             case HOT_BACKUP: 
@@ -288,17 +301,18 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
         case AUTO:
             // don't care; let's start and see if we promote ourselves
             publishAndCheck(true);
-            switch (nodeState) {
+            switch (getInternalNodeState()) {
             case HOT_BACKUP:
-                if (!nodeStateTransitionComplete) throw new IllegalStateException("Cannot switch to AUTO when in the middle of a transition to "+nodeState);
-                // else change us to hot standby and continue to below
-                nodeState = ManagementNodeState.HOT_STANDBY;
+                if (!nodeStateTransitionComplete) throw new IllegalStateException("Cannot switch to AUTO when in the middle of a transition to "+getInternalNodeState());
+                // else change us to standby, desiring to go to hot standby, and continue to below
+                setInternalNodeState(ManagementNodeState.STANDBY);
+                startMode = HighAvailabilityMode.HOT_STANDBY;
             case HOT_STANDBY:
             case STANDBY:
                 ManagementPlaneSyncRecord newState = loadManagementPlaneSyncRecord(true);
                 String masterNodeId = newState.getMasterNodeId();
                 ManagementNodeSyncRecord masterNodeDetails = newState.getManagementNodes().get(masterNodeId);
-                LOG.info("Management node "+ownNodeId+" running as HA " + nodeState + " autodetected, " +
+                LOG.info("Management node "+ownNodeId+" running as HA " + getInternalNodeState() + " autodetected, " +
                     (Strings.isBlank(masterNodeId) ? "no master currently (other node should promote itself soon)" : "master "
                         + (existingMaster==null ? "(new) " : "")
                         + "is "+masterNodeId +
@@ -308,7 +322,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
                 LOG.info("Management node "+ownNodeId+" running as HA MASTER autodetected");
                 break;
             default:
-                throw new IllegalStateException("Management node "+ownNodeId+" set to HA AUTO, encountered unexpected mode "+nodeState);
+                throw new IllegalStateException("Management node "+ownNodeId+" set to HA AUTO, encountered unexpected mode "+getInternalNodeState());
             }
             break;
         case MASTER:
@@ -322,16 +336,23 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
                 LOG.info("Management node "+ownNodeId+" already running as HA MASTER, when set explicitly");
             }
             break;
+        case HOT_BACKUP:
         case STANDBY:
         case HOT_STANDBY:
-            if (!preventElectionOnExplicitStandbyMode)
-                publishAndCheck(true);
-            if (failOnExplicitStandbyModeIfNoMaster && existingMaster==null) {
-                LOG.error("Management node "+ownNodeId+" detected no master when "+startMode+" requested and existing master required; failing.");
-                throw new IllegalStateException("No existing master; cannot start as "+startMode);
+            if (getInternalNodeState()==ManagementNodeState.STANDBY || getInternalNodeState()==ManagementNodeState.HOT_STANDBY) {
+                if (!preventElectionOnExplicitStandbyMode)
+                    publishAndCheck(true);
+                if (failOnExplicitStandbyModeIfNoMaster && existingMaster==null) {
+                    LOG.error("Management node "+ownNodeId+" detected no master when "+startMode+" requested and existing master required; failing.");
+                    throw new IllegalStateException("No existing master; cannot start as "+startMode);
+                }
+            }
+            if (startMode==HighAvailabilityMode.HOT_BACKUP) {
+                setInternalNodeState(ManagementNodeState.HOT_BACKUP);
+            } else {
+                setInternalNodeState(ManagementNodeState.STANDBY);
+                // might jump to hot_standby next
             }
-            // continue to below (above lines skipped for hot backup)
-        case HOT_BACKUP:
             String message = "Management node "+ownNodeId+" running as HA "+getNodeState()+" (";
             if (getNodeState().toString().equals(startMode.toString()))
                 message += "explicitly requested";
@@ -355,7 +376,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
             break;
         case DISABLED:
             // safe just to run even if we weren't master
-            LOG.info("Management node "+ownNodeId+" HA DISABLED (was "+nodeState+")");
+            LOG.info("Management node "+ownNodeId+" HA DISABLED (was "+getInternalNodeState()+")");
             demoteTo(ManagementNodeState.FAILED);
             if (pollingTask!=null) pollingTask.cancel(true);
             break;
@@ -370,7 +391,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
                 startMode = HighAvailabilityMode.STANDBY;
             }
         }
-        if ((nodeState==ManagementNodeState.STANDBY && startMode==HighAvailabilityMode.HOT_STANDBY) || 
+        if ((getInternalNodeState()==ManagementNodeState.STANDBY && startMode==HighAvailabilityMode.HOT_STANDBY) || 
                 (startMode==HighAvailabilityMode.HOT_BACKUP)) {
             nodeStateTransitionComplete = false;
             if (startMode==HighAvailabilityMode.HOT_STANDBY) {
@@ -422,7 +443,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
         boolean wasRunning = running;
         
         running = false;
-        nodeState = newState;
+        setInternalNodeState(newState);
         if (pollingTask != null) pollingTask.cancel(true);
         
         if (wasRunning) {
@@ -437,22 +458,40 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
     
     /** returns the node state this node is trying to be in */
     public ManagementNodeState getTransitionTargetNodeState() {
+        return getInternalNodeState(); // the stored state as-is, without the adjustments getNodeState() applies
+    }
+    
+    protected ManagementNodeState getInternalNodeState() { // plain read of the nodeState field; writes go through setInternalNodeState
         return nodeState;
     }
     
+    /** stores the new state; each actual change is also recorded (newest first, at most MAX_NODE_STATE_HISTORY entries) in nodeStateHistory */
+    protected void setInternalNodeState(ManagementNodeState newState) {
+        synchronized (nodeStateHistory) {
+            if (newState != this.nodeState) {
+                nodeStateHistory.add(0, MutableMap.<String,Object>of("state", newState, "timestamp", currentTimeMillis()));
+                for (int last = nodeStateHistory.size()-1; last >= MAX_NODE_STATE_HISTORY; last--) {
+                    nodeStateHistory.remove(last);
+                }
+            }
+            this.nodeState = newState;
+        }
+    }
+
     @SuppressWarnings("deprecation")
     @Override
     public ManagementNodeState getNodeState() {
-        if (nodeState==ManagementNodeState.FAILED) return nodeState;
+        ManagementNodeState myNodeState = getInternalNodeState(); // read once, so every check below sees the same value
+        if (myNodeState==ManagementNodeState.FAILED) return myNodeState; // return the value just tested, not a second read that may have changed
         // if target is master then we claim already being master, to prevent other nodes from taking it
         // (we may fail subsequently of course)
-        if (nodeState==ManagementNodeState.MASTER) return nodeState;
+        if (myNodeState==ManagementNodeState.MASTER) return myNodeState;
         
         // for backwards compatibility; remove in 0.8.0
-        if (nodeState==ManagementNodeState.UNINITIALISED) return ManagementNodeState.INITIALIZING;
+        if (myNodeState==ManagementNodeState.UNINITIALISED) return ManagementNodeState.INITIALIZING;
         
         if (!nodeStateTransitionComplete) return ManagementNodeState.INITIALIZING;
-        return nodeState;
+        return myNodeState; // transition complete: report the state read at the top
     }
 
     public ManagementPlaneSyncRecord getLastManagementPlaneSyncRecord() {
@@ -516,10 +555,19 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
             return;
         }
         
-        ManagementNodeSyncRecord memento = createManagementNodeSyncRecord(false);
-        Delta delta = ManagementPlaneSyncRecordDeltaImpl.builder().node(memento).build();
-        persister.delta(delta);
-        if (LOG.isTraceEnabled()) LOG.trace("Published management-node health: {}", memento);
+        Stopwatch timer = Stopwatch.createStarted();
+        try {
+            ManagementNodeSyncRecord memento = createManagementNodeSyncRecord(false);
+            Delta delta = ManagementPlaneSyncRecordDeltaImpl.builder().node(memento).build();
+            persister.delta(delta);
+            managementStateWritePersistenceMetrics.noteSuccess(Duration.of(timer));
+            if (LOG.isTraceEnabled()) LOG.trace("Published management-node health: {}", memento);
+        } catch (Throwable t) {
+            managementStateWritePersistenceMetrics.noteFailure(Duration.of(timer));
+            managementStateWritePersistenceMetrics.noteError(t.toString());
+            LOG.debug("Error publishing management-node health (rethrowing): "+t);
+            throw Exceptions.propagate(t);
+        }
     }
     
     protected synchronized void publishDemotion(boolean demotingFromMaster) {
@@ -707,8 +755,8 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
                 LOG.warn("Problem in promption-listener (continuing)", e);
             }
         }
-        boolean wasHot = (nodeState==ManagementNodeState.HOT_STANDBY || nodeState==ManagementNodeState.HOT_BACKUP);
-        nodeState = ManagementNodeState.MASTER;
+        boolean wasHot = (getInternalNodeState()==ManagementNodeState.HOT_STANDBY || getInternalNodeState()==ManagementNodeState.HOT_BACKUP);
+        setInternalNodeState(ManagementNodeState.MASTER);
         publishPromotionToMaster();
         try {
             if (wasHot) {
@@ -717,7 +765,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
                 managementContext.getRebindManager().stopReadOnly();
                 clearManagedItems(ManagementTransitionMode.REBINDING_DESTROYED);
             }
-            managementContext.getRebindManager().rebind(managementContext.getCatalog().getRootClassLoader(), null, nodeState);
+            managementContext.getRebindManager().rebind(managementContext.getCatalog().getRootClassLoader(), null, getInternalNodeState());
         } catch (Exception e) {
             LOG.error("Management node enountered problem during rebind when promoting self to master; demoting to FAILED and rethrowing: "+e);
             demoteTo(ManagementNodeState.FAILED);
@@ -746,7 +794,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
             LOG.warn("Ignoring demote-from-master request, as HighAvailabilityManager is no longer running");
             return;
         }
-        boolean wasMaster = nodeState == ManagementNodeState.MASTER;
+        boolean wasMaster = (getInternalNodeState() == ManagementNodeState.MASTER);
         if (wasMaster) backupOnDemotionIfNeeded();
         ManagementTransitionMode mode = (wasMaster ? ManagementTransitionMode.REBINDING_NO_LONGER_PRIMARY : ManagementTransitionMode.REBINDING_DESTROYED);
 
@@ -756,9 +804,9 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
         case FAILED: 
         case HOT_BACKUP:
         case STANDBY:
-            nodeState = toState; break;
+            setInternalNodeState(toState); break;
         case HOT_STANDBY:
-            nodeState = ManagementNodeState.STANDBY; break;
+            setInternalNodeState(ManagementNodeState.STANDBY); break;
         default:
             throw new IllegalStateException("Illegal target state: "+toState);
         }
@@ -822,7 +870,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
     protected boolean attemptHotProxy(ManagementNodeState toState) {
         try {
             Preconditions.checkState(nodeStateTransitionComplete==false, "Must be in transitioning state to go into "+toState);
-            nodeState = toState;
+            setInternalNodeState(toState);
             managementContext.getRebindManager().startReadOnly(toState);
             
             return true;
@@ -859,6 +907,8 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
         
         int maxLoadAttempts = 5;
         Exception lastException = null;
+        Stopwatch timer = Stopwatch.createStarted();
+
         for (int i = 0; i < maxLoadAttempts; i++) {
             try {
                 ManagementPlaneSyncRecord result = persister.loadSyncRecord();
@@ -881,6 +931,11 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
                     }
                     result = builder.build();
                 }
+                
+                if (i>0) {
+                    managementStateReadPersistenceMetrics.noteError("Succeeded only on attempt "+(i+1)+": "+lastException);
+                }
+                managementStateReadPersistenceMetrics.noteSuccess(Duration.of(timer));
                 return result;
             } catch (IOException e) {
                 if (i < (maxLoadAttempts - 1)) {
@@ -889,7 +944,11 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
                 lastException = e;
             }
         }
-        throw new IllegalStateException("Failed to load mangement-plane memento "+maxLoadAttempts+" consecutive times", lastException);
+        String message = "Failed to load mangement-plane memento "+maxLoadAttempts+" consecutive times";
+        managementStateReadPersistenceMetrics.noteError(message+": "+lastException);
+        managementStateReadPersistenceMetrics.noteFailure(Duration.of(timer));
+
+        throw new IllegalStateException(message, lastException);
     }
 
     protected ManagementNodeSyncRecord createManagementNodeSyncRecord(boolean useLocalTimestampAsRemoteTimestamp) {
@@ -944,4 +1003,26 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager {
     public String toString() {
         return super.toString()+"[node:"+ownNodeId+";running="+running+"]";
     }
+    
+    @Override
+    public Map<String,Object> getMetrics() {
+        // summary of node state, uptime, HA settings and state history, plus rebind and management-state persistence metrics
+        Map<String,Object> result = MutableMap.of();
+        // copy the history under the lock setInternalNodeState holds while changing it, so consumers of the map never touch the live list
+        List<Map<String,Object>> historySnapshot;
+        synchronized (nodeStateHistory) { historySnapshot = MutableList.copyOf(nodeStateHistory); }
+        result.put("state", getNodeState());
+        result.put("uptime", Time.makeTimeStringRounded(Duration.millis(currentTimeMillis()-startTimeUtc)));
+        result.put("currentTimeUtc", currentTimeMillis());
+        result.put("startTimeUtc", startTimeUtc);
+        result.put("highAvailability", MutableMap.<String,Object>of(
+            "priority", getPriority(),
+            "pollPeriod", getPollPeriod().toMilliseconds(),
+            "heartbeatTimeout", getHeartbeatTimeout().toMilliseconds(),
+            "history", historySnapshot));
+        result.putAll(managementContext.getRebindManager().getMetrics()); // rebind manager entries are merged in at the top level
+        result.put("managementStatePersistence", MutableMap.of("read", managementStateReadPersistenceMetrics, "write", managementStateWritePersistenceMetrics));
+        return result;
+    }
+    
 }

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/b9c1b6fc/core/src/main/java/brooklyn/management/internal/NonDeploymentManagementContext.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/brooklyn/management/internal/NonDeploymentManagementContext.java b/core/src/main/java/brooklyn/management/internal/NonDeploymentManagementContext.java
index 2771475..c04a31c 100644
--- a/core/src/main/java/brooklyn/management/internal/NonDeploymentManagementContext.java
+++ b/core/src/main/java/brooklyn/management/internal/NonDeploymentManagementContext.java
@@ -530,6 +530,11 @@ public class NonDeploymentManagementContext implements ManagementContextInternal
         public BrooklynMementoRawData retrieveMementoRawData() {
             throw new IllegalStateException("Non-deployment context "+NonDeploymentManagementContext.this+" is not valid for this operation.");
         }
+
+        @Override
+        public Map<String, Object> getMetrics() { // always throws: this non-deployment context does not support the operation
+            throw new IllegalStateException("Non-deployment context "+NonDeploymentManagementContext.this+" is not valid for this operation.");
+        }
     }
 
     /**
@@ -590,6 +595,10 @@ public class NonDeploymentManagementContext implements ManagementContextInternal
         public long getPriority() {
             throw new IllegalStateException("Non-deployment context "+NonDeploymentManagementContext.this+" is not valid for this operation.");
         }
+        @Override
+        public Map<String, Object> getMetrics() { // always throws: this non-deployment context does not support the operation
+            throw new IllegalStateException("Non-deployment context "+NonDeploymentManagementContext.this+" is not valid for this operation.");
+        }
     }
     
 }

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/b9c1b6fc/usage/rest-api/src/main/java/brooklyn/rest/api/ServerApi.java
----------------------------------------------------------------------
diff --git a/usage/rest-api/src/main/java/brooklyn/rest/api/ServerApi.java b/usage/rest-api/src/main/java/brooklyn/rest/api/ServerApi.java
index 7977ea8..246e31a 100644
--- a/usage/rest-api/src/main/java/brooklyn/rest/api/ServerApi.java
+++ b/usage/rest-api/src/main/java/brooklyn/rest/api/ServerApi.java
@@ -18,6 +18,8 @@
  */
 package brooklyn.rest.api;
 
+import java.util.Map;
+
 import javax.ws.rs.Consumes;
 import javax.ws.rs.DefaultValue;
 import javax.ws.rs.FormParam;
@@ -97,6 +99,11 @@ public interface ServerApi {
     @ApiOperation(value = "Returns the HA state of this management node")
     public ManagementNodeState getHighAvailabilityNodeState();
     
+    @GET
+    @Path("/ha/metrics")
+    @ApiOperation(value = "Returns a collection of HA metrics")
+    public Map<String,Object> getHighAvailabilityMetrics(); // read-only (GET) counterpart to the other /ha/... operations declared in this interface
+    
     @POST
     @Path("/ha/state")
     @ApiOperation(value = "Changes the HA state of this management node")

http://git-wip-us.apache.org/repos/asf/incubator-brooklyn/blob/b9c1b6fc/usage/rest-server/src/main/java/brooklyn/rest/resources/ServerResource.java
----------------------------------------------------------------------
diff --git a/usage/rest-server/src/main/java/brooklyn/rest/resources/ServerResource.java b/usage/rest-server/src/main/java/brooklyn/rest/resources/ServerResource.java
index 87fce42..50f99e1 100644
--- a/usage/rest-server/src/main/java/brooklyn/rest/resources/ServerResource.java
+++ b/usage/rest-server/src/main/java/brooklyn/rest/resources/ServerResource.java
@@ -23,6 +23,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map;
 import java.util.Properties;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
@@ -57,6 +58,7 @@ import brooklyn.rest.domain.VersionSummary;
 import brooklyn.rest.transform.HighAvailabilityTransformer;
 import brooklyn.rest.util.WebResourceUtils;
 import brooklyn.util.ResourceUtils;
+import brooklyn.util.collections.MutableMap;
 import brooklyn.util.exceptions.Exceptions;
 import brooklyn.util.file.ArchiveBuilder;
 import brooklyn.util.flags.TypeCoercions;
@@ -255,6 +257,11 @@ public class ServerResource extends AbstractBrooklynRestResource implements Serv
     }
 
     @Override
+    public Map<String, Object> getHighAvailabilityMetrics() {
+        return mgmt().getHighAvailabilityManager().getMetrics(); // straight delegation; the map is returned as the HA manager built it
+    }
+    
+    @Override
     public long getHighAvailabitlityPriority() {
         return mgmt().getHighAvailabilityManager().getPriority();
     }