You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by zh...@apache.org on 2018/03/02 06:35:39 UTC

[30/50] [abbrv] hbase git commit: HBASE-19598 Fix TestAssignmentManagerMetrics flaky test

HBASE-19598 Fix TestAssignmentManagerMetrics flaky test

Master never left waitForMasterActive because it never checked state of
the clusterUp flag. The test here was aborting regionserver and then
just exiting. The minihbasecluster shutdown sets the cluster down flag
but we were never looking at it so Master thread was staying up.

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
The tableOnMaster check in waitForMasterActive looks wrong. It was
making it so a 'normal' Master was getting stuck in here. This is not
the place to worry about tablesOnMaster. That is for the balancer to be
concerned with. There is a problem with Master hosting
system-tables-only. After further study, Master can carry regions like a
regionserver but making it so it carries system tables only is tricky
given meta assign happens ahead of all others which means that the
Master needs to have checked-in as a regionserver super early... It
needs work. Punted for now.

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
Mostly renaming so lists and maps of region infos have same name as they
have elsewhere in code base and cleaning up confusion that may arise
when we talk of servers-for-system-tables....It is talking about
something else in the code changes here that is other than the normal
understanding. It is about filtering regionservers by their version
numbers so we favor regions with higher version numbers. Needs to go
back up into the balancer.

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java
It was possible for the Master to be given regions if no regionservers
available (as per the failing unit test in this case).

M hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
Minor reordering moving the waitForMasterActive later in the initialize
and wrapping each test in a check if we are to keep looping (which
checks cluster status flag).

M hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java
This was an old test from the days when Master carried system tables.
Updated test and fixed metrics. Metrics count the hbase:meta along with
the userspace region so upped expected numbers (previously the
hbase:meta was hosted on the master so metrics were not incremented).

M hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestRegionsOnMasterOptions.java
I took a look at this test again but nope, needs a load of work still to
make it pass.

M hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKUtil.java
Stop being so whiney.


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/016cf0c6
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/016cf0c6
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/016cf0c6

Branch: refs/heads/HBASE-19064
Commit: 016cf0c64ba867f740a19472f0fe716ee67306b1
Parents: 372c88a
Author: Michael Stack <st...@apache.org>
Authored: Wed Feb 28 22:08:49 2018 -0800
Committer: Michael Stack <st...@apache.org>
Committed: Wed Feb 28 22:25:24 2018 -0800

----------------------------------------------------------------------
 .../apache/hadoop/hbase/ipc/RpcExecutor.java    |  2 +-
 .../org/apache/hadoop/hbase/master/HMaster.java | 10 ++--
 .../hadoop/hbase/master/ServerManager.java      |  2 +-
 .../hadoop/hbase/master/SplitLogManager.java    |  6 +--
 .../master/assignment/AssignmentManager.java    | 57 +++++++++++---------
 .../hbase/master/balancer/BaseLoadBalancer.java | 10 +++-
 .../hbase/regionserver/HRegionServer.java       | 34 ++++++------
 .../hadoop/hbase/TestClientClusterMetrics.java  |  8 ++-
 .../hadoop/hbase/TestClientClusterStatus.java   |  2 +-
 .../master/TestAssignmentManagerMetrics.java    | 39 ++++++--------
 .../balancer/TestRegionsOnMasterOptions.java    |  5 +-
 .../apache/hadoop/hbase/zookeeper/ZKUtil.java   | 10 ++--
 12 files changed, 91 insertions(+), 94 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/016cf0c6/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/RpcExecutor.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/RpcExecutor.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/RpcExecutor.java
index 7470758..c3f613e 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/RpcExecutor.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/RpcExecutor.java
@@ -327,7 +327,7 @@ public abstract class RpcExecutor {
           int failedCount = failedHandlerCount.incrementAndGet();
           if (this.handlerFailureThreshhold >= 0
               && failedCount > handlerCount * this.handlerFailureThreshhold) {
-            String message = "Number of failed RpcServer handler runs exceeded threshhold "
+            String message = "Number of failed RpcServer handler runs exceeded threshold "
                 + this.handlerFailureThreshhold + "; reason: " + StringUtils.stringifyException(e);
             if (abortable != null) {
               abortable.abort(message, e);

http://git-wip-us.apache.org/repos/asf/hbase/blob/016cf0c6/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index b0dd0b4..3d69888 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -653,14 +653,12 @@ public class HMaster extends HRegionServer implements MasterServices {
   }
 
   /**
-   * If configured to put regions on active master,
-   * wait till a backup master becomes active.
-   * Otherwise, loop till the server is stopped or aborted.
+   * Wait here if backup Master. This avoids showing backup masters as regionservers in master
+   * web UI, or assigning any region to them.
    */
   @Override
-  protected void waitForMasterActive(){
-    boolean tablesOnMaster = LoadBalancer.isTablesOnMaster(conf);
-    while (!(tablesOnMaster && activeMaster) && !isStopped() && !isAborted()) {
+  protected void waitForMasterActive() {
+    while (!this.activeMaster && keepLooping()) {
       sleeper.sleep();
     }
   }

http://git-wip-us.apache.org/repos/asf/hbase/blob/016cf0c6/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index 06d6c8b..6488559 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -884,7 +884,7 @@ public class ServerManager {
     if (isClusterShutdown()) {
       this.master.stop("Cluster shutdown");
     }
-    LOG.info("Finished waiting on RegionServer count=" + count + "; waited=" + slept + "ms," +
+    LOG.info("RegionServer count=" + count + "; waited=" + slept + "ms," +
         " expected min=" + minToStart + " server(s), max=" +  getStrForMax(maxToStart) + " server(s),"+
         " master is "+ (this.master.isStopped() ? "stopped.": "running"));
   }

http://git-wip-us.apache.org/repos/asf/hbase/blob/016cf0c6/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
index 2e2f8bf..adda8c0 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
@@ -260,9 +260,7 @@ public class SplitLogManager {
     if (batch.done != batch.installed) {
       batch.isDead = true;
       SplitLogCounters.tot_mgr_log_split_batch_err.increment();
-      LOG.warn("error while splitting logs in " + logDirs + " installed = " + batch.installed
-          + " but only " + batch.done + " done");
-      String msg = "error or interrupted while splitting logs in " + logDirs + " Task = " + batch;
+      String msg = "Error or interrupted while splitting WALs in " + logDirs + "; task=" + batch;
       status.abort(msg);
       throw new IOException(msg);
     }
@@ -476,7 +474,7 @@ public class SplitLogManager {
 
     @Override
     public String toString() {
-      return ("installed = " + installed + " done = " + done + " error = " + error);
+      return ("installed=" + installed + ", done=" + done + ", error=" + error);
     }
   }
 

http://git-wip-us.apache.org/repos/asf/hbase/blob/016cf0c6/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index a48ed75..58edfda 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -1727,15 +1727,15 @@ public class AssignmentManager implements ServerListener {
     }
 
     // TODO: Optimize balancer. pass a RegionPlan?
-    final HashMap<RegionInfo, ServerName> retainMap = new HashMap<>();
-    final List<RegionInfo> userHRIs = new ArrayList<>(regions.size());
+    final HashMap<RegionInfo, ServerName> retainPlan = new HashMap<>();
+    final List<RegionInfo> userRegionInfos = new ArrayList<>();
     // Regions for system tables requiring reassignment
-    final List<RegionInfo> systemHRIs = new ArrayList<>();
+    final List<RegionInfo> systemRegionInfos = new ArrayList<>();
     for (RegionStateNode regionStateNode: regions.values()) {
       boolean sysTable = regionStateNode.isSystemTable();
-      final List<RegionInfo> hris = sysTable? systemHRIs: userHRIs;
+      final List<RegionInfo> hris = sysTable? systemRegionInfos: userRegionInfos;
       if (regionStateNode.getRegionLocation() != null) {
-        retainMap.put(regionStateNode.getRegionInfo(), regionStateNode.getRegionLocation());
+        retainPlan.put(regionStateNode.getRegionInfo(), regionStateNode.getRegionLocation());
       } else {
         hris.add(regionStateNode.getRegionInfo());
       }
@@ -1748,38 +1748,42 @@ public class AssignmentManager implements ServerListener {
     for (int i = 0; servers.size() < 1; ++i) {
       // Report every fourth time around this loop; try not to flood log.
       if (i % 4 == 0) {
-        LOG.warn("No servers available; cannot place " + regions.size() + " unassigned regions.");
+        // Log every 4th time; we wait 250ms below so means every second.
+        LOG.warn("No server available; unable to find a location for " + regions.size() +
+            " regions. waiting...");
       }
 
       if (!isRunning()) {
-        LOG.debug("Stopped! Dropping assign of " + regions.size() + " queued regions.");
+        LOG.debug("Aborting assignment-queue with " + regions.size() + " unassigned");
         return;
       }
       Threads.sleep(250);
+      // Refresh server list.
       servers = master.getServerManager().createDestinationServersList();
     }
 
-    if (!systemHRIs.isEmpty()) {
+    if (!systemRegionInfos.isEmpty()) {
       // System table regions requiring reassignment are present, get region servers
-      // not available for system table regions
+      // not available for system table regions. Here we are filtering out any regionservers
+      // that might be running older versions of the RegionServer; we want system tables on any
+      // newer servers that may be present. Newer servers means we are probably doing a rolling
+      // upgrade.
       final List<ServerName> excludeServers = getExcludedServersForSystemTable();
       List<ServerName> serversForSysTables = servers.stream()
           .filter(s -> !excludeServers.contains(s)).collect(Collectors.toList());
       if (serversForSysTables.isEmpty()) {
-        LOG.warn("Filtering old server versions and the excluded produced an empty set; " +
-            "instead considering all candidate servers!");
+        LOG.warn("All servers excluded! Considering all servers!");
       }
-      LOG.debug("Processing assignQueue; systemServersCount=" + serversForSysTables.size() +
-          ", allServersCount=" + servers.size());
-      processAssignmentPlans(regions, null, systemHRIs,
+      LOG.debug("Candidate servers to host system regions=" + serversForSysTables.size() +
+          "; totalServersCount=" + servers.size());
+      processAssignmentPlans(regions, null, systemRegionInfos,
           serversForSysTables.isEmpty()? servers: serversForSysTables);
     }
-
-    processAssignmentPlans(regions, retainMap, userHRIs, servers);
+    processAssignmentPlans(regions, retainPlan, userRegionInfos, servers);
   }
 
   private void processAssignmentPlans(final HashMap<RegionInfo, RegionStateNode> regions,
-      final HashMap<RegionInfo, ServerName> retainMap, final List<RegionInfo> hris,
+      final HashMap<RegionInfo, ServerName> retain, final List<RegionInfo> hris,
       final List<ServerName> servers) {
     boolean isTraceEnabled = LOG.isTraceEnabled();
     if (isTraceEnabled) {
@@ -1788,15 +1792,15 @@ public class AssignmentManager implements ServerListener {
 
     final LoadBalancer balancer = getBalancer();
     // ask the balancer where to place regions
-    if (retainMap != null && !retainMap.isEmpty()) {
+    if (retain != null && !retain.isEmpty()) {
       if (isTraceEnabled) {
-        LOG.trace("retain assign regions=" + retainMap);
+        LOG.trace("Retain assign regions=" + retain);
       }
       try {
-        acceptPlan(regions, balancer.retainAssignment(retainMap, servers));
+        acceptPlan(regions, balancer.retainAssignment(retain, servers));
       } catch (HBaseIOException e) {
         LOG.warn("unable to retain assignment", e);
-        addToPendingAssignment(regions, retainMap.keySet());
+        addToPendingAssignment(regions, retain.keySet());
       }
     }
 
@@ -1805,12 +1809,12 @@ public class AssignmentManager implements ServerListener {
     if (!hris.isEmpty()) {
       Collections.sort(hris, RegionInfo.COMPARATOR);
       if (isTraceEnabled) {
-        LOG.trace("round robin regions=" + hris);
+        LOG.trace("Round-robin regions=" + hris);
       }
       try {
         acceptPlan(regions, balancer.roundRobinAssignment(hris, servers));
       } catch (HBaseIOException e) {
-        LOG.warn("unable to round-robin assignment", e);
+        LOG.warn("Unable to round-robin assignment", e);
         addToPendingAssignment(regions, hris);
       }
     }
@@ -1822,7 +1826,7 @@ public class AssignmentManager implements ServerListener {
     final long st = System.currentTimeMillis();
 
     if (plan == null) {
-      throw new HBaseIOException("unable to compute plans for regions=" + regions.size());
+      throw new HBaseIOException("Unable to compute plans for " + regions.size() + " regions");
     }
 
     if (plan.isEmpty()) return;
@@ -1858,8 +1862,9 @@ public class AssignmentManager implements ServerListener {
   }
 
   /**
-   * Get a list of servers that this region cannot be assigned to.
-   * For system tables, we must assign them to a server with highest version.
+   * Get a list of servers that this region can NOT be assigned to.
+   * For system tables, we must assign them to a server with highest version (rolling upgrade
+   * scenario).
    */
   public List<ServerName> getExcludedServersForSystemTable() {
     // TODO: This should be a cached list kept by the ServerManager rather than calculated on each

http://git-wip-us.apache.org/repos/asf/hbase/blob/016cf0c6/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java
index 36f57f2..d4b6103 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java
@@ -1229,10 +1229,16 @@ public abstract class BaseLoadBalancer implements LoadBalancer {
     if (regions == null || regions.isEmpty()) {
       return assignments;
     }
+    if (!this.tablesOnMaster) {
+      // Make sure Master is not in set of possible servers.
+      if (servers != null && !servers.isEmpty()) {
+        servers.remove(this.masterServerName);
+      }
+    }
 
-    int numServers = servers == null ? 0 : servers.size();
+    int numServers = servers == null? 0: servers.size();
     if (numServers == 0) {
-      LOG.warn("Wanted to do round robin assignment but no servers to assign to");
+      LOG.warn("Wanted to round-robin assignment but no server(s) to assign to.");
       return null;
     }
 

http://git-wip-us.apache.org/repos/asf/hbase/blob/016cf0c6/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index 7415b77..f114e20 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -874,22 +874,15 @@ public class HRegionServer extends HasThread implements
       }
     }
 
-    // In case colocated master, wait here till it's active.
-    // So backup masters won't start as regionservers.
-    // This is to avoid showing backup masters as regionservers
-    // in master web UI, or assigning any region to them.
-    waitForMasterActive();
-    if (isStopped() || isAborted()) {
-      return; // No need for further initialization
-    }
-
-    // watch for snapshots and other procedures
-    try {
-      rspmHost = new RegionServerProcedureManagerHost();
-      rspmHost.loadProcedures(conf);
-      rspmHost.initialize(this);
-    } catch (KeeperException e) {
-      this.abort("Failed to reach coordination cluster when creating procedure handler.", e);
+    // Watch for snapshots and other procedures. Check we have not been stopped before proceeding.
+    if (keepLooping()) {
+      try {
+        rspmHost = new RegionServerProcedureManagerHost();
+        rspmHost.loadProcedures(conf);
+        rspmHost.initialize(this);
+      } catch (KeeperException e) {
+        this.abort("Failed setup of RegionServerProcedureManager.", e);
+      }
     }
   }
 
@@ -930,7 +923,10 @@ public class HRegionServer extends HasThread implements
     }
 
     try {
-      if (!isStopped() && !isAborted()) {
+      // If we are backup server instance, wait till we become active master before proceeding.
+      waitForMasterActive();
+
+      if (keepLooping()) {
         ShutdownHook.install(conf, fs, this, Thread.currentThread());
         // Initialize the RegionServerCoprocessorHost now that our ephemeral
         // node was created, in case any coprocessors want to use ZooKeeper
@@ -2139,7 +2135,7 @@ public class HRegionServer extends HasThread implements
    */
   public void stop(final String msg, final boolean force, final User user) {
     if (!this.stopped) {
-      LOG.info("***** STOPPING region server '" + this + "' *****");
+      LOG.info("STOPPING server '" + this);
       if (this.rsHost != null) {
         // when forced via abort don't allow CPs to override
         try {
@@ -2571,7 +2567,7 @@ public class HRegionServer extends HasThread implements
    * @return True if we should break loop because cluster is going down or
    * this server has been stopped or hdfs has gone bad.
    */
-  private boolean keepLooping() {
+  protected boolean keepLooping() {
     return !this.stopped && isClusterUp();
   }
 

http://git-wip-us.apache.org/repos/asf/hbase/blob/016cf0c6/hbase-server/src/test/java/org/apache/hadoop/hbase/TestClientClusterMetrics.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestClientClusterMetrics.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestClientClusterMetrics.java
index a2605f2..d44f212 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestClientClusterMetrics.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestClientClusterMetrics.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -20,6 +20,7 @@ package org.apache.hadoop.hbase;
 import java.io.IOException;
 import java.util.EnumSet;
 import java.util.List;
+import java.util.Map;
 import java.util.Optional;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -149,10 +150,7 @@ public class TestClientClusterMetrics {
     Assert.assertNotNull(metrics);
     // exclude a dead region server
     Assert.assertEquals(SLAVES -1, numRs);
-    // live servers = nums of regionservers
-    // By default, HMaster don't carry any regions so it won't report its load.
-    // Hence, it won't be in the server list.
-    Assert.assertEquals(numRs, metrics.getLiveServerMetrics().size());
+    Assert.assertEquals(numRs + 1 /*Master*/, metrics.getLiveServerMetrics().size());
     Assert.assertTrue(metrics.getRegionCount() > 0);
     Assert.assertNotNull(metrics.getDeadServerNames());
     Assert.assertEquals(1, metrics.getDeadServerNames().size());

http://git-wip-us.apache.org/repos/asf/hbase/blob/016cf0c6/hbase-server/src/test/java/org/apache/hadoop/hbase/TestClientClusterStatus.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestClientClusterStatus.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestClientClusterStatus.java
index 392ff6e..a70bc94 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestClientClusterStatus.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestClientClusterStatus.java
@@ -144,7 +144,7 @@ public class TestClientClusterStatus {
     // live servers = nums of regionservers
     // By default, HMaster don't carry any regions so it won't report its load.
     // Hence, it won't be in the server list.
-    Assert.assertEquals(status.getServers().size(), numRs);
+    Assert.assertEquals(status.getServers().size(), numRs + 1/*Master*/);
     Assert.assertTrue(status.getRegionsCount() > 0);
     Assert.assertNotNull(status.getDeadServerNames());
     Assert.assertEquals(1, status.getDeadServersSize());

http://git-wip-us.apache.org/repos/asf/hbase/blob/016cf0c6/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java
index aa3a20c..1e0c08d 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java
@@ -24,20 +24,21 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.CompatibilityFactory;
 import org.apache.hadoop.hbase.HBaseClassTestRule;
 import org.apache.hadoop.hbase.HBaseTestingUtility;
-import org.apache.hadoop.hbase.HColumnDescriptor;
 import org.apache.hadoop.hbase.HConstants;
-import org.apache.hadoop.hbase.HTableDescriptor;
 import org.apache.hadoop.hbase.MiniHBaseCluster;
 import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
+import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
 import org.apache.hadoop.hbase.client.Put;
 import org.apache.hadoop.hbase.client.Table;
+import org.apache.hadoop.hbase.client.TableDescriptor;
+import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
 import org.apache.hadoop.hbase.test.MetricsAssertHelper;
 import org.apache.hadoop.hbase.testclassification.MediumTests;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.ClassRule;
-import org.junit.Ignore;
 import org.junit.Rule;
 import org.junit.Test;
 import org.junit.experimental.categories.Category;
@@ -47,7 +48,6 @@ import org.slf4j.LoggerFactory;
 
 import static org.junit.Assert.fail;
 
-@Ignore // Disabled temporarily; reenable 
 @Category(MediumTests.class)
 public class TestAssignmentManagerMetrics {
 
@@ -83,10 +83,7 @@ public class TestAssignmentManagerMetrics {
     // set msgInterval to 1 second
     conf.setInt("hbase.regionserver.msginterval", msgInterval);
 
-    // set tablesOnMaster to none
-    conf.set("hbase.balancer.tablesOnMaster", "none");
-
-    // set client sync wait timeout to 5sec
+    // Set client sync wait timeout to 5sec
     conf.setInt("hbase.client.sync.wait.timeout.msec", 2500);
     conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 1);
     conf.setInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 2500);
@@ -132,27 +129,25 @@ public class TestAssignmentManagerMetrics {
           amSource);
 
       // alter table with a non-existing coprocessor
-      HTableDescriptor htd = new HTableDescriptor(TABLENAME);
-      HColumnDescriptor hcd = new HColumnDescriptor(FAMILY);
-
-      htd.addFamily(hcd);
-
-      String spec = "hdfs:///foo.jar|com.foo.FooRegionObserver|1001|arg1=1,arg2=2";
-      htd.addCoprocessorWithSpec(spec);
-
+      ColumnFamilyDescriptor hcd = ColumnFamilyDescriptorBuilder.newBuilder(FAMILY).build();
+      TableDescriptor htd = TableDescriptorBuilder.newBuilder(TABLENAME).addColumnFamily(hcd).
+          addCoprocessorWithSpec("hdfs:///foo.jar|com.foo.FooRegionObserver|1001|arg1=1,arg2=2").
+          build();
       try {
-        TEST_UTIL.getAdmin().modifyTable(TABLENAME, htd);
+        TEST_UTIL.getAdmin().modifyTable(htd);
         fail("Expected region failed to open");
       } catch (IOException e) {
-        // expected, the RS will crash and the assignment will spin forever waiting for a RS
-        // to assign the region. the region will not go to FAILED_OPEN because in this case
-        // we have just one RS and it will do one retry.
+        // Expected, the RS will crash and the assignment will spin forever waiting for a RS
+        // to assign the region.
+        LOG.info("Expected exception", e);
       }
 
       // Sleep 3 seconds, wait for doMetrics chore catching up
       Thread.sleep(msgInterval * 3);
-      metricsHelper.assertGauge(MetricsAssignmentManagerSource.RIT_COUNT_NAME, 1, amSource);
-      metricsHelper.assertGauge(MetricsAssignmentManagerSource.RIT_COUNT_OVER_THRESHOLD_NAME, 1,
+      // Two regions in RIT -- meta and the testRITAssignementManagerMetrics table region.
+      metricsHelper.assertGauge(MetricsAssignmentManagerSource.RIT_COUNT_NAME, 2, amSource);
+      // Both are over the threshold because no RegionServer to assign to.
+      metricsHelper.assertGauge(MetricsAssignmentManagerSource.RIT_COUNT_OVER_THRESHOLD_NAME, 2,
           amSource);
 
     } finally {

http://git-wip-us.apache.org/repos/asf/hbase/blob/016cf0c6/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestRegionsOnMasterOptions.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestRegionsOnMasterOptions.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestRegionsOnMasterOptions.java
index 9f551c8..77af68a 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestRegionsOnMasterOptions.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestRegionsOnMasterOptions.java
@@ -51,6 +51,7 @@ import org.slf4j.LoggerFactory;
  * It then does kill combinations to make sure the distribution is more than just for startup.
  * NOTE: Regions on Master does not work well. See HBASE-19828. Until addressed, disabling this
  * test.
+ * NOTE: System-tables only on Master doesn't work. TODO.
  */
 @Ignore
 @Category({MediumTests.class})
@@ -111,8 +112,8 @@ public class TestRegionsOnMasterOptions {
     checkBalance(0, rsCount);
   }
 
-  @Ignore // Fix this. The Master startup doesn't allow Master reporting as a RegionServer, not
-  // until way late after the Master startup finishes. Needs more work.
+  @Ignore // Needs a bunch of work. We need to assign meta first and do it ahead of all others.
+  // This special handling messes up being able to host system tables only on Master w/o hacks.
   @Test
   public void testSystemTablesOnMaster() throws Exception {
     c.setBoolean(LoadBalancer.TABLES_ON_MASTER, true);

http://git-wip-us.apache.org/repos/asf/hbase/blob/016cf0c6/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKUtil.java
----------------------------------------------------------------------
diff --git a/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKUtil.java b/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKUtil.java
index 7bb4752..37fc461 100644
--- a/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKUtil.java
+++ b/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKUtil.java
@@ -611,13 +611,13 @@ public final class ZKUtil {
       logRetrievedMsg(zkw, znode, data, false);
       return data;
     } catch (KeeperException.NoNodeException e) {
-      LOG.debug(zkw.prefix("Unable to get data of znode " + znode + " " +
-          "because node does not exist (not an error)"));
+      LOG.debug(zkw.prefix("failed to get data of " + znode + " " +
+          "; does not exist (not an error)"));
       return null;
     } catch (KeeperException e) {
-      LOG.warn(zkw.prefix("Unable to get data of znode " + znode), e);
-      zkw.keeperException(e);
-      return null;
+      LOG.debug(zkw.prefix("failed to get data of " + znode + "; " + e.getMessage()));
+      // Rethrow
+      throw e;
     }
   }