You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by se...@apache.org on 2019/06/10 23:15:26 UTC

[hbase] branch branch-2 updated: HBASE-22408 add dead and unknown server open regions metric to AM

This is an automated email from the ASF dual-hosted git repository.

sershe pushed a commit to branch branch-2
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2 by this push:
     new 89ea709  HBASE-22408 add dead and unknown server open regions metric to AM
89ea709 is described below

commit 89ea709cba0fe3f6090748f6b0b9d726d03493ba
Author: Sergey Shelukhin <se...@apache.org>
AuthorDate: Mon Jun 10 15:43:27 2019 -0700

    HBASE-22408 add dead and unknown server open regions metric to AM
    
    Signed-off-by: Duo Zhang <zh...@apache.org>
---
 .../master/MetricsAssignmentManagerSource.java     |  6 ++
 .../master/MetricsAssignmentManagerSourceImpl.java | 14 ++++
 .../hbase/master/MetricsAssignmentManager.java     |  8 ++
 .../apache/hadoop/hbase/master/ServerManager.java  | 17 ++++-
 .../hbase/master/assignment/AssignmentManager.java | 89 +++++++++++++++++++++-
 .../hbase/master/assignment/RegionStates.java      |  8 +-
 6 files changed, 137 insertions(+), 5 deletions(-)

diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSource.java
index e714fd1..822e407 100644
--- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSource.java
+++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSource.java
@@ -50,6 +50,8 @@ public interface MetricsAssignmentManagerSource extends BaseSource {
   String RIT_COUNT_OVER_THRESHOLD_NAME = "ritCountOverThreshold";
   String RIT_OLDEST_AGE_NAME = "ritOldestAge";
   String RIT_DURATION_NAME = "ritDuration";
+  String DEAD_SERVER_OPEN_REGIONS = "deadServerOpenRegions";
+  String UNKNOWN_SERVER_OPEN_REGIONS = "unknownServerOpenRegions";
 
   String RIT_COUNT_DESC = "Current number of Regions In Transition (Gauge).";
   String RIT_COUNT_OVER_THRESHOLD_DESC =
@@ -92,6 +94,10 @@ public interface MetricsAssignmentManagerSource extends BaseSource {
 
   void updateRitDuration(long duration);
 
+  void updateDeadServerOpenRegions(int deadRegions);
+
+  void updateUnknownServerOpenRegions(int unknownRegions);
+
   /**
    * TODO: Remove. This may not be needed now as assign and unassign counts are tracked separately
    * Increment the count of operations (assign/unassign).
diff --git a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSourceImpl.java b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSourceImpl.java
index ad692f2..d623de9 100644
--- a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSourceImpl.java
+++ b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSourceImpl.java
@@ -34,6 +34,8 @@ public class MetricsAssignmentManagerSourceImpl
   private MutableGaugeLong ritCountOverThresholdGauge;
   private MutableGaugeLong ritOldestAgeGauge;
   private MetricHistogram ritDurationHisto;
+  private MutableGaugeLong deadServerOpenRegions;
+  private MutableGaugeLong unknownServerOpenRegions;
 
   private MutableFastCounter operationCounter;
 
@@ -63,6 +65,8 @@ public class MetricsAssignmentManagerSourceImpl
     ritOldestAgeGauge = metricsRegistry.newGauge(RIT_OLDEST_AGE_NAME, RIT_OLDEST_AGE_DESC, 0L);
     ritDurationHisto = metricsRegistry.newTimeHistogram(RIT_DURATION_NAME, RIT_DURATION_DESC);
     operationCounter = metricsRegistry.getCounter(OPERATION_COUNT_NAME, 0L);
+    deadServerOpenRegions = metricsRegistry.newGauge(DEAD_SERVER_OPEN_REGIONS, "", 0);
+    unknownServerOpenRegions = metricsRegistry.newGauge(UNKNOWN_SERVER_OPEN_REGIONS, "", 0);
 
     /**
      * NOTE: Please refer to HBASE-9774 and HBASE-14282. Based on these two issues, HBase is
@@ -105,6 +109,16 @@ public class MetricsAssignmentManagerSourceImpl
   }
 
   @Override
+  public void updateDeadServerOpenRegions(int deadRegions) {
+    deadServerOpenRegions.set(deadRegions);
+  }
+
+  @Override
+  public void updateUnknownServerOpenRegions(int unknownRegions) {
+    unknownServerOpenRegions.set(unknownRegions);
+  }
+
+  @Override
   public OperationMetrics getAssignMetrics() {
     return assignMetrics;
   }
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManager.java
index 8b214f8..38aeef2 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManager.java
@@ -96,6 +96,14 @@ public class MetricsAssignmentManager {
     assignmentManagerSource.incrementOperationCounter();
   }
 
+  public void updateDeadServerOpenRegions(int deadRegions) {
+    assignmentManagerSource.updateDeadServerOpenRegions(deadRegions);
+  }
+
+  public void updateUnknownServerOpenRegions(int unknownRegions) {
+    assignmentManagerSource.updateUnknownServerOpenRegions(unknownRegions);
+  }
+
   /**
    * @return Set of common metrics for assign procedure
    */
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index 1ddfa9b..82d7ab8 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -574,8 +574,9 @@ public class ServerManager {
     }
   }
 
+  // Note: this is currently invoked from RPC, not just tests. Locking in this class needs cleanup.
   @VisibleForTesting
-  public void moveFromOnlineToDeadServers(final ServerName sn) {
+  public synchronized void moveFromOnlineToDeadServers(final ServerName sn) {
     synchronized (onlineServers) {
       if (!this.onlineServers.containsKey(sn)) {
         LOG.trace("Expiration of {} but server not online", sn);
@@ -861,6 +862,20 @@ public class ServerManager {
     return serverName != null && onlineServers.containsKey(serverName);
   }
 
+  public enum ServerLiveState {
+    LIVE,
+    DEAD,
+    UNKNOWN
+  }
+
+  /**
+   * @return whether the server is online, dead, or unknown.
+   */
+  public synchronized ServerLiveState isServerKnownAndOnline(ServerName serverName) {
+    return onlineServers.containsKey(serverName) ? ServerLiveState.LIVE
+      : (deadservers.isDeadServer(serverName) ? ServerLiveState.DEAD : ServerLiveState.UNKNOWN);
+  }
+
   /**
    * Check if a server is known to be dead.  A server can be online,
    * or known to be dead, or unknown to this manager (i.e, not online,
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index 1dc6c18..5ad3ba4 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -22,6 +22,7 @@ import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -53,6 +54,7 @@ import org.apache.hadoop.hbase.master.MetricsAssignmentManager;
 import org.apache.hadoop.hbase.master.RegionPlan;
 import org.apache.hadoop.hbase.master.RegionState;
 import org.apache.hadoop.hbase.master.RegionState.State;
+import org.apache.hadoop.hbase.master.ServerManager;
 import org.apache.hadoop.hbase.master.TableStateManager;
 import org.apache.hadoop.hbase.master.balancer.FavoredStochasticBalancer;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
@@ -127,6 +129,10 @@ public class AssignmentManager {
       "hbase.assignment.rit.chore.interval.msec";
   private static final int DEFAULT_RIT_CHORE_INTERVAL_MSEC = 60 * 1000;
 
+  public static final String DEAD_REGION_METRIC_CHORE_INTERVAL_MSEC_CONF_KEY =
+      "hbase.assignment.dead.region.metric.chore.interval.msec";
+  private static final int DEFAULT_DEAD_REGION_METRIC_CHORE_INTERVAL_MSEC = 120 * 1000;
+
   public static final String ASSIGN_MAX_ATTEMPTS =
       "hbase.assignment.maximum.attempts";
   private static final int DEFAULT_ASSIGN_MAX_ATTEMPTS = Integer.MAX_VALUE;
@@ -145,6 +151,7 @@ public class AssignmentManager {
 
   private final MetricsAssignmentManager metrics;
   private final RegionInTransitionChore ritChore;
+  private final DeadServerMetricRegionChore deadMetricChore;
   private final MasterServices master;
 
   private final AtomicBoolean running = new AtomicBoolean(false);
@@ -190,6 +197,14 @@ public class AssignmentManager {
     int ritChoreInterval = conf.getInt(RIT_CHORE_INTERVAL_MSEC_CONF_KEY,
         DEFAULT_RIT_CHORE_INTERVAL_MSEC);
     this.ritChore = new RegionInTransitionChore(ritChoreInterval);
+
+    int deadRegionChoreInterval = conf.getInt(DEAD_REGION_METRIC_CHORE_INTERVAL_MSEC_CONF_KEY,
+        DEFAULT_DEAD_REGION_METRIC_CHORE_INTERVAL_MSEC);
+    if (deadRegionChoreInterval > 0) {
+      this.deadMetricChore = new DeadServerMetricRegionChore(deadRegionChoreInterval);
+    } else {
+      this.deadMetricChore = null;
+    }
   }
 
   public void start() throws IOException, KeeperException {
@@ -271,6 +286,9 @@ public class AssignmentManager {
     // Remove the RIT chore
     if (hasProcExecutor) {
       master.getMasterProcedureExecutor().removeChore(this.ritChore);
+      if (this.deadMetricChore != null) {
+        master.getMasterProcedureExecutor().removeChore(this.deadMetricChore);
+      }
     }
 
     // Stop the Assignment Thread
@@ -1130,6 +1148,69 @@ public class AssignmentManager {
     }
   }
 
+  private static class DeadServerMetricRegionChore
+      extends ProcedureInMemoryChore<MasterProcedureEnv> {
+    public DeadServerMetricRegionChore(final int timeoutMsec) {
+      super(timeoutMsec);
+    }
+
+    @Override
+    protected void periodicExecute(final MasterProcedureEnv env) {
+      final ServerManager sm = env.getMasterServices().getServerManager();
+      final AssignmentManager am = env.getAssignmentManager();
+      // To minimize inconsistencies we are not going to snapshot live servers in advance in case
+      // new servers are added; OTOH we don't want to add heavy sync for a consistent view since
+      // this is for metrics. Instead, we're going to check each region as we go; to avoid making
+      // too many checks, we maintain a local set of live servers, limiting us to false negatives.
+      // If we miss some recently-dead server, we'll just see it next time.
+      Set<ServerName> recentlyLiveServers = new HashSet<>();
+      int deadRegions = 0, unknownRegions = 0;
+      for (RegionStateNode rsn : am.getRegionStates().getRegionStateNodes()) {
+        if (rsn.getState() != State.OPEN) {
+          continue; // Opportunistic check, should quickly skip RITs, offline tables, etc.
+        }
+        ServerName sn;
+        State state;
+        rsn.lock();
+        try {
+          sn = rsn.getRegionLocation();
+          state = rsn.getState();
+        } finally {
+          rsn.unlock();
+        }
+        if (state != State.OPEN) {
+          continue; // Mostly skipping RITs that are already being taken care of.
+        }
+        if (sn == null) {
+          ++unknownRegions; // Opened on null?
+          continue;
+        }
+        if (recentlyLiveServers.contains(sn)) {
+          continue;
+        }
+        ServerManager.ServerLiveState sls = sm.isServerKnownAndOnline(sn);
+        switch (sls) {
+          case LIVE:
+            recentlyLiveServers.add(sn);
+            break;
+          case DEAD:
+            ++deadRegions;
+            break;
+          case UNKNOWN:
+            ++unknownRegions;
+            break;
+          default: throw new AssertionError("Unexpected " + sls);
+        }
+      }
+      if (deadRegions > 0 || unknownRegions > 0) {
+        LOG.info("Found {} OPEN regions on dead servers and {} OPEN regions on unknown servers",
+          deadRegions, unknownRegions);
+      }
+
+      am.updateDeadServerRegionMetrics(deadRegions, unknownRegions);
+    }
+  }
+
   public RegionInTransitionStat computeRegionInTransitionStat() {
     final RegionInTransitionStat rit = new RegionInTransitionStat(getConfiguration());
     rit.update(this);
@@ -1230,6 +1311,11 @@ public class AssignmentManager {
     metrics.updateRITCountOverThreshold(ritStat.getTotalRITsOverThreshold());
   }
 
+  private void updateDeadServerRegionMetrics(int deadRegions, int unknownRegions) {
+    metrics.updateDeadServerOpenRegions(deadRegions);
+    metrics.updateUnknownServerOpenRegions(unknownRegions);
+  }
+
   private void handleRegionOverStuckWarningThreshold(final RegionInfo regionInfo) {
     final RegionStateNode regionNode = regionStates.getRegionStateNode(regionInfo);
     //if (regionNode.isStuck()) {
@@ -1255,8 +1341,9 @@ public class AssignmentManager {
     }
     LOG.info("Number of RegionServers={}", master.getServerManager().countOfRegionServers());
 
-    // Start the RIT chore
+    // Start the chores. NOTE(review): deadMetricChore is null if its interval is <= 0; confirm.
     master.getMasterProcedureExecutor().addChore(this.ritChore);
+    master.getMasterProcedureExecutor().addChore(this.deadMetricChore);
 
     long costMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTime);
     LOG.info("Joined the cluster in {}", StringUtils.humanTimeDiff(costMs));
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java
index 4728d1f..0efeb32 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java
@@ -178,12 +178,14 @@ public class RegionStates {
     return regions;
   }
 
-  Collection<RegionStateNode> getRegionStateNodes() {
-    return regionsMap.values();
+  /** @return An unmodifiable view of region state nodes for all the regions. */
+  public Collection<RegionStateNode> getRegionStateNodes() {
+    return Collections.unmodifiableCollection(regionsMap.values());
   }
 
+  /** @return A snapshot of region states for all the regions. */
   public ArrayList<RegionState> getRegionStates() {
-    final ArrayList<RegionState> regions = new ArrayList<RegionState>(regionsMap.size());
+    final ArrayList<RegionState> regions = new ArrayList<>(regionsMap.size());
     for (RegionStateNode node: regionsMap.values()) {
       regions.add(node.toRegionState());
     }