You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2018/10/05 00:19:30 UTC

hbase git commit: HBASE-21242 [amv2] Miscellaneous minor log and assign procedure create improvements

Repository: hbase
Updated Branches:
  refs/heads/branch-2.1 8fc90a23a -> 9d34b4581


HBASE-21242 [amv2] Miscellaneous minor log and assign procedure create improvements

For RIT Duration, do better than print ms/seconds. Remove redundant UI
column dedicated to duration when we log it in the status field too.

Make bypass log at INFO level.

Make it so on complete of subprocedure, we note count of outstanding
siblings so we have a clue how much further the parent has to go before
it is done (Helpful when hundreds of servers doing SCP).

Have the SCP run the AP preflight check before creating an AP; saves
creation of thousands of APs during fixup.

Don't log tablename three times when reporting remote call failed.

If lock is held already, note who has it. Also log after we get lock
or if we have to wait rather than log on entrance though we may
later have to wait (or we may have just picked up the lock).

Signed-off-by: Mike Drob <md...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/9d34b458
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/9d34b458
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/9d34b458

Branch: refs/heads/branch-2.1
Commit: 9d34b4581c485deb3a659c7a435d557494e61462
Parents: 8fc90a2
Author: Michael Stack <st...@apache.org>
Authored: Wed Sep 26 21:22:46 2018 -0700
Committer: Michael Stack <st...@apache.org>
Committed: Thu Oct 4 17:18:13 2018 -0700

----------------------------------------------------------------------
 .../apache/hadoop/hbase/master/RegionState.java |  6 +--
 .../hadoop/hbase/procedure2/Procedure.java      |  3 ++
 .../hbase/procedure2/ProcedureExecutor.java     | 30 +++++++++------
 .../master/AssignmentManagerStatusTmpl.jamon    |  3 +-
 .../master/assignment/AssignProcedure.java      | 40 ++++++++++++++++----
 .../assignment/RegionTransitionProcedure.java   |  3 +-
 .../master/assignment/UnassignProcedure.java    |  2 +-
 .../procedure/MasterProcedureScheduler.java     |  5 ++-
 .../master/procedure/ServerCrashProcedure.java  |  8 ++++
 9 files changed, 73 insertions(+), 27 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/9d34b458/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java
----------------------------------------------------------------------
diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java
index 7289ce8..a1e2ca6 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java
@@ -371,9 +371,9 @@ public class RegionState {
   public String toDescriptiveString() {
     long relTime = System.currentTimeMillis() - stamp;
     return hri.getRegionNameAsString()
-      + " state=" + state
-      + ", ts=" + new Date(stamp) + " (" + (relTime/1000) + "s ago)"
-      + ", server=" + serverName;
+      + " state=" + state + ", ts=" + new Date(stamp) + " (" +
+        java.time.Duration.ofMillis(relTime).toString() +
+        " ago), server=" + serverName;
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/hbase/blob/9d34b458/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java
----------------------------------------------------------------------
diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java
index 2b66961..01dc1be 100644
--- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java
+++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java
@@ -883,6 +883,9 @@ public abstract class Procedure<TEnvironment> implements Comparable<Procedure<TE
     return childrenLatch > 0;
   }
 
+  /**
+   * @return Count of children outstanding (Badly named).
+   */
   protected synchronized int getChildrenLatch() {
     return childrenLatch;
   }

http://git-wip-us.apache.org/repos/asf/hbase/blob/9d34b458/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java
----------------------------------------------------------------------
diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java
index 58e0610..cb191aa 100644
--- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java
+++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java
@@ -1015,21 +1015,21 @@ public class ProcedureExecutor<TEnvironment> {
       return false;
     }
 
-    LOG.debug("Begin bypass {} with lockWait={}, override={}, recursive={}",
+    LOG.info("Begin bypass {} with lockWait={}, override={}, recursive={}",
         procedure, lockWait, override, recursive);
     IdLock.Entry lockEntry = procExecutionLock.tryLockEntry(procedure.getProcId(), lockWait);
     if (lockEntry == null && !override) {
-      LOG.debug("Waited {} ms, but {} is still running, skipping bypass with force={}",
+      LOG.info("Waited {} ms, but {} is still running, skipping bypass with override={}",
           lockWait, procedure, override);
       return false;
     } else if (lockEntry == null) {
-      LOG.debug("Waited {} ms, but {} is still running, begin bypass with force={}",
+      LOG.info("Waited {} ms, but {} is still running, begin bypass with override={}",
           lockWait, procedure, override);
     }
     try {
       // check whether the procedure is already finished
       if (procedure.isFinished()) {
-        LOG.debug("{} is already finished, skipping bypass", procedure);
+        LOG.info("{} is already finished, skipping bypass", procedure);
         return false;
       }
 
@@ -1054,7 +1054,7 @@ public class ProcedureExecutor<TEnvironment> {
               }
             });
         } else {
-          LOG.debug("{} has children, skipping bypass", procedure);
+          LOG.info("{} has children, skipping bypass", procedure);
           return false;
         }
       }
@@ -1063,7 +1063,7 @@ public class ProcedureExecutor<TEnvironment> {
       if (procedure.hasParent() && procedure.getState() != ProcedureState.RUNNABLE
           && procedure.getState() != ProcedureState.WAITING
           && procedure.getState() != ProcedureState.WAITING_TIMEOUT) {
-        LOG.debug("Bypassing procedures in RUNNABLE, WAITING and WAITING_TIMEOUT states "
+        LOG.info("Bypassing procedures in RUNNABLE, WAITING and WAITING_TIMEOUT states "
                 + "(with no parent), {}",
             procedure);
         // Question: how is the bypass done here?
@@ -1075,7 +1075,7 @@ public class ProcedureExecutor<TEnvironment> {
       // finished yet
       Procedure current = procedure;
       while (current != null) {
-        LOG.debug("Bypassing {}", current);
+        LOG.info("Bypassing {}", current);
         current.bypass(getEnvironment());
         store.update(procedure);
         long parentID = current.getParentProcId();
@@ -1095,9 +1095,9 @@ public class ProcedureExecutor<TEnvironment> {
       if (lockEntry != null) {
         // add the procedure to run queue,
         scheduler.addFront(procedure);
-        LOG.debug("Bypassing {} and its ancestors successfully, adding to queue", procedure);
+        LOG.info("Bypassing {} and its ancestors successfully, adding to queue", procedure);
       } else {
-        LOG.debug("Bypassing {} and its ancestors successfully, but since it is already running, "
+        LOG.info("Bypassing {} and its ancestors successfully, but since it is already running, "
             + "skipping add to queue", procedure);
       }
       return true;
@@ -1507,9 +1507,17 @@ public class ProcedureExecutor<TEnvironment> {
       procStack.release(proc);
 
       if (proc.isSuccess()) {
-        // update metrics on finishing the procedure
+        // update metrics on finishing the procedure.
         proc.updateMetricsOnFinish(getEnvironment(), proc.elapsedTime(), true);
-        LOG.info("Finished " + proc + " in " + StringUtils.humanTimeDiff(proc.elapsedTime()));
+        // Print out count of outstanding siblings if this procedure has a parent.
+        Procedure<TEnvironment> parent = null;
+        if (proc.hasParent()) {
+          parent = procedures.get(proc.getParentProcId());
+        }
+        LOG.info("Finished {} in {}{}",
+            proc,
+            StringUtils.humanTimeDiff(proc.elapsedTime()),
+            parent != null? (", unfinishedSiblingCount=" + parent.getChildrenLatch()): "");
         // Finalize the procedure state
         if (proc.getProcId() == rootProcId) {
           procedureFinished(proc);

http://git-wip-us.apache.org/repos/asf/hbase/blob/9d34b458/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon
index bb3686b..51dd438 100644
--- a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon
+++ b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon
@@ -73,7 +73,7 @@ int numOfPages = (int) Math.ceil(numOfRITs * 1.0 / ritsPerPage);
              <div class="tab-pane" id="tab_rits<% (recordItr / ritsPerPage) + 1 %>">
                  </%if>
                  <table class="table table-striped" style="margin-bottom:0px;"><tr><th>Region</th>
-                     <th>State</th><th>RIT time (ms)</th> <th>Retries </th></tr>
+                     <th>State</th><th>Retries </th></tr>
              </%if>
 
              <%if ritStat.isRegionTwiceOverThreshold(rs.getRegion()) %>
@@ -96,7 +96,6 @@ int numOfPages = (int) Math.ceil(numOfRITs * 1.0 / ritsPerPage);
                         <td><% rs.getRegion().getEncodedName() %></td><td>
                         <% RegionInfoDisplay.getDescriptiveNameFromRegionStateForDisplay(rs,
                             assignmentManager.getConfiguration()) %></td>
-                        <td><% (currentTime - rs.getStamp()) %> </td>
                         <td> <% retryStatus %> </td>
                      </tr>
                      <%java recordItr++; %>

http://git-wip-us.apache.org/repos/asf/hbase/blob/9d34b458/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java
index ac4a67f..a0297b3 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java
@@ -26,7 +26,9 @@ import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.client.RegionInfo;
 import org.apache.hadoop.hbase.client.RetriesExhaustedException;
+import org.apache.hadoop.hbase.client.TableState;
 import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
+import org.apache.hadoop.hbase.master.MasterServices;
 import org.apache.hadoop.hbase.master.RegionState.State;
 import org.apache.hadoop.hbase.master.TableStateManager;
 import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode;
@@ -164,19 +166,43 @@ public class AssignProcedure extends RegionTransitionProcedure {
     }
   }
 
-  @Override
-  protected boolean startTransition(final MasterProcedureEnv env, final RegionStateNode regionNode)
+  /**
+   * Used by ServerCrashProcedure too skip creating Assigns if not needed.
+   * @return Skip out on the assign; returns 'true'/assign if exception.
+   */
+  public static boolean assign(MasterServices masterServices, RegionInfo ri) {
+    try {
+      return assign(masterServices,
+          masterServices.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(ri));
+    } catch (IOException e) {
+      LOG.warn("Letting assign proceed", e);
+    }
+    return true;
+  }
+
+  protected static boolean assign(MasterServices masterServices, final RegionStateNode regionNode)
       throws IOException {
     // If the region is already open we can't do much...
-    if (regionNode.isInState(State.OPEN) && isServerOnline(env, regionNode)) {
-      LOG.info("Assigned, not reassigning; " + this + "; " + regionNode.toShortString());
+    if (regionNode.isInState(State.OPEN) &&
+        masterServices.getServerManager().isServerOnline(regionNode.getRegionLocation())) {
+      LOG.info("Assigned, not reassigning {}", regionNode.toShortString());
       return false;
     }
     // Don't assign if table is in disabling or disabled state.
-    TableStateManager tsm = env.getMasterServices().getTableStateManager();
+    TableStateManager tsm = masterServices.getTableStateManager();
     TableName tn = regionNode.getRegionInfo().getTable();
-    if (tsm.getTableState(tn).isDisabledOrDisabling()) {
-      LOG.info("Table " + tn + " state=" + tsm.getTableState(tn) + ", skipping " + this);
+    TableState ts = tsm.getTableState(tn);
+    if (ts.isDisabledOrDisabling()) {
+      LOG.info("{} so SKIPPING assign of {}", ts, regionNode.getRegionInfo().getEncodedName());
+      return false;
+    }
+    return true;
+  }
+
+  @Override
+  protected boolean startTransition(final MasterProcedureEnv env, final RegionStateNode regionNode)
+      throws IOException {
+    if (!assign(env.getMasterServices(), regionNode)) {
       return false;
     }
     // If the region is SPLIT, we can't assign it. But state might be CLOSED, rather than

http://git-wip-us.apache.org/repos/asf/hbase/blob/9d34b458/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
index e80cdf3..9f1dd38 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
@@ -241,8 +241,7 @@ public abstract class RegionTransitionProcedure
   public synchronized void remoteCallFailed(final MasterProcedureEnv env,
       final ServerName serverName, final IOException exception) {
     final RegionStateNode regionNode = getRegionState(env);
-    LOG.warn("Remote call failed {}; {}; {}; exception={}", serverName,
-        this, regionNode.toShortString(), exception.getClass().getSimpleName(), exception);
+    LOG.warn("Remote call failed {}; {}", regionNode.toShortString(), this, exception);
     if (remoteCallFailed(env, regionNode, exception)) {
       // NOTE: This call to wakeEvent puts this Procedure back on the scheduler.
       // Thereafter, another Worker can be in here so DO NOT MESS WITH STATE beyond

http://git-wip-us.apache.org/repos/asf/hbase/blob/9d34b458/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
index b001885..8a391da 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
@@ -306,7 +306,7 @@ public class UnassignProcedure extends RegionTransitionProcedure {
           exception.getClass().getSimpleName());
       if (!env.getMasterServices().getServerManager().expireServer(serverName)) {
         // Failed to queue an expire. Lots of possible reasons including it may be already expired.
-        // In ServerCrashProcedure and RecoverMetaProcedure, there is a handleRIT stage where we
+        // In ServerCrashProcedure, there is a handleRIT stage where we
         // will iterator over all the RIT procedures for the related regions of a crashed RS and
         // fail them with ServerCrashException. You can see the isSafeToProceed method above for
         // more details.

http://git-wip-us.apache.org/repos/asf/hbase/blob/9d34b458/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java
index 12b4267..02d8b10 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java
@@ -686,7 +686,6 @@ public class MasterProcedureScheduler extends AbstractProcedureScheduler {
       boolean hasLock = true;
       final LockAndQueue[] regionLocks = new LockAndQueue[regionInfo.length];
       for (int i = 0; i < regionInfo.length; ++i) {
-        LOG.info("{} checking lock on {}", procedure, regionInfo[i].getEncodedName());
         assert regionInfo[i] != null;
         assert regionInfo[i].getTable() != null;
         assert regionInfo[i].getTable().equals(table): regionInfo[i] + " " + procedure;
@@ -694,12 +693,16 @@ public class MasterProcedureScheduler extends AbstractProcedureScheduler {
 
         regionLocks[i] = locking.getRegionLock(regionInfo[i].getEncodedName());
         if (!regionLocks[i].tryExclusiveLock(procedure)) {
+          LOG.info("Waiting xlock for {} held by pid={}", procedure,
+              regionLocks[i].getExclusiveLockProcIdOwner());
           waitProcedure(regionLocks[i], procedure);
           hasLock = false;
           while (i-- > 0) {
             regionLocks[i].releaseExclusiveLock(procedure);
           }
           break;
+        } else {
+          LOG.info("xlock for {}", procedure);
         }
       }
 

http://git-wip-us.apache.org/repos/asf/hbase/blob/9d34b458/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
index 775c8c2..6ec0079 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
@@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.client.RegionInfoBuilder;
 import org.apache.hadoop.hbase.client.RegionReplicaUtil;
 import org.apache.hadoop.hbase.master.MasterServices;
 import org.apache.hadoop.hbase.master.MasterWalManager;
+import org.apache.hadoop.hbase.master.assignment.AssignProcedure;
 import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
 import org.apache.hadoop.hbase.master.assignment.RegionTransitionProcedure;
 import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
@@ -176,6 +177,13 @@ public class ServerCrashProcedure
             // it does the check by calling am#isLogSplittingDone.
             List<RegionInfo> toAssign = handleRIT(env, regionsOnCrashedServer);
             AssignmentManager am = env.getAssignmentManager();
+            // Do not create assigns for Regions on disabling or disabled Tables.
+            // We do this inside in the AssignProcedure.
+            int size = toAssign.size();
+            if (toAssign.removeIf(r -> !AssignProcedure.assign(env.getMasterServices(), r))) {
+              LOG.debug("Dropped {} assigns because against disabling/disabled tables",
+                  size - toAssign.size());
+            }
             // CreateAssignProcedure will try to use the old location for the region deploy.
             addChildProcedure(am.createAssignProcedures(toAssign));
             setNextState(ServerCrashState.SERVER_CRASH_HANDLE_RIT2);