You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by vj...@apache.org on 2023/03/21 05:37:17 UTC

[hbase] branch branch-2.5 updated: HBASE-27684: add client metrics related to user region lock. (#5081)

This is an automated email from the ASF dual-hosted git repository.

vjasani pushed a commit to branch branch-2.5
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2.5 by this push:
     new 56613517fba HBASE-27684: add client metrics related to user region lock. (#5081)
56613517fba is described below

commit 56613517fba124ee36ca5096e5408e1cbe77114b
Author: Victor <vl...@gmail.com>
AuthorDate: Mon Mar 20 22:36:08 2023 -0700

    HBASE-27684: add client metrics related to user region lock. (#5081)
    
    Signed-off-by: Andrew Purtell <ap...@apache.org>
    Signed-off-by: David Manning <da...@salesforce.com>
    Signed-off-by: Rushabh Shah <sh...@apache.org>
    Signed-off-by: Tanuj Khurana <tk...@apache.org>
---
 .../hbase/client/ConnectionImplementation.java     | 15 +++++++
 .../hadoop/hbase/client/MetricsConnection.java     | 48 ++++++++++++++++++++++
 .../apache/hadoop/hbase/client/TestMetaCache.java  | 23 +++++++++++
 3 files changed, 86 insertions(+)

diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionImplementation.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionImplementation.java
index 0c529fb62aa..b60da28c657 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionImplementation.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionImplementation.java
@@ -994,6 +994,7 @@ public class ConnectionImplementation implements ClusterConnection, Closeable {
       // Query the meta region
       long pauseBase = connectionConfig.getPauseMillis();
       takeUserRegionLock();
+      final long lockStartTime = EnvironmentEdgeManager.currentTime();
       try {
         // We don't need to check if useCache is enabled or not. Even if useCache is false
         // we already cleared the cache for this row before acquiring userRegion lock so if this
@@ -1104,6 +1105,10 @@ public class ConnectionImplementation implements ClusterConnection, Closeable {
         }
       } finally {
         userRegionLock.unlock();
+        // update duration of the lock being held
+        if (metrics != null) {
+          metrics.updateUserRegionLockHeld(EnvironmentEdgeManager.currentTime() - lockStartTime);
+        }
       }
       try {
         Thread.sleep(ConnectionUtils.getPauseTime(pauseBase, tries));
@@ -1117,9 +1122,19 @@ public class ConnectionImplementation implements ClusterConnection, Closeable {
   void takeUserRegionLock() throws IOException {
     try {
       long waitTime = connectionConfig.getMetaOperationTimeout();
+      if (metrics != null) {
+        metrics.updateUserRegionLockQueue(userRegionLock.getQueueLength());
+      }
+      final long waitStartTime = EnvironmentEdgeManager.currentTime();
       if (!userRegionLock.tryLock(waitTime, TimeUnit.MILLISECONDS)) {
+        if (metrics != null) {
+          metrics.incrUserRegionLockTimeout();
+        }
         throw new LockTimeoutException("Failed to get user region lock in" + waitTime + " ms. "
           + " for accessing meta region server.");
+      } else if (metrics != null) {
+        // successfully grabbed the lock, start timer of holding the lock
+        metrics.updateUserRegionLockWaiting(EnvironmentEdgeManager.currentTime() - waitStartTime);
       }
     } catch (InterruptedException ie) {
       LOG.error("Interrupted while waiting for a lock", ie);
diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetricsConnection.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetricsConnection.java
index 923114d61ad..1d337372927 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetricsConnection.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetricsConnection.java
@@ -359,6 +359,10 @@ public final class MetricsConnection implements StatisticTrackable {
   private final Counter nsLookups;
   private final Counter nsLookupsFailed;
   private final Timer overloadedBackoffTimer;
+  private final Counter userRegionLockTimeoutCount;
+  private final Timer userRegionLockWaitingTimer;
+  private final Timer userRegionLockHeldTimer;
+  private final Histogram userRegionLockQueueHist;
 
   // dynamic metrics
 
@@ -443,6 +447,15 @@ public final class MetricsConnection implements StatisticTrackable {
     this.nsLookups = registry.counter(name(this.getClass(), NS_LOOKUPS, scope));
     this.nsLookupsFailed = registry.counter(name(this.getClass(), NS_LOOKUPS_FAILED, scope));
 
+    this.userRegionLockTimeoutCount =
+      registry.counter(name(this.getClass(), "userRegionLockTimeoutCount", scope));
+    this.userRegionLockWaitingTimer =
+      registry.timer(name(this.getClass(), "userRegionLockWaitingDuration", scope));
+    this.userRegionLockHeldTimer =
+      registry.timer(name(this.getClass(), "userRegionLockHeldDuration", scope));
+    this.userRegionLockQueueHist =
+      registry.histogram(name(MetricsConnection.class, "userRegionLockQueueLength", scope));
+
     this.overloadedBackoffTimer =
       registry.timer(name(this.getClass(), "overloadedBackoffDurationMs", scope));
 
@@ -602,6 +615,41 @@ public final class MetricsConnection implements StatisticTrackable {
     overloadedBackoffTimer.update(time, timeUnit);
   }
 
+  /** incr */
+  public void incrUserRegionLockTimeout() {
+    userRegionLockTimeoutCount.inc();
+  }
+
+  /** get */
+  public Counter getUserRegionLockTimeout() {
+    return userRegionLockTimeoutCount;
+  }
+
+  public Timer getUserRegionLockWaitingTimer() {
+    return userRegionLockWaitingTimer;
+  }
+
+  public Timer getUserRegionLockHeldTimer() {
+    return userRegionLockHeldTimer;
+  }
+
+  public Histogram getUserRegionLockQueue() {
+    return userRegionLockQueueHist;
+  }
+
+  /** update */
+  public void updateUserRegionLockWaiting(long duration) {
+    userRegionLockWaitingTimer.update(duration, TimeUnit.MILLISECONDS);
+  }
+
+  public void updateUserRegionLockHeld(long duration) {
+    userRegionLockHeldTimer.update(duration, TimeUnit.MILLISECONDS);
+  }
+
+  public void updateUserRegionLockQueue(int count) {
+    userRegionLockQueueHist.update(count);
+  }
+
   /** Return the connection count of the metrics within a scope */
   public long getConnectionCount() {
     return connectionCount.getCount();
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaCache.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaCache.java
index b7f520655c0..e79f0dc8e5d 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaCache.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaCache.java
@@ -563,6 +563,7 @@ public class TestMetaCache {
     conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 0);
     conf.setLong(HConstants.HBASE_CLIENT_META_OPERATION_TIMEOUT, 2000);
     conf.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 2000);
+    conf.setBoolean(MetricsConnection.CLIENT_SIDE_METRICS_ENABLED_KEY, true);
 
     try (ConnectionImplementation conn =
       (ConnectionImplementation) ConnectionFactory.createConnection(conf)) {
@@ -587,6 +588,28 @@ public class TestMetaCache {
 
       assertTrue(client1.getException() instanceof LockTimeoutException
         ^ client2.getException() instanceof LockTimeoutException);
+
+      // obtain the client metrics
+      MetricsConnection metrics = conn.getConnectionMetrics();
+      long queueCount = metrics.getUserRegionLockQueue().getCount();
+      assertEquals("Queue of userRegionLock should be updated twice. queueCount: " + queueCount,
+        queueCount, 2);
+
+      long timeoutCount = metrics.getUserRegionLockTimeout().getCount();
+      assertEquals("Timeout of userRegionLock should happen once. timeoutCount: " + timeoutCount,
+        timeoutCount, 1);
+
+      long waitingTimerCount = metrics.getUserRegionLockWaitingTimer().getCount();
+      assertEquals("userRegionLock should be grabbed successfully once. waitingTimerCount: "
+        + waitingTimerCount, waitingTimerCount, 1);
+
+      long heldTimerCount = metrics.getUserRegionLockHeldTimer().getCount();
+      assertEquals(
+        "userRegionLock should be held successfully once. heldTimerCount: " + heldTimerCount,
+        heldTimerCount, 1);
+      double heldTime = metrics.getUserRegionLockHeldTimer().getSnapshot().getMax();
+      assertTrue("Max held time should be greater than 2 seconds. heldTime: " + heldTime,
+        heldTime >= 2E9);
     }
   }