You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ap...@apache.org on 2018/10/02 01:51:23 UTC

[2/5] hbase git commit: HBASE-18549 Add metrics for failed replication queue recovery

HBASE-18549 Add metrics for failed replication queue recovery

Signed-off-by: Andrew Purtell <ap...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/8a5537b5
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/8a5537b5
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/8a5537b5

Branch: refs/heads/branch-2
Commit: 8a5537b5f5a8eed1fd94830672b7df299a5e4d4f
Parents: 79ec1a1
Author: Xu Cang <xc...@salesforce.com>
Authored: Wed Aug 29 15:49:51 2018 -0700
Committer: Andrew Purtell <ap...@apache.org>
Committed: Mon Oct 1 18:39:03 2018 -0700

----------------------------------------------------------------------
 .../regionserver/MetricsReplicationSourceSource.java  |  2 ++
 .../MetricsReplicationGlobalSourceSource.java         |  8 +++++++-
 .../MetricsReplicationSourceSourceImpl.java           |  3 +++
 .../hbase/replication/ReplicationQueueStorage.java    |  7 +++++++
 .../hbase/replication/ZKReplicationQueueStorage.java  |  3 ++-
 .../hbase/replication/regionserver/MetricsSource.java |  4 ++++
 .../regionserver/ReplicationSourceManager.java        | 10 +++++++++-
 .../hbase/replication/TestReplicationEndpoint.java    | 14 +++++++++++++-
 8 files changed, 47 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/8a5537b5/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java
----------------------------------------------------------------------
diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java
index f56139b..1a17f37 100644
--- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java
+++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java
@@ -51,6 +51,7 @@ public interface MetricsReplicationSourceSource extends BaseSource {
   public static final String SOURCE_REPEATED_LOG_FILE_BYTES = "source.repeatedLogFileBytes";
   public static final String SOURCE_COMPLETED_LOGS = "source.completedLogs";
   public static final String SOURCE_COMPLETED_RECOVERY_QUEUES = "source.completedRecoverQueues";
+  public static final String SOURCE_FAILED_RECOVERY_QUEUES = "source.failedRecoverQueues";
 
   void setLastShippedAge(long age);
   void incrSizeOfLogQueue(int size);
@@ -74,4 +75,5 @@ public interface MetricsReplicationSourceSource extends BaseSource {
   void incrRepeatedFileBytes(final long bytes);
   void incrCompletedWAL();
   void incrCompletedRecoveryQueue();
+  void incrFailedRecoveryQueue();
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/8a5537b5/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationGlobalSourceSource.java
----------------------------------------------------------------------
diff --git a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationGlobalSourceSource.java b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationGlobalSourceSource.java
index 9a86cf2..4e8c810 100644
--- a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationGlobalSourceSource.java
+++ b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationGlobalSourceSource.java
@@ -52,6 +52,7 @@ public class MetricsReplicationGlobalSourceSource implements MetricsReplicationS
   private final MutableFastCounter repeatedFileBytes;
   private final MutableFastCounter completedWAL;
   private final MutableFastCounter completedRecoveryQueue;
+  private final MutableFastCounter failedRecoveryQueue;
 
   public MetricsReplicationGlobalSourceSource(MetricsReplicationSourceImpl rms) {
     this.rms = rms;
@@ -89,6 +90,8 @@ public class MetricsReplicationGlobalSourceSource implements MetricsReplicationS
     completedWAL = rms.getMetricsRegistry().getCounter(SOURCE_COMPLETED_LOGS, 0L);
     completedRecoveryQueue = rms.getMetricsRegistry()
             .getCounter(SOURCE_COMPLETED_RECOVERY_QUEUES, 0L);
+    failedRecoveryQueue = rms.getMetricsRegistry()
+            .getCounter(SOURCE_FAILED_RECOVERY_QUEUES, 0L);
   }
 
   @Override public void setLastShippedAge(long age) {
@@ -199,7 +202,10 @@ public class MetricsReplicationGlobalSourceSource implements MetricsReplicationS
   public void incrCompletedRecoveryQueue() {
     completedRecoveryQueue.incr(1L);
   }
-
+  @Override
+  public void incrFailedRecoveryQueue() {
+    failedRecoveryQueue.incr(1L);
+  }
   @Override
   public void init() {
     rms.init();

http://git-wip-us.apache.org/repos/asf/hbase/blob/8a5537b5/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSourceImpl.java
----------------------------------------------------------------------
diff --git a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSourceImpl.java b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSourceImpl.java
index 719c916..0ad5052 100644
--- a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSourceImpl.java
+++ b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSourceImpl.java
@@ -258,6 +258,9 @@ public class MetricsReplicationSourceSourceImpl implements MetricsReplicationSou
   }
 
   @Override
+  public void incrFailedRecoveryQueue() {/*no op*/}
+
+  @Override
   public void init() {
     rms.init();
   }

http://git-wip-us.apache.org/repos/asf/hbase/blob/8a5537b5/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java
----------------------------------------------------------------------
diff --git a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java
index 84653ad..59278e9 100644
--- a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java
+++ b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java
@@ -202,4 +202,11 @@ public interface ReplicationQueueStorage {
    * created hfile references during the call may not be included.
    */
   Set<String> getAllHFileRefs() throws ReplicationException;
+
+  /**
+   * Get full znode name for given region server
+   * @param serverName the name of the region server
+   * @return full znode name
+   */
+  String getRsNode(ServerName serverName);
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/8a5537b5/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ZKReplicationQueueStorage.java
----------------------------------------------------------------------
diff --git a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ZKReplicationQueueStorage.java b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ZKReplicationQueueStorage.java
index cca8bfc..68f2adc 100644
--- a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ZKReplicationQueueStorage.java
+++ b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ZKReplicationQueueStorage.java
@@ -118,7 +118,8 @@ class ZKReplicationQueueStorage extends ZKReplicationStorageBase
         .get(ZOOKEEPER_ZNODE_REPLICATION_REGIONS_KEY, ZOOKEEPER_ZNODE_REPLICATION_REGIONS_DEFAULT));
   }
 
-  private String getRsNode(ServerName serverName) {
+  @Override
+  public String getRsNode(ServerName serverName) {
     return ZNodePaths.joinZNode(queuesZNode, serverName.getServerName());
   }
 

http://git-wip-us.apache.org/repos/asf/hbase/blob/8a5537b5/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsSource.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsSource.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsSource.java
index 906f0c6..830ebe1 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsSource.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsSource.java
@@ -326,6 +326,10 @@ public class MetricsSource implements BaseSource {
     globalSourceSource.incrCompletedRecoveryQueue();
   }
 
+  public void incrFailedRecoveryQueue() {
+    globalSourceSource.incrFailedRecoveryQueue();
+  }
+
   @Override
   public void init() {
     singleSourceSource.init();

http://git-wip-us.apache.org/repos/asf/hbase/blob/8a5537b5/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java
index a370867..5d4f034 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java
@@ -43,6 +43,7 @@ import java.util.stream.Collectors;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.CompatibilitySingletonFactory;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.Server;
 import org.apache.hadoop.hbase.ServerName;
@@ -635,6 +636,8 @@ public class ReplicationSourceManager implements ReplicationListener {
     try {
       this.executor.execute(transfer);
     } catch (RejectedExecutionException ex) {
+      CompatibilitySingletonFactory.getInstance(MetricsReplicationSourceFactory.class)
+          .getGlobalSource().incrFailedRecoveryQueue();
       LOG.info("Cancelling the transfer of " + deadRS + " because of " + ex.getMessage());
     }
   }
@@ -706,7 +709,12 @@ public class ReplicationSourceManager implements ReplicationListener {
           queueStorage.removeReplicatorIfQueueIsEmpty(deadRS);
         }
       } catch (ReplicationException e) {
-        server.abort("Failed to claim queue from dead regionserver", e);
+        LOG.error(String.format("ReplicationException: cannot claim dead region (%s)'s " +
+            "replication queue. Znode : (%s)" +
+            " Possible solution: check if znode size exceeds jute.maxBuffer value. " +
+            " If so, increase it for both client and server side." + e),  deadRS,
+            queueStorage.getRsNode(deadRS));
+        server.abort("Failed to claim queue from dead regionserver.", e);
         return;
       }
       // Copying over the failed queue is completed.

http://git-wip-us.apache.org/repos/asf/hbase/blob/8a5537b5/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationEndpoint.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationEndpoint.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationEndpoint.java
index 5d833cc..03fbb59 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationEndpoint.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationEndpoint.java
@@ -17,7 +17,9 @@
  */
 package org.apache.hadoop.hbase.replication;
 
+import static org.mockito.Mockito.doNothing;
 import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.spy;
 import static org.mockito.Mockito.verify;
 import static org.mockito.Mockito.when;
 
@@ -316,11 +318,17 @@ public class TestReplicationEndpoint extends TestReplicationBase {
     MetricsReplicationSourceImpl globalRms = mock(MetricsReplicationSourceImpl.class);
     when(globalRms.getMetricsRegistry()).thenReturn(mockRegistry);
 
+
     MetricsReplicationSourceSource singleSourceSource = new MetricsReplicationSourceSourceImpl(singleRms, id);
     MetricsReplicationSourceSource globalSourceSource = new MetricsReplicationGlobalSourceSource(globalRms);
+    MetricsReplicationSourceSource spyglobalSourceSource = spy(globalSourceSource);
+    doNothing().when(spyglobalSourceSource).incrFailedRecoveryQueue();
+
     Map<String, MetricsReplicationSourceSource> singleSourceSourceByTable = new HashMap<>();
-    MetricsSource source = new MetricsSource(id, singleSourceSource, globalSourceSource,
+    MetricsSource source = new MetricsSource(id, singleSourceSource, spyglobalSourceSource,
         singleSourceSourceByTable);
+
+
     String gaugeName = "gauge";
     String singleGaugeName = "source.id." + gaugeName;
     String globalGaugeName = "source." + gaugeName;
@@ -340,6 +348,8 @@ public class TestReplicationEndpoint extends TestReplicationBase {
     source.removeMetric(gaugeName);
     source.setGauge(gaugeName, delta);
     source.updateHistogram(counterName, count);
+    source.incrFailedRecoveryQueue();
+
 
     verify(singleRms).decGauge(singleGaugeName, delta);
     verify(globalRms).decGauge(globalGaugeName, delta);
@@ -357,6 +367,7 @@ public class TestReplicationEndpoint extends TestReplicationBase {
     verify(globalRms).setGauge(globalGaugeName, delta);
     verify(singleRms).updateHistogram(singleCounterName, count);
     verify(globalRms).updateHistogram(globalCounterName, count);
+    verify(spyglobalSourceSource).incrFailedRecoveryQueue();
 
     //check singleSourceSourceByTable metrics.
     // singleSourceSourceByTable map entry will be created only
@@ -373,6 +384,7 @@ public class TestReplicationEndpoint extends TestReplicationBase {
     // cannot put more concreate value here to verify because the age is arbitrary.
     // as long as it's greater than 0, we see it as correct answer.
     Assert.assertTrue(msr.getLastShippedAge() > 0);
+
   }
 
   private void doPut(byte[] row) throws IOException {