You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by bh...@apache.org on 2019/03/14 19:42:07 UTC

[hadoop] branch trunk updated: HDDS-917. Expose NodeManagerMXBean as a MetricsSource. Contributed by Siddharth Wagle.

This is an automated email from the ASF dual-hosted git repository.

bharat pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/trunk by this push:
     new 091a664  HDDS-917. Expose NodeManagerMXBean as a MetricsSource. Contributed by Siddharth Wagle.
091a664 is described below

commit 091a664977a3b97cd6057129da9d093a73d63a68
Author: Bharat Viswanadham <bh...@apache.org>
AuthorDate: Thu Mar 14 12:30:06 2019 -0700

    HDDS-917. Expose NodeManagerMXBean as a MetricsSource. Contributed by Siddharth Wagle.
---
 .../hadoop/hdds/scm/node/SCMNodeManager.java       |  4 +-
 .../hadoop/hdds/scm/node/SCMNodeMetrics.java       | 76 ++++++++++++++++++++--
 .../hadoop/ozone/scm/node/TestSCMNodeMetrics.java  | 40 ++++++++++++
 3 files changed, 114 insertions(+), 6 deletions(-)

diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeManager.java
index e457b13..4464ed1 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeManager.java
@@ -102,7 +102,6 @@ public class SCMNodeManager implements NodeManager {
   public SCMNodeManager(OzoneConfiguration conf, String clusterID,
       StorageContainerManager scmManager, EventPublisher eventPublisher)
       throws IOException {
-    this.metrics = SCMNodeMetrics.create();
     this.nodeStateManager = new NodeStateManager(conf, eventPublisher);
     this.clusterID = clusterID;
     this.version = VersionInfo.getLatestVersion();
@@ -110,6 +109,7 @@ public class SCMNodeManager implements NodeManager {
     this.scmManager = scmManager;
     LOG.info("Entering startup chill mode.");
     registerMXBean();
+    this.metrics = SCMNodeMetrics.create(this);
   }
 
   private void registerMXBean() {
@@ -118,7 +118,7 @@ public class SCMNodeManager implements NodeManager {
   }
 
   private void unregisterMXBean() {
-    if(this.nmInfoBean != null) {
+    if (this.nmInfoBean != null) {
       MBeans.unregister(this.nmInfoBean);
       this.nmInfoBean = null;
     }
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeMetrics.java
index 30b1079..1596523 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeMetrics.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeMetrics.java
@@ -18,11 +18,24 @@
 
 package org.apache.hadoop.hdds.scm.node;
 
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.DEAD;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.DECOMMISSIONED;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.DECOMMISSIONING;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.HEALTHY;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.STALE;
+
+import java.util.Map;
+
 import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.metrics2.MetricsCollector;
+import org.apache.hadoop.metrics2.MetricsInfo;
+import org.apache.hadoop.metrics2.MetricsSource;
 import org.apache.hadoop.metrics2.MetricsSystem;
 import org.apache.hadoop.metrics2.annotation.Metric;
 import org.apache.hadoop.metrics2.annotation.Metrics;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.metrics2.lib.Interns;
+import org.apache.hadoop.metrics2.lib.MetricsRegistry;
 import org.apache.hadoop.metrics2.lib.MutableCounterLong;
 
 /**
@@ -30,7 +43,7 @@ import org.apache.hadoop.metrics2.lib.MutableCounterLong;
  */
 @InterfaceAudience.Private
 @Metrics(about = "SCM NodeManager Metrics", context = "ozone")
-public final class SCMNodeMetrics {
+public final class SCMNodeMetrics implements MetricsSource {
 
   private static final String SOURCE_NAME =
       SCMNodeMetrics.class.getSimpleName();
@@ -40,18 +53,26 @@ public final class SCMNodeMetrics {
   private @Metric MutableCounterLong numNodeReportProcessed;
   private @Metric MutableCounterLong numNodeReportProcessingFailed;
 
+  private final MetricsRegistry registry;
+  private final NodeManagerMXBean managerMXBean;
+  private final MetricsInfo recordInfo = Interns.info("SCMNodeManager",
+      "SCM NodeManager metrics");
+
   /** Private constructor. */
-  private SCMNodeMetrics() { }
+  private SCMNodeMetrics(NodeManagerMXBean managerMXBean) {
+    this.managerMXBean = managerMXBean;
+    this.registry = new MetricsRegistry(recordInfo);
+  }
 
   /**
    * Create and returns SCMNodeMetrics instance.
    *
    * @return SCMNodeMetrics
    */
-  public static SCMNodeMetrics create() {
+  public static SCMNodeMetrics create(NodeManagerMXBean managerMXBean) {
     MetricsSystem ms = DefaultMetricsSystem.instance();
     return ms.register(SOURCE_NAME, "SCM NodeManager Metrics",
-        new SCMNodeMetrics());
+        new SCMNodeMetrics(managerMXBean));
   }
 
   /**
@@ -90,4 +111,51 @@ public final class SCMNodeMetrics {
     numNodeReportProcessingFailed.incr();
   }
 
+  /**
+   * Get aggregated counter and gauge metrics.
+   */
+  @Override
+  @SuppressWarnings("SuspiciousMethodCalls")
+  public void getMetrics(MetricsCollector collector, boolean all) {
+    Map<String, Integer> nodeCount = managerMXBean.getNodeCount();
+    Map<String, Long> nodeInfo = managerMXBean.getNodeInfo();
+
+    registry.snapshot(
+        collector.addRecord(registry.info()) // Add annotated ones first
+            .addGauge(Interns.info(
+                "HealthyNodes",
+                "Number of healthy datanodes"),
+                nodeCount.get(HEALTHY.toString()))
+            .addGauge(Interns.info("StaleNodes",
+                "Number of stale datanodes"),
+                nodeCount.get(STALE.toString()))
+            .addGauge(Interns.info("DeadNodes",
+                "Number of dead datanodes"),
+                nodeCount.get(DEAD.toString()))
+            .addGauge(Interns.info("DecommissioningNodes",
+                "Number of decommissioning datanodes"),
+                nodeCount.get(DECOMMISSIONING.toString()))
+            .addGauge(Interns.info("DecommissionedNodes",
+                "Number of decommissioned datanodes"),
+                nodeCount.get(DECOMMISSIONED.toString()))
+            .addGauge(Interns.info("DiskCapacity",
+                "Total disk capacity"),
+                nodeInfo.get("DISKCapacity"))
+            .addGauge(Interns.info("DiskUsed",
+                "Total disk capacity used"),
+                nodeInfo.get("DISKUsed"))
+            .addGauge(Interns.info("DiskRemaining",
+                "Total disk capacity remaining"),
+                nodeInfo.get("DISKRemaining"))
+            .addGauge(Interns.info("SSDCapacity",
+                "Total ssd capacity"),
+                nodeInfo.get("SSDCapacity"))
+            .addGauge(Interns.info("SSDUsed",
+                "Total ssd capacity used"),
+                nodeInfo.get("SSDUsed"))
+            .addGauge(Interns.info("SSDRemaining",
+                "Total ssd capacity remaining"),
+                nodeInfo.get("SSDRemaining")),
+        all);
+  }
 }
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/node/TestSCMNodeMetrics.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/node/TestSCMNodeMetrics.java
index c18ae5f..d19be93 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/node/TestSCMNodeMetrics.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/node/TestSCMNodeMetrics.java
@@ -35,6 +35,7 @@ import org.junit.Before;
 import org.junit.Test;
 
 import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
+import static org.apache.hadoop.test.MetricsAsserts.assertGauge;
 import static org.apache.hadoop.test.MetricsAsserts.getLongCounter;
 import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
 
@@ -128,6 +129,45 @@ public class TestSCMNodeMetrics {
         getMetrics(SCMNodeMetrics.class.getSimpleName()));
   }
 
+  /**
+   * Verify that datanode aggregated state and capacity metrics are reported.
+   */
+  @Test
+  public void testNodeCountAndInfoMetricsReported() throws Exception {
+    HddsDatanodeService datanode = cluster.getHddsDatanodes().get(0);
+    StorageReportProto storageReport = TestUtils.createStorageReport(
+        datanode.getDatanodeDetails().getUuid(), "/tmp", 100, 10, 90, null);
+    NodeReportProto nodeReport = NodeReportProto.newBuilder()
+        .addStorageReport(storageReport).build();
+    datanode.getDatanodeStateMachine().getContext().addReport(nodeReport);
+    datanode.getDatanodeStateMachine().triggerHeartbeat();
+    // Give some time so that SCM receives and processes the heartbeat.
+    Thread.sleep(300L);
+
+    assertGauge("HealthyNodes", 1,
+        getMetrics(SCMNodeMetrics.class.getSimpleName()));
+    assertGauge("StaleNodes", 0,
+        getMetrics(SCMNodeMetrics.class.getSimpleName()));
+    assertGauge("DeadNodes", 0,
+        getMetrics(SCMNodeMetrics.class.getSimpleName()));
+    assertGauge("DecommissioningNodes", 0,
+        getMetrics(SCMNodeMetrics.class.getSimpleName()));
+    assertGauge("DecommissionedNodes", 0,
+        getMetrics(SCMNodeMetrics.class.getSimpleName()));
+    assertGauge("DiskCapacity", 100L,
+        getMetrics(SCMNodeMetrics.class.getSimpleName()));
+    assertGauge("DiskUsed", 10L,
+        getMetrics(SCMNodeMetrics.class.getSimpleName()));
+    assertGauge("DiskRemaining", 90L,
+        getMetrics(SCMNodeMetrics.class.getSimpleName()));
+    assertGauge("SSDCapacity", 0L,
+        getMetrics(SCMNodeMetrics.class.getSimpleName()));
+    assertGauge("SSDUsed", 0L,
+        getMetrics(SCMNodeMetrics.class.getSimpleName()));
+    assertGauge("SSDRemaining", 0L,
+        getMetrics(SCMNodeMetrics.class.getSimpleName()));
+  }
+
   @After
   public void teardown() {
     cluster.shutdown();


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org