You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by eb...@apache.org on 2021/03/17 18:27:48 UTC

[hadoop] branch branch-3.3 updated: YARN-10688. ClusterMetrics should support GPU capacity related metrics.. Contributed by Qi Zhu.

This is an automated email from the ASF dual-hosted git repository.

ebadger pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
     new cd417f1  YARN-10688. ClusterMetrics should support GPU capacity related metrics.. Contributed by Qi Zhu.
cd417f1 is described below

commit cd417f17aea693b8985694287bc9a3a579a529a4
Author: Eric Badger <eb...@verizonmedia.com>
AuthorDate: Wed Mar 17 18:16:59 2021 +0000

    YARN-10688. ClusterMetrics should support GPU capacity related metrics.. Contributed by Qi Zhu.
    
    (cherry picked from commit 49f89f1d3de66f3bb4db5952e8873432ba62f71a)
---
 .../server/resourcemanager/ClusterMetrics.java     | 27 ++++++++-
 .../capacity/TestCSAllocateCustomResource.java     | 64 ++++++++++++++++++++++
 2 files changed, 89 insertions(+), 2 deletions(-)

diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java
index 37f4ec4..7fe5cc9 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java
@@ -34,6 +34,8 @@ import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
 import org.apache.hadoop.metrics2.lib.MutableRate;
 import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting;
+import org.apache.hadoop.yarn.api.records.ResourceInformation;
+import org.apache.hadoop.yarn.util.resource.ResourceUtils;
 
 @InterfaceAudience.Private
 @Metrics(context="yarn")
@@ -56,13 +58,14 @@ public class ClusterMetrics {
   @Metric("Vcore Utilization") MutableGaugeLong utilizedVirtualCores;
   @Metric("Memory Capability") MutableGaugeLong capabilityMB;
   @Metric("Vcore Capability") MutableGaugeLong capabilityVirtualCores;
+  @Metric("GPU Capability") MutableGaugeLong capabilityGPUs;
 
   private static final MetricsInfo RECORD_INFO = info("ClusterMetrics",
   "Metrics for the Yarn Cluster");
   
   private static volatile ClusterMetrics INSTANCE = null;
   private static MetricsRegistry registry;
-  
+
   public static ClusterMetrics getMetrics() {
     if(!isInitialized.get()){
       synchronized (ClusterMetrics.class) {
@@ -206,10 +209,24 @@ public class ClusterMetrics {
     return capabilityVirtualCores.value();
   }
 
+  public long getCapabilityGPUs() {
+    if (capabilityGPUs == null) {
+      return 0; // gauge is null: report zero GPU capability
+    }
+    // Otherwise return the current value of the "GPU Capability" gauge.
+    return capabilityGPUs.value();
+  }
+
   public void incrCapability(Resource res) {
     if (res != null) {
       capabilityMB.incr(res.getMemorySize());
       capabilityVirtualCores.incr(res.getVirtualCores());
+      Integer gpuIndex = ResourceUtils.getResourceTypeIndex()
+          .get(ResourceInformation.GPU_URI);
+      if (gpuIndex != null) { // GPU type present in resource-type index
+        capabilityGPUs.incr(res.
+            getResourceValue(ResourceInformation.GPU_URI));
+      }
     }
   }
 
@@ -217,6 +234,12 @@ public class ClusterMetrics {
     if (res != null) {
       capabilityMB.decr(res.getMemorySize());
       capabilityVirtualCores.decr(res.getVirtualCores());
+      Integer gpuIndex = ResourceUtils.getResourceTypeIndex()
+          .get(ResourceInformation.GPU_URI);
+      if (gpuIndex != null) {
+        capabilityGPUs.decr(res.
+            getResourceValue(ResourceInformation.GPU_URI));
+      }
     }
   }
 
@@ -251,4 +274,4 @@ public class ClusterMetrics {
   public void incrUtilizedVirtualCores(long delta) {
     utilizedVirtualCores.incr(delta);
   }
-}
+}
\ No newline at end of file
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCSAllocateCustomResource.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCSAllocateCustomResource.java
index 65473b9..d6f1544 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCSAllocateCustomResource.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCSAllocateCustomResource.java
@@ -22,18 +22,22 @@ import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.resourcemanager.ClusterMetrics;
 import org.apache.hadoop.yarn.server.resourcemanager.MockAM;
 import org.apache.hadoop.yarn.server.resourcemanager.MockNM;
 import org.apache.hadoop.yarn.server.resourcemanager.MockRM;
 import org.apache.hadoop.yarn.server.resourcemanager.MockRMAppSubmissionData;
 import org.apache.hadoop.yarn.server.resourcemanager.MockRMAppSubmitter;
+import org.apache.hadoop.yarn.server.resourcemanager.MockNodes;
 import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.NullRMNodeLabelsManager;
 import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
 import org.apache.hadoop.yarn.server.resourcemanager.resource.TestResourceProfiles;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
 import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ClusterNodeTracker;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent;
 import org.apache.hadoop.yarn.util.resource.DominantResourceCalculator;
 import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
@@ -47,8 +51,12 @@ import org.junit.Test;
 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
 
+import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
 import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration.MAXIMUM_ALLOCATION_MB;
+import static org.junit.Assert.assertEquals;
 
 /**
  * Test case for custom resource container allocation.
@@ -64,6 +72,9 @@ public class TestCSAllocateCustomResource {
 
   private final int g = 1024;
 
+  private ClusterNodeTracker<FiCaSchedulerNode> nodeTracker;
+  private ClusterMetrics metrics;
+
   @Before
   public void setUp() throws Exception {
     conf = new YarnConfiguration();
@@ -182,4 +193,57 @@ public class TestCSAllocateCustomResource {
             .getResourceValue("yarn.io/gpu"));
     rm.close();
   }
+
+  /** Checks cluster capability metrics, GPU included, as nodes come and go. */
+  @Test
+  public void testClusterMetricsWithGPU() throws Exception {
+    metrics = ClusterMetrics.getMetrics();
+    // reset resource types
+    ResourceUtils.resetResourceTypes();
+    String resourceTypesFileName = "resource-types-test.xml";
+    File source = new File(
+        conf.getClassLoader().getResource(resourceTypesFileName).getFile());
+    resourceTypesFile = new File(source.getParent(), "resource-types.xml");
+    FileUtils.copyFile(source, resourceTypesFile);
+
+    CapacitySchedulerConfiguration newConf =
+        (CapacitySchedulerConfiguration) TestUtils
+            .getConfigurationWithMultipleQueues(conf);
+    newConf.setClass(CapacitySchedulerConfiguration.RESOURCE_CALCULATOR_CLASS,
+        DominantResourceCalculator.class, ResourceCalculator.class);
+    //start RM
+    MockRM rm = new MockRM(newConf);
+    rm.start();
+
+    nodeTracker = new ClusterNodeTracker<>();
+    MockNodes.resetHostIds();
+    Resource nodeResource = Resource.newInstance(4096, 4,
+        Collections.singletonMap(GPU_URI, 4L));
+    // The expected totals below assume this yields 8 nodes (2 x 4).
+    List<RMNode> rmNodes = MockNodes.newNodes(2, 4, nodeResource);
+    for (RMNode rmNode : rmNodes) {
+      nodeTracker.addNode(new FiCaSchedulerNode(rmNode, false));
+    }
+
+    // Check GPU inc related cluster metrics (JUnit order: expected, actual).
+    assertEquals("Cluster Capability Memory incorrect",
+        4096 * 8, metrics.getCapabilityMB());
+    assertEquals("Cluster Capability Vcores incorrect",
+        4 * 8, metrics.getCapabilityVirtualCores());
+    assertEquals("Cluster Capability GPUs incorrect",
+        4 * 8, metrics.getCapabilityGPUs());
+
+    for (RMNode rmNode : rmNodes) {
+      nodeTracker.removeNode(rmNode.getNodeID());
+    }
+    // Check GPU dec related cluster metrics.
+    assertEquals("Cluster Capability Memory incorrect",
+        0, metrics.getCapabilityMB());
+    assertEquals("Cluster Capability Vcores incorrect",
+        0, metrics.getCapabilityVirtualCores());
+    assertEquals("Cluster Capability GPUs incorrect",
+        0, metrics.getCapabilityGPUs());
+    rm.close(); // release the started RM, as the preceding test does
+    ClusterMetrics.destroy();
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org