Posted to common-commits@hadoop.apache.org by aa...@apache.org on 2021/05/02 10:19:11 UTC

[hadoop] branch trunk updated: HDFS-15810. RBF: RBFMetrics's TotalCapacity out of bounds (#2910)

This is an automated email from the ASF dual-hosted git repository.

aajisaka pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/trunk by this push:
     new 6e525ab  HDFS-15810. RBF: RBFMetrics's TotalCapacity out of bounds (#2910)
6e525ab is described below

commit 6e525ab81cc2d93eb1e4aeea6b13769635076c06
Author: lfengnan <lf...@uber.com>
AuthorDate: Sun May 2 03:18:47 2021 -0700

    HDFS-15810. RBF: RBFMetrics's TotalCapacity out of bounds (#2910)
    
    Reviewed-by: Inigo Goiri <in...@apache.org>
    Signed-off-by: Akira Ajisaka <aa...@apache.org>
---
 .../hadoop-common/src/site/markdown/Metrics.md     |  9 ++++--
 .../server/federation/metrics/FederationMBean.java | 26 ++++++++++++++++
 .../hdfs/server/federation/metrics/RBFMetrics.java | 36 ++++++++++++++++++++--
 .../src/main/webapps/router/federationhealth.html  |  6 ++--
 .../server/federation/metrics/TestMetricsBase.java | 11 +++++++
 .../server/federation/metrics/TestRBFMetrics.java  | 36 +++++++++++++++++++++-
 6 files changed, 115 insertions(+), 9 deletions(-)
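
In short: the existing TotalCapacity, UsedCapacity and RemainingCapacity metrics are Java longs, so they can only represent aggregates up to Long.MAX_VALUE bytes (2^63 - 1, just under 8 EiB). A federation spanning, for example, three sub-clusters of 4 EiB each holds 12 EiB of raw capacity, so the long sum wraps around to a negative value (the figures are illustrative, not from a real deployment). The patch therefore adds BigInteger-backed *BigInt counterparts that report the exact aggregate, while the long-typed metrics stay in place.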

diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
index 8423b53..6cec030 100644
--- a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
+++ b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
@@ -527,9 +527,12 @@ RBFMetrics shows the metrics which are the aggregated values of sub-clusters' in
 | `NumInMaintenanceLiveDataNodes` | Number of live Datanodes which are in maintenance state |
 | `NumInMaintenanceDeadDataNodes` | Number of dead Datanodes which are in maintenance state |
 | `NumEnteringMaintenanceDataNodes` | Number of Datanodes that are entering the maintenance state |
-| `TotalCapacity` | Current raw capacity of DataNodes in bytes |
-| `UsedCapacity` | Current used capacity across all DataNodes in bytes |
-| `RemainingCapacity` | Current remaining capacity in bytes |
+| `TotalCapacity` | Current raw capacity of DataNodes in bytes (long primitive, may overflow) |
+| `UsedCapacity` | Current used capacity across all DataNodes in bytes (long primitive, may overflow) |
+| `RemainingCapacity` | Current remaining capacity in bytes (long primitive, may overflow) |
+| `TotalCapacityBigInt` | Current raw capacity of DataNodes in bytes (using BigInteger) |
+| `UsedCapacityBigInt` | Current used capacity across all DataNodes in bytes (using BigInteger) |
+| `RemainingCapacityBigInt` | Current remaining capacity in bytes (using BigInteger) |
 | `NumOfMissingBlocks` | Current number of missing blocks |
 | `NumLiveNodes` | Number of datanodes which are currently live |
 | `NumDeadNodes` | Number of datanodes which are currently dead |
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationMBean.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationMBean.java
index c06a2e0..b9ea870 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationMBean.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationMBean.java
@@ -17,6 +17,8 @@
  */
 package org.apache.hadoop.hdfs.server.federation.metrics;
 
+import java.math.BigInteger;
+
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 
@@ -54,23 +56,47 @@ public interface FederationMBean {
 
   /**
    * Get the total capacity of the federated cluster.
+   * The number could overflow if too big. In that case use
+   * {@link #getTotalCapacityBigInt()} instead.
    * @return Total capacity of the federated cluster.
    */
   long getTotalCapacity();
 
   /**
    * Get the used capacity of the federated cluster.
+   * The number could overflow if too big. In that case use
+   * {@link #getUsedCapacityBigInt()} instead.
    * @return Used capacity of the federated cluster.
    */
   long getUsedCapacity();
 
   /**
    * Get the remaining capacity of the federated cluster.
+   * The number could overflow if too big. In that case use
+   * {@link #getRemainingCapacityBigInt()} instead.
    * @return Remaining capacity of the federated cluster.
    */
   long getRemainingCapacity();
 
   /**
+   * Get the total capacity (big integer) of the federated cluster.
+   * @return Total capacity of the federated cluster.
+   */
+  BigInteger getTotalCapacityBigInt();
+
+  /**
+   * Get the used capacity (big integer) of the federated cluster.
+   * @return Used capacity of the federated cluster.
+   */
+  BigInteger getUsedCapacityBigInt();
+
+  /**
+   * Get the remaining capacity (big integer) of the federated cluster.
+   * @return Remaining capacity of the federated cluster.
+   */
+  BigInteger getRemainingCapacityBigInt();
+
+  /**
    * Get the total remote storage capacity mounted in the federated cluster.
    * @return Remote capacity of the federated cluster.
    */
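
A minimal sketch of how a client running inside the Router JVM could read the new attributes over JMX. The ObjectName "Hadoop:service=Router,name=FederationState" is an assumption about how this bean is registered, not something stated in the patch; verify it against your Router's /jmx output before relying on it.

    import java.lang.management.ManagementFactory;
    import java.math.BigInteger;
    import javax.management.MBeanServer;
    import javax.management.ObjectName;

    public class FederationCapacityReader {
      public static void main(String[] args) throws Exception {
        MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
        // Assumed registration name; confirm via the Router's /jmx endpoint.
        ObjectName bean =
            new ObjectName("Hadoop:service=Router,name=FederationState");

        // Prefer the BigInteger attributes; the plain long ones can overflow.
        BigInteger total =
            (BigInteger) mbs.getAttribute(bean, "TotalCapacityBigInt");
        BigInteger remaining =
            (BigInteger) mbs.getAttribute(bean, "RemainingCapacityBigInt");

        System.out.println("total=" + total + ", remaining=" + remaining);
      }
    }
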
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/RBFMetrics.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/RBFMetrics.java
index cc5bf07..ec41a32 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/RBFMetrics.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/RBFMetrics.java
@@ -21,6 +21,7 @@ import static org.apache.hadoop.util.Time.now;
 
 import java.io.IOException;
 import java.lang.reflect.Method;
+import java.math.BigInteger;
 import java.net.InetAddress;
 import java.net.InetSocketAddress;
 import java.net.UnknownHostException;
@@ -381,13 +382,28 @@ public class RBFMetrics implements RouterMBean, FederationMBean {
   }
 
   @Override
+  public long getUsedCapacity() {
+    return getTotalCapacity() - getRemainingCapacity();
+  }
+
+  @Override
+  public BigInteger getTotalCapacityBigInt() {
+    return getNameserviceAggregatedBigInt(MembershipStats::getTotalSpace);
+  }
+
+  @Override
+  public BigInteger getRemainingCapacityBigInt() {
+    return getNameserviceAggregatedBigInt(MembershipStats::getAvailableSpace);
+  }
+
+  @Override
   public long getProvidedSpace() {
     return getNameserviceAggregatedLong(MembershipStats::getProvidedSpace);
   }
 
   @Override
-  public long getUsedCapacity() {
-    return getTotalCapacity() - getRemainingCapacity();
+  public BigInteger getUsedCapacityBigInt() {
+    return getTotalCapacityBigInt().subtract(getRemainingCapacityBigInt());
   }
 
   @Override
@@ -783,6 +799,22 @@ public class RBFMetrics implements RouterMBean, FederationMBean {
     }
   }
 
+  private BigInteger getNameserviceAggregatedBigInt(
+      ToLongFunction<MembershipStats> f) {
+    try {
+      List<MembershipState> states = getActiveNamenodeRegistrations();
+      BigInteger sum = BigInteger.valueOf(0);
+      for (MembershipState state : states) {
+        long lvalue = f.applyAsLong(state.getStats());
+        sum = sum.add(BigInteger.valueOf(lvalue));
+      }
+      return sum;
+    } catch (IOException e) {
+      LOG.error("Unable to extract metrics: {}", e.getMessage());
+      return new BigInteger("0");
+    }
+  }
+
   /**
    * Fetches the most active namenode memberships for all known nameservices.
   * The fetched membership may or may not be active. Excludes expired
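
To make the overflow concrete, here is a self-contained sketch that mirrors the shape of the two aggregation helpers (getNameserviceAggregatedLong vs. getNameserviceAggregatedBigInt). It is not the RBFMetrics code itself; the Stats class and the 4 EiB sample values are made up for illustration.

    import java.math.BigInteger;
    import java.util.Arrays;
    import java.util.List;
    import java.util.function.ToLongFunction;

    public class CapacityAggregationSketch {

      static final class Stats {
        private final long totalSpace;
        Stats(long totalSpace) { this.totalSpace = totalSpace; }
        long getTotalSpace() { return totalSpace; }
      }

      // Long accumulator: silently wraps once the sum exceeds 2^63 - 1.
      static long aggregateLong(List<Stats> states, ToLongFunction<Stats> f) {
        long sum = 0;
        for (Stats s : states) {
          sum += f.applyAsLong(s);
        }
        return sum;
      }

      // BigInteger accumulator: same loop, but the aggregate stays exact.
      static BigInteger aggregateBigInt(List<Stats> states,
          ToLongFunction<Stats> f) {
        BigInteger sum = BigInteger.ZERO;
        for (Stats s : states) {
          sum = sum.add(BigInteger.valueOf(f.applyAsLong(s)));
        }
        return sum;
      }

      public static void main(String[] args) {
        long fourEiB = 1L << 62;  // 4 EiB in bytes
        List<Stats> states = Arrays.asList(
            new Stats(fourEiB), new Stats(fourEiB), new Stats(fourEiB));

        // Wraps to -4611686018427387904 for a 12 EiB federation.
        System.out.println(aggregateLong(states, Stats::getTotalSpace));
        // Prints 13835058055282163712 (3 * 2^62), the true aggregate.
        System.out.println(aggregateBigInt(states, Stats::getTotalSpace));
      }
    }
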
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/webapps/router/federationhealth.html b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/webapps/router/federationhealth.html
index 87d0e71..eca395f 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/webapps/router/federationhealth.html
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/webapps/router/federationhealth.html
@@ -113,9 +113,9 @@
 <p>Non Heap Memory used {used|fmt_bytes} of {committed|fmt_bytes} Committed Non Heap Memory. Max Non Heap Memory is {@eq key=max value="-1" type="number"}&ltunbounded&gt{:else}{max|fmt_bytes}{/eq}.</p>
 {/mem.NonHeapMemoryUsage}
 <table class="table table-bordered table-striped">
-  <tr><th>Total capacity</th><td>{TotalCapacity|fmt_bytes}</td></tr>
-  <tr><th>Used capacity</th><td>{UsedCapacity|fmt_bytes}</td></tr>
-  <tr><th>Remaining capacity</th><td>{RemainingCapacity|fmt_bytes}</td></tr>
+  <tr><th>Total capacity</th><td>{TotalCapacityBigInt|fmt_bytes}</td></tr>
+  <tr><th>Used capacity</th><td>{UsedCapacityBigInt|fmt_bytes}</td></tr>
+  <tr><th>Remaining capacity</th><td>{RemainingCapacityBigInt|fmt_bytes}</td></tr>
   <tr><th>Nameservices</th><td>{NumNameservices}</td></tr>
   <tr><th>Namenodes</th><td>{NumNamenodes}</td></tr>
   <tr>
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestMetricsBase.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestMetricsBase.java
index 4759d05..b01e220 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestMetricsBase.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestMetricsBase.java
@@ -259,4 +259,15 @@ public class TestMetricsBase {
     assertTrue(response.getResult());
     return record;
   }
+
+  // refresh namenode registration for new attributes
+  public boolean refreshNamenodeRegistration(NamenodeHeartbeatRequest request)
+      throws IOException {
+    boolean result = membershipStore.namenodeHeartbeat(request).getResult();
+    membershipStore.loadCache(true);
+    MembershipNamenodeResolver resolver =
+        (MembershipNamenodeResolver) router.getNamenodeResolver();
+    resolver.loadCache(true);
+    return result;
+  }
 }
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestRBFMetrics.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestRBFMetrics.java
index 2c7edaa..25473f8 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestRBFMetrics.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestRBFMetrics.java
@@ -19,11 +19,13 @@ package org.apache.hadoop.hdfs.server.federation.metrics;
 
 import static org.apache.hadoop.hdfs.server.federation.FederationTestUtils.getBean;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.assertFalse;
 
 import java.io.IOException;
+import java.math.BigInteger;
 import java.util.Iterator;
 import java.util.List;
 
@@ -31,6 +33,7 @@ import javax.management.MalformedObjectNameException;
 
 import org.apache.commons.collections.ListUtils;
 import org.apache.hadoop.hdfs.server.federation.router.Router;
+import org.apache.hadoop.hdfs.server.federation.store.protocol.NamenodeHeartbeatRequest;
 import org.apache.hadoop.hdfs.server.federation.store.records.MembershipState;
 import org.apache.hadoop.hdfs.server.federation.store.records.MembershipStats;
 import org.apache.hadoop.hdfs.server.federation.store.records.MountTable;
@@ -58,6 +61,7 @@ public class TestRBFMetrics extends TestMetricsBase {
     FederationMBean federationBean = getBean(FEDERATION_BEAN,
         FederationMBean.class);
     validateClusterStatsFederationBean(federationBean);
+    testCapacity(federationBean);
     RouterMBean routerBean = getBean(ROUTER_BEAN, RouterMBean.class);
     validateClusterStatsRouterBean(routerBean);
   }
@@ -348,4 +352,34 @@ public class TestRBFMetrics extends TestMetricsBase {
     assertTrue(bean.getHostAndPort().length() > 0);
     assertFalse(bean.isSecurityEnabled());
   }
+
+  private void testCapacity(FederationMBean bean) throws IOException {
+    List<MembershipState> memberships = getActiveMemberships();
+    assertTrue(memberships.size() > 1);
+
+    BigInteger availableCapacity = BigInteger.valueOf(0);
+    BigInteger totalCapacity = BigInteger.valueOf(0);
+    BigInteger unitCapacity = BigInteger.valueOf(Long.MAX_VALUE);
+    for (MembershipState mock : memberships) {
+      MembershipStats stats = mock.getStats();
+      stats.setTotalSpace(Long.MAX_VALUE);
+      stats.setAvailableSpace(Long.MAX_VALUE);
+      // reset stats to make the new value persistent
+      mock.setStats(stats);
+      // write back the new namenode information to state store
+      assertTrue(refreshNamenodeRegistration(
+          NamenodeHeartbeatRequest.newInstance(mock)));
+      totalCapacity = totalCapacity.add(unitCapacity);
+      availableCapacity = availableCapacity.add(unitCapacity);
+    }
+
+    // after the cache refresh the bean should report the aggregated values
+    assertEquals(totalCapacity, bean.getTotalCapacityBigInt());
+    // not equal since overflow happened.
+    assertNotEquals(totalCapacity, BigInteger.valueOf(bean.getTotalCapacity()));
+    assertEquals(availableCapacity, bean.getRemainingCapacityBigInt());
+    // not equal since overflow happened.
+    assertNotEquals(availableCapacity,
+        BigInteger.valueOf(bean.getRemainingCapacity()));
+  }
 }
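
For reference on the arithmetic the assertions rely on: each mocked membership reports Long.MAX_VALUE bytes of total and available space, so with n memberships the BigInteger aggregates equal n * Long.MAX_VALUE, while the long-typed getters wrap around (with exactly two memberships, Long.MAX_VALUE + Long.MAX_VALUE overflows to -2). That wraparound is what the assertNotEquals checks depend on.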
