You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by aa...@apache.org on 2021/05/03 15:31:43 UTC
[hadoop] 01/02: HDFS-15810. RBF: RBFMetrics's TotalCapacity out of
bounds (#2910)
This is an automated email from the ASF dual-hosted git repository.
aajisaka pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/hadoop.git
commit 43fac739bb30aeea8aff959efcc40e575565f035
Author: lfengnan <lf...@uber.com>
AuthorDate: Sun May 2 03:18:47 2021 -0700
HDFS-15810. RBF: RBFMetrics's TotalCapacity out of bounds (#2910)
Reviewed-by: Inigo Goiri <in...@apache.org>
Signed-off-by: Akira Ajisaka <aa...@apache.org>
(cherry picked from commit 6e525ab81cc2d93eb1e4aeea6b13769635076c06)
---
.../hadoop-common/src/site/markdown/Metrics.md | 9 ++++--
.../server/federation/metrics/FederationMBean.java | 26 ++++++++++++++++
.../hdfs/server/federation/metrics/RBFMetrics.java | 36 ++++++++++++++++++++--
.../src/main/webapps/router/federationhealth.html | 6 ++--
.../server/federation/metrics/TestMetricsBase.java | 11 +++++++
.../server/federation/metrics/TestRBFMetrics.java | 36 +++++++++++++++++++++-
6 files changed, 115 insertions(+), 9 deletions(-)
diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
index 5260db1..4a078d7 100644
--- a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
+++ b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
@@ -507,9 +507,12 @@ RBFMetrics shows the metrics which are the aggregated values of sub-clusters' in
| `NumInMaintenanceLiveDataNodes` | Number of live Datanodes which are in maintenance state |
| `NumInMaintenanceDeadDataNodes` | Number of dead Datanodes which are in maintenance state |
| `NumEnteringMaintenanceDataNodes` | Number of Datanodes that are entering the maintenance state |
-| `TotalCapacity` | Current raw capacity of DataNodes in bytes |
-| `UsedCapacity` | Current used capacity across all DataNodes in bytes |
-| `RemainingCapacity` | Current remaining capacity in bytes |
+| `TotalCapacity` | Current raw capacity of DataNodes in bytes (long primitive, may overflow) |
+| `UsedCapacity` | Current used capacity across all DataNodes in bytes (long primitive, may overflow) |
+| `RemainingCapacity` | Current remaining capacity in bytes (long primitive, may overflow) |
+| `TotalCapacityBigInt` | Current raw capacity of DataNodes in bytes (using BigInteger) |
+| `UsedCapacityBigInt` | Current used capacity across all DataNodes in bytes (using BigInteger) |
+| `RemainingCapacityBigInt` | Current remaining capacity in bytes (using BigInteger) |
| `NumOfMissingBlocks` | Current number of missing blocks |
| `NumLiveNodes` | Number of datanodes which are currently live |
| `NumDeadNodes` | Number of datanodes which are currently dead |
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationMBean.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationMBean.java
index 5fa4755..e78ae4c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationMBean.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationMBean.java
@@ -17,6 +17,8 @@
*/
package org.apache.hadoop.hdfs.server.federation.metrics;
+import java.math.BigInteger;
+
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
@@ -54,23 +56,47 @@ public interface FederationMBean {
/**
* Get the total capacity of the federated cluster.
+ * The number could overflow if too big. In that case use
+ * {@link #getTotalCapacityBigInt()} instead.
* @return Total capacity of the federated cluster.
*/
long getTotalCapacity();
/**
* Get the used capacity of the federated cluster.
+ * The number could overflow if too big. In that case use
+ * {@link #getUsedCapacityBigInt()} instead.
* @return Used capacity of the federated cluster.
*/
long getUsedCapacity();
/**
* Get the remaining capacity of the federated cluster.
+ * The number could overflow if too big. In that case use
+ * {@link #getRemainingCapacityBigInt()} instead.
* @return Remaining capacity of the federated cluster.
*/
long getRemainingCapacity();
/**
+ * Get the total capacity (big integer) of the federated cluster.
+ * @return Total capacity of the federated cluster.
+ */
+ BigInteger getTotalCapacityBigInt();
+
+ /**
+ * Get the used capacity (big integer) of the federated cluster.
+ * @return Used capacity of the federated cluster.
+ */
+ BigInteger getUsedCapacityBigInt();
+
+ /**
+ * Get the remaining capacity (big integer) of the federated cluster.
+ * @return Remaining capacity of the federated cluster.
+ */
+ BigInteger getRemainingCapacityBigInt();
+
+ /**
* Get the total remote storage capacity mounted in the federated cluster.
* @return Remote capacity of the federated cluster.
*/
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/RBFMetrics.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/RBFMetrics.java
index d626c23..1eae105 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/RBFMetrics.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/RBFMetrics.java
@@ -21,6 +21,7 @@ import static org.apache.hadoop.util.Time.now;
import java.io.IOException;
import java.lang.reflect.Method;
+import java.math.BigInteger;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.UnknownHostException;
@@ -378,13 +379,28 @@ public class RBFMetrics implements RouterMBean, FederationMBean {
}
@Override
+ public long getUsedCapacity() {
+ return getTotalCapacity() - getRemainingCapacity();
+ }
+
+ @Override
+ public BigInteger getTotalCapacityBigInt() {
+ return getNameserviceAggregatedBigInt(MembershipStats::getTotalSpace);
+ }
+
+ @Override
+ public BigInteger getRemainingCapacityBigInt() {
+ return getNameserviceAggregatedBigInt(MembershipStats::getAvailableSpace);
+ }
+
+ @Override
public long getProvidedSpace() {
return getNameserviceAggregatedLong(MembershipStats::getProvidedSpace);
}
@Override
- public long getUsedCapacity() {
- return getTotalCapacity() - getRemainingCapacity();
+ public BigInteger getUsedCapacityBigInt() {
+ return getTotalCapacityBigInt().subtract(getRemainingCapacityBigInt());
}
@Override
@@ -730,6 +746,22 @@ public class RBFMetrics implements RouterMBean, FederationMBean {
}
}
+  /**
+   * Aggregate a long-valued metric across the active namenode registrations
+   * into a BigInteger, so the total is not limited to the range of a long.
+   *
+   * @param f Extracts the long value to add from a namenode's stats.
+   * @return Sum of the extracted values, or zero if the registrations
+   *         cannot be fetched.
+   */
+  private BigInteger getNameserviceAggregatedBigInt(
+      ToLongFunction<MembershipStats> f) {
+    try {
+      BigInteger sum = BigInteger.ZERO;
+      for (MembershipState state : getActiveNamenodeRegistrations()) {
+        // Widen each per-namenode value before adding to avoid long overflow.
+        sum = sum.add(BigInteger.valueOf(f.applyAsLong(state.getStats())));
+      }
+      return sum;
+    } catch (IOException e) {
+      LOG.error("Unable to extract metrics: {}", e.getMessage());
+      // Report zero instead of propagating the failure to the caller.
+      return BigInteger.ZERO;
+    }
+  }
+
/**
* Fetches the most active namenode memberships for all known nameservices.
* The fetched membership may or may not be active. Excludes expired
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/webapps/router/federationhealth.html b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/webapps/router/federationhealth.html
index 389491c..dff89fa 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/webapps/router/federationhealth.html
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/webapps/router/federationhealth.html
@@ -113,9 +113,9 @@
<p>Non Heap Memory used {used|fmt_bytes} of {committed|fmt_bytes} Commited Non Heap Memory. Max Non Heap Memory is {@eq key=max value="-1" type="number"}<unbounded>{:else}{max|fmt_bytes}{/eq}.</p>
{/mem.NonHeapMemoryUsage}
<table class="table table-bordered table-striped">
- <tr><th>Total capacity</th><td>{TotalCapacity|fmt_bytes}</td></tr>
- <tr><th>Used capacity</th><td>{UsedCapacity|fmt_bytes}</td></tr>
- <tr><th>Remaining capacity</th><td>{RemainingCapacity|fmt_bytes}</td></tr>
+ <tr><th>Total capacity</th><td>{TotalCapacityBigInt|fmt_bytes}</td></tr>
+ <tr><th>Used capacity</th><td>{UsedCapacityBigInt|fmt_bytes}</td></tr>
+ <tr><th>Remaining capacity</th><td>{RemainingCapacityBigInt|fmt_bytes}</td></tr>
<tr><th>Nameservices</th><td>{NumNameservices}</td></tr>
<tr><th>Namenodes</th><td>{NumNamenodes}</td></tr>
<tr>
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestMetricsBase.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestMetricsBase.java
index 4759d05..b01e220 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestMetricsBase.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestMetricsBase.java
@@ -259,4 +259,15 @@ public class TestMetricsBase {
assertTrue(response.getResult());
return record;
}
+
+  /**
+   * Refresh a namenode registration so that new attributes are picked up:
+   * sends the heartbeat to the membership store, then reloads the store
+   * cache and the router's namenode resolver cache.
+   *
+   * @param request Heartbeat carrying the namenode registration to write.
+   * @return Result reported by the heartbeat response.
+   * @throws IOException If the heartbeat or a cache reload fails.
+   */
+  public boolean refreshNamenodeRegistration(NamenodeHeartbeatRequest request)
+      throws IOException {
+    final boolean heartbeatResult =
+        membershipStore.namenodeHeartbeat(request).getResult();
+    // Reload the store cache first, then the resolver cache built on it.
+    membershipStore.loadCache(true);
+    ((MembershipNamenodeResolver) router.getNamenodeResolver())
+        .loadCache(true);
+    return heartbeatResult;
+  }
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestRBFMetrics.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestRBFMetrics.java
index e1d1d8e..eed41c7 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestRBFMetrics.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestRBFMetrics.java
@@ -19,11 +19,13 @@ package org.apache.hadoop.hdfs.server.federation.metrics;
import static org.apache.hadoop.hdfs.server.federation.FederationTestUtils.getBean;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.assertFalse;
import java.io.IOException;
+import java.math.BigInteger;
import java.util.Iterator;
import java.util.List;
@@ -31,6 +33,7 @@ import javax.management.MalformedObjectNameException;
import org.apache.commons.collections.ListUtils;
import org.apache.hadoop.hdfs.server.federation.router.Router;
+import org.apache.hadoop.hdfs.server.federation.store.protocol.NamenodeHeartbeatRequest;
import org.apache.hadoop.hdfs.server.federation.store.records.MembershipState;
import org.apache.hadoop.hdfs.server.federation.store.records.MembershipStats;
import org.apache.hadoop.hdfs.server.federation.store.records.MountTable;
@@ -58,6 +61,7 @@ public class TestRBFMetrics extends TestMetricsBase {
FederationMBean federationBean = getBean(FEDERATION_BEAN,
FederationMBean.class);
validateClusterStatsFederationBean(federationBean);
+ testCapacity(federationBean);
RouterMBean routerBean = getBean(ROUTER_BEAN, RouterMBean.class);
validateClusterStatsRouterBean(routerBean);
}
@@ -326,4 +330,34 @@ public class TestRBFMetrics extends TestMetricsBase {
assertTrue(bean.getHostAndPort().length() > 0);
assertFalse(bean.isSecurityEnabled());
}
+
+ private void testCapacity(FederationMBean bean) throws IOException {
+ List<MembershipState> memberships = getActiveMemberships();
+ assertTrue(memberships.size() > 1);
+
+ BigInteger availableCapacity = BigInteger.valueOf(0);
+ BigInteger totalCapacity = BigInteger.valueOf(0);
+ BigInteger unitCapacity = BigInteger.valueOf(Long.MAX_VALUE);
+ for (MembershipState mock : memberships) {
+ MembershipStats stats = mock.getStats();
+ stats.setTotalSpace(Long.MAX_VALUE);
+ stats.setAvailableSpace(Long.MAX_VALUE);
+ // reset stats to make the new value persistent
+ mock.setStats(stats);
+ // write back the new namenode information to state store
+ assertTrue(refreshNamenodeRegistration(
+ NamenodeHeartbeatRequest.newInstance(mock)));
+ totalCapacity = totalCapacity.add(unitCapacity);
+ availableCapacity = availableCapacity.add(unitCapacity);
+ }
+
+ // for local cache update
+ assertEquals(totalCapacity, bean.getTotalCapacityBigInt());
+ // not equal since overflow happened.
+ assertNotEquals(totalCapacity, BigInteger.valueOf(bean.getTotalCapacity()));
+ assertEquals(availableCapacity, bean.getRemainingCapacityBigInt());
+ // not equal since overflow happened.
+ assertNotEquals(availableCapacity,
+ BigInteger.valueOf(bean.getRemainingCapacity()));
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org