You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by ay...@apache.org on 2023/02/12 06:24:07 UTC
[hadoop] branch branch-3.3 updated: HDFS-16907. Add LastHeartbeatResponseTime for BP service actor (#5349) (#5358)
This is an automated email from the ASF dual-hosted git repository.
ayushsaxena pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/hadoop.git
The following commit(s) were added to refs/heads/branch-3.3 by this push:
new 5099077f4d9 HDFS-16907. Add LastHeartbeatResponseTime for BP service actor (#5349) (#5358)
5099077f4d9 is described below
commit 5099077f4d9bab20f0efd99be66e61151805d18c
Author: Viraj Jasani <vj...@apache.org>
AuthorDate: Sat Feb 11 22:23:59 2023 -0800
HDFS-16907. Add LastHeartbeatResponseTime for BP service actor (#5349) (#5358)
* ADDENDUM Remove unused variables from testDataNodeMXBeanLastHeartbeats (#5373)
Reviewed-by: Ayush Saxena <ay...@apache.org>
Reviewed-by: Shilun Fan <sl...@apache.org>
Signed-off-by: Tao Li <to...@apache.org>
---
.../hdfs/server/datanode/BPServiceActor.java | 15 +++++
.../hadoop/hdfs/server/datanode/DataNode.java | 35 ++++++++--
.../src/main/webapps/datanode/datanode.html | 4 +-
.../org/apache/hadoop/hdfs/MiniDFSCluster.java | 18 ++++++
.../hdfs/server/datanode/TestDataNodeMXBean.java | 75 ++++++++++++++++++++++
5 files changed, 142 insertions(+), 5 deletions(-)
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java
index 362edd1cace..d7ea8a83739 100755
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java
@@ -205,6 +205,8 @@ class BPServiceActor implements Runnable {
info.put("ActorState", getRunningState());
info.put("LastHeartbeat",
String.valueOf(getScheduler().getLastHearbeatTime()));
+ info.put("LastHeartbeatResponseTime",
+ String.valueOf(getScheduler().getLastHeartbeatResponseTime()));
info.put("LastBlockReport",
String.valueOf(getScheduler().getLastBlockReportTime()));
info.put("maxBlockReportSize", String.valueOf(getMaxBlockReportSize()));
@@ -568,6 +570,8 @@ class BPServiceActor implements Runnable {
slowPeers,
slowDisks);
+ scheduler.updateLastHeartbeatResponseTime(monotonicNow());
+
if (outliersReportDue) {
// If the report was due and successfully sent, schedule the next one.
scheduler.scheduleNextOutlierReport();
@@ -1190,6 +1194,9 @@ class BPServiceActor implements Runnable {
@VisibleForTesting
volatile long lastHeartbeatTime = monotonicNow();
+ @VisibleForTesting
+ private volatile long lastHeartbeatResponseTime = -1;
+
@VisibleForTesting
boolean resetBlockReportTime = true;
@@ -1238,6 +1245,10 @@ class BPServiceActor implements Runnable {
lastHeartbeatTime = heartbeatTime;
}
+ void updateLastHeartbeatResponseTime(long heartbeatTime) {
+ this.lastHeartbeatResponseTime = heartbeatTime;
+ }
+
void updateLastBlockReportTime(long blockReportTime) {
lastBlockReportTime = blockReportTime;
}
@@ -1250,6 +1261,10 @@ class BPServiceActor implements Runnable {
return (monotonicNow() - lastHeartbeatTime)/1000;
}
+ private long getLastHeartbeatResponseTime() {
+ return (monotonicNow() - lastHeartbeatResponseTime) / 1000;
+ }
+
long getLastBlockReportTime() {
return (monotonicNow() - lastBlockReportTime)/1000;
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
index 633befbb5b5..402d4710870 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
@@ -3496,8 +3496,12 @@ public class DataNode extends ReconfigurableBase
*/
@Override // DataNodeMXBean
public String getBPServiceActorInfo() {
- final ArrayList<Map<String, String>> infoArray =
- new ArrayList<Map<String, String>>();
+ return JSON.toString(getBPServiceActorInfoMap());
+ }
+
+ @VisibleForTesting
+ public List<Map<String, String>> getBPServiceActorInfoMap() {
+ final List<Map<String, String>> infoArray = new ArrayList<>();
for (BPOfferService bpos : blockPoolManager.getAllNamenodeThreads()) {
if (bpos != null) {
for (BPServiceActor actor : bpos.getBPServiceActors()) {
@@ -3505,7 +3509,7 @@ public class DataNode extends ReconfigurableBase
}
}
}
- return JSON.toString(infoArray);
+ return infoArray;
}
/**
@@ -3684,6 +3688,29 @@ public class DataNode extends ReconfigurableBase
* @return true - if the data node is fully started
*/
public boolean isDatanodeFullyStarted() {
+ return isDatanodeFullyStarted(false);
+ }
+
+ /**
+ * A datanode is considered to be fully started if all the BP threads are
+ * alive and all the block pools are initialized. If checkConnectionToActiveNamenode is true,
+ * the datanode is considered to be fully started if it is also heartbeating to the
+ * active namenode in addition to the above-mentioned conditions.
+ *
+ * @param checkConnectionToActiveNamenode if true, performs additional check of whether datanode
+ * is heartbeating to active namenode.
+ * @return true if the datanode is fully started and also conditionally connected to active
+ * namenode, false otherwise.
+ */
+ public boolean isDatanodeFullyStarted(boolean checkConnectionToActiveNamenode) {
+ if (checkConnectionToActiveNamenode) {
+ for (BPOfferService bp : blockPoolManager.getAllNamenodeThreads()) {
+ if (!bp.isInitialized() || !bp.isAlive() || bp.getActiveNN() == null) {
+ return false;
+ }
+ }
+ return true;
+ }
for (BPOfferService bp : blockPoolManager.getAllNamenodeThreads()) {
if (!bp.isInitialized() || !bp.isAlive()) {
return false;
@@ -3691,7 +3718,7 @@ public class DataNode extends ReconfigurableBase
}
return true;
}
-
+
@VisibleForTesting
public DatanodeID getDatanodeId() {
return id;
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/datanode/datanode.html b/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/datanode/datanode.html
index b491d5a04e3..28cba0153c1 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/datanode/datanode.html
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/datanode/datanode.html
@@ -84,7 +84,8 @@
<th>Namenode HA State</th>
<th>Block Pool ID</th>
<th>Actor State</th>
- <th>Last Heartbeat</th>
+ <th>Last Heartbeat Sent</th>
+ <th>Last Heartbeat Response</th>
<th>Last Block Report</th>
<th>Last Block Report Size (Max Size)</th>
</tr>
@@ -96,6 +97,7 @@
<td>{BlockPoolID}</td>
<td>{ActorState}</td>
<td>{LastHeartbeat}s</td>
+ <td>{LastHeartbeatResponseTime}s</td>
<td>{#helper_relative_time value="{LastBlockReport}"/}</td>
<td>{maxBlockReportSize|fmt_bytes} ({maxDataLength|fmt_bytes})</td>
</tr>
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
index d9550cdf70c..4dafbbe5741 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
@@ -2508,6 +2508,24 @@ public class MiniDFSCluster implements AutoCloseable {
return restartDataNode(dnprop, false);
}
+ /**
+ * Wait for the datanode to be fully functional, i.e. all the BP service threads are alive,
+ * all block pools are initialized, and the datanode is connected to the active namenode.
+ *
+ * @param dn Datanode instance.
+ * @param timeout Timeout in millis for which we should wait for the datanode to become
+ * fully operational.
+ * @throws InterruptedException If the thread wait is interrupted.
+ * @throws TimeoutException If it times out while waiting for the datanode to become fully
+ * operational.
+ */
+ public void waitDatanodeConnectedToActive(DataNode dn, int timeout)
+ throws InterruptedException, TimeoutException {
+ GenericTestUtils.waitFor(() -> dn.isDatanodeFullyStarted(true),
+ 100, timeout, "Datanode is not connected to active namenode even after "
+ + timeout + " ms of waiting");
+ }
+
public void waitDatanodeFullyStarted(DataNode dn, int timeout)
throws TimeoutException, InterruptedException {
GenericTestUtils.waitFor(new Supplier<Boolean>() {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMXBean.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMXBean.java
index ea43cccbb18..28330139bc4 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMXBean.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMXBean.java
@@ -38,7 +38,9 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
import org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferTestCase;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.test.GenericTestUtils;
import org.junit.Assert;
@@ -294,4 +296,77 @@ public class TestDataNodeMXBean extends SaslDataTransferTestCase {
if (cluster != null) {cluster.shutdown();}
}
}
+
+ @Test
+ public void testDataNodeMXBeanLastHeartbeats() throws Exception {
+ Configuration conf = new Configuration();
+ try (MiniDFSCluster cluster = new MiniDFSCluster
+ .Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology(2))
+ .build()) {
+ cluster.waitActive();
+ cluster.transitionToActive(0);
+ cluster.transitionToStandby(1);
+
+ DataNode datanode = cluster.getDataNodes().get(0);
+
+ // Verify and wait until one of the BP service actors identifies the active namenode as
+ // active and the other as standby.
+ cluster.waitDatanodeConnectedToActive(datanode, 5000);
+
+ // Verify that the last heartbeat was sent to both namenodes within the last 5 sec.
+ assertLastHeartbeatSentTime(datanode, "LastHeartbeat");
+ // Verify that the last heartbeat responses from both namenodes have been received
+ // within the last 5 sec.
+ assertLastHeartbeatSentTime(datanode, "LastHeartbeatResponseTime");
+
+
+ NameNode sbNameNode = cluster.getNameNode(1);
+
+ // Stopping standby namenode
+ sbNameNode.stop();
+
+ // Verify that last heartbeat response time from one of the namenodes would stay much higher
+ // after stopping one namenode.
+ GenericTestUtils.waitFor(() -> {
+ List<Map<String, String>> bpServiceActorInfo = datanode.getBPServiceActorInfoMap();
+ Map<String, String> bpServiceActorInfo1 = bpServiceActorInfo.get(0);
+ Map<String, String> bpServiceActorInfo2 = bpServiceActorInfo.get(1);
+
+ long lastHeartbeatResponseTime1 =
+ Long.parseLong(bpServiceActorInfo1.get("LastHeartbeatResponseTime"));
+ long lastHeartbeatResponseTime2 =
+ Long.parseLong(bpServiceActorInfo2.get("LastHeartbeatResponseTime"));
+
+ LOG.info("Last heartbeat response from namenode 1: {}", lastHeartbeatResponseTime1);
+ LOG.info("Last heartbeat response from namenode 2: {}", lastHeartbeatResponseTime2);
+
+ return (lastHeartbeatResponseTime1 < 5L && lastHeartbeatResponseTime2 > 5L) || (
+ lastHeartbeatResponseTime1 > 5L && lastHeartbeatResponseTime2 < 5L);
+
+ }, 200, 15000,
+ "Last heartbeat response should be higher than 5s for at least one namenode");
+
+ // Verify that the last heartbeat was sent to both namenodes within the last 5 sec even
+ // though the last heartbeat response from one of the namenodes was more than 5 sec ago.
+ assertLastHeartbeatSentTime(datanode, "LastHeartbeat");
+ }
+ }
+
+ private static void assertLastHeartbeatSentTime(DataNode datanode, String lastHeartbeat) {
+ List<Map<String, String>> bpServiceActorInfo = datanode.getBPServiceActorInfoMap();
+ Map<String, String> bpServiceActorInfo1 = bpServiceActorInfo.get(0);
+ Map<String, String> bpServiceActorInfo2 = bpServiceActorInfo.get(1);
+
+ long lastHeartbeatSent1 =
+ Long.parseLong(bpServiceActorInfo1.get(lastHeartbeat));
+ long lastHeartbeatSent2 =
+ Long.parseLong(bpServiceActorInfo2.get(lastHeartbeat));
+
+ Assert.assertTrue(lastHeartbeat + " for first bp service actor is higher than 5s",
+ lastHeartbeatSent1 < 5L);
+ Assert.assertTrue(lastHeartbeat + " for second bp service actor is higher than 5s",
+ lastHeartbeatSent2 < 5L);
+ }
+
}
---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org