You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by ki...@apache.org on 2020/01/22 15:31:07 UTC
[hadoop] branch branch-2.10 updated: HDFS-14968. Add ability to log
stale datanodes. Contributed by Ahmed Hussein.
This is an automated email from the ASF dual-hosted git repository.
kihwal pushed a commit to branch branch-2.10
in repository https://gitbox.apache.org/repos/asf/hadoop.git
The following commit(s) were added to refs/heads/branch-2.10 by this push:
new 0ead38b HDFS-14968. Add ability to log stale datanodes. Contributed by Ahmed Hussein.
0ead38b is described below
commit 0ead38b59a31100880bed0c3ad68dc355a53e145
Author: Kihwal Lee <ki...@apache.org>
AuthorDate: Wed Jan 22 09:30:41 2020 -0600
HDFS-14968. Add ability to log stale datanodes. Contributed by Ahmed Hussein.
(cherry picked from commit bd03053ea2f32ef982e37fbf2ffd679cb7dda797)
Conflicts:
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
(cherry picked from commit 484270832064c84122348e663aaf5927ed411ebb)
---
.../java/org/apache/hadoop/hdfs/DFSConfigKeys.java | 5 +
.../server/blockmanagement/HeartbeatManager.java | 105 +++++++++++++++++++--
.../src/main/resources/hdfs-default.xml | 8 ++
3 files changed, 108 insertions(+), 10 deletions(-)
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
index aff6b8b..e0b3a77 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
@@ -364,6 +364,11 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
// Whether to enable datanode's stale state detection and usage for writes
public static final String DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_WRITE_KEY = "dfs.namenode.avoid.write.stale.datanode";
public static final boolean DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_WRITE_DEFAULT = false;
+ // Whether to enable logging of datanode staleness. Disabled by default.
+ public static final String DFS_NAMENODE_ENABLE_LOG_STALE_DATANODE_KEY =
+ "dfs.namenode.enable.log.stale.datanode";
+ public static final boolean DFS_NAMENODE_ENABLE_LOG_STALE_DATANODE_DEFAULT =
+ false;
// The default value of the time interval for marking datanodes as stale
public static final String DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY = "dfs.namenode.stale.datanode.interval";
public static final long DFS_NAMENODE_STALE_DATANODE_INTERVAL_DEFAULT = 30 * 1000; // 30s
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
index a72ad64..09d015d 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
@@ -18,8 +18,10 @@
package org.apache.hadoop.hdfs.server.blockmanagement;
import java.util.ArrayList;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
@@ -43,7 +45,15 @@ import com.google.common.annotations.VisibleForTesting;
*/
class HeartbeatManager implements DatanodeStatistics {
static final Logger LOG = LoggerFactory.getLogger(HeartbeatManager.class);
-
+ private static final String REPORT_DELTA_STALE_DN_HEADER =
+ "StaleNodes Report: [New Stale Nodes]: %d";
+ private static final String REPORT_STALE_DN_LINE_ENTRY = "%n\t %s";
+ private static final String REPORT_STALE_DN_LINE_TAIL = ", %s";
+ private static final String REPORT_REMOVE_DEAD_NODE_ENTRY =
+ "StaleNodes Report: [Remove DeadNode]: %s";
+ private static final String REPORT_REMOVE_STALE_NODE_ENTRY =
+ "StaleNodes Report: [Remove StaleNode]: %s";
+ private static final int REPORT_STALE_NODE_NODES_PER_LINE = 10;
/**
* Stores a subset of the datanodeMap in DatanodeManager,
* containing nodes that are considered alive.
@@ -56,14 +66,19 @@ class HeartbeatManager implements DatanodeStatistics {
/** Statistics, which are synchronized by the heartbeat manager lock. */
private final DatanodeStats stats = new DatanodeStats();
- /** The time period to check for expired datanodes */
+ /** The time period to check for expired datanodes. */
private final long heartbeatRecheckInterval;
- /** Heartbeat monitor thread */
+ /** Heartbeat monitor thread. */
private final Daemon heartbeatThread = new Daemon(new Monitor());
private final StopWatch heartbeatStopWatch = new StopWatch();
final Namesystem namesystem;
final BlockManager blockManager;
+ /** Enable log for datanode staleness. */
+ private final boolean enableLogStaleNodes;
+
+ /** Datanodes currently considered stale; used for stale-node reports. */
+ private final Set<DatanodeDescriptor> staleDataNodes = new HashSet<>();
HeartbeatManager(final Namesystem namesystem,
final BlockManager blockManager, final Configuration conf) {
@@ -78,6 +93,9 @@ class HeartbeatManager implements DatanodeStatistics {
long staleInterval = conf.getLong(
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY,
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_DEFAULT);// 30s
+ enableLogStaleNodes = conf.getBoolean(
+ DFSConfigKeys.DFS_NAMENODE_ENABLE_LOG_STALE_DATANODE_KEY,
+ DFSConfigKeys.DFS_NAMENODE_ENABLE_LOG_STALE_DATANODE_DEFAULT);
if (avoidStaleDataNodesForWrite && staleInterval < recheckInterval) {
this.heartbeatRecheckInterval = staleInterval;
@@ -223,6 +241,7 @@ class HeartbeatManager implements DatanodeStatistics {
if (node.isAlive()) {
stats.subtract(node);
datanodes.remove(node);
+ removeNodeFromStaleList(node);
node.setAlive(false);
}
}
@@ -319,6 +338,59 @@ class HeartbeatManager implements DatanodeStatistics {
}
/**
+ * Remove a dead node from staleDataNodes if it is present.
+ * This method assumes that it is called inside a synchronized block.
+ *
+ * @param d descriptor of the dead node to remove from the stale set.
+ * @return true if the node was in the stale set and has been removed.
+ */
+ private boolean removeNodeFromStaleList(DatanodeDescriptor d) {
+ return removeNodeFromStaleList(d, true);
+ }
+
+ /**
+ * Remove a node from staleDataNodes if it is present.
+ * If stale-node logging is enabled and the node was actually removed, an
+ * INFO line reports it as either a dead-node or a stale-node removal.
+ * This method assumes that it is called inside a synchronized block.
+ *
+ * @param d descriptor of the node to remove from the stale set.
+ * @param isDead true to log a dead-node removal, false a stale-node removal.
+ * @return true if the node was in the stale set and has been removed.
+ */
+ private boolean removeNodeFromStaleList(DatanodeDescriptor d,
+ boolean isDead) {
+ boolean result = false;
+ result = staleDataNodes.remove(d);
+ if (enableLogStaleNodes && result) {
+ LOG.info(String.format(isDead ?
+ REPORT_REMOVE_DEAD_NODE_ENTRY : REPORT_REMOVE_STALE_NODE_ENTRY,
+ d));
+ }
+ return result;
+ }
+
+ /**
+ * Log the newly stale datanodes, if logging is enabled and the list is not empty.
+ *
+ * @param staleNodes datanodes newly added to the stale set by the last check.
+ */
+ private void dumpStaleNodes(List<DatanodeDescriptor> staleNodes) {
+ // header with the count, then REPORT_STALE_NODE_NODES_PER_LINE nodes per line
+ if (enableLogStaleNodes && (!staleNodes.isEmpty())) {
+ StringBuilder staleLogMSG =
+ new StringBuilder(String.format(REPORT_DELTA_STALE_DN_HEADER,
+ staleNodes.size()));
+ for (int ind = 0; ind < staleNodes.size(); ind++) {
+ String logFormat = (ind % REPORT_STALE_NODE_NODES_PER_LINE == 0) ?
+ REPORT_STALE_DN_LINE_ENTRY : REPORT_STALE_DN_LINE_TAIL;
+ staleLogMSG.append(String.format(logFormat, staleNodes.get(ind)));
+ }
+ LOG.info(staleLogMSG.toString());
+ }
+ }
+
+ /**
* Check if there are any expired heartbeats, and if so,
* whether any blocks have to be re-replicated.
* While removing dead datanodes, make sure that only one datanode is marked
@@ -360,9 +432,9 @@ class HeartbeatManager implements DatanodeStatistics {
// locate the first failed storage that isn't on a dead node.
DatanodeStorageInfo failedStorage = null;
- // check the number of stale nodes
- int numOfStaleNodes = 0;
+ // check the number of stale storages
int numOfStaleStorages = 0;
+ List<DatanodeDescriptor> staleNodes = new ArrayList<>();
synchronized(this) {
for (DatanodeDescriptor d : datanodes) {
// check if an excessive GC pause has occurred
@@ -372,9 +444,19 @@ class HeartbeatManager implements DatanodeStatistics {
if (dead == null && dm.isDatanodeDead(d)) {
stats.incrExpiredHeartbeats();
dead = d;
- }
- if (d.isStale(dm.getStaleInterval())) {
- numOfStaleNodes++;
+ // remove the node from stale list to adjust the stale list size
+ // before setting the stale count of the DatanodeManager
+ removeNodeFromStaleList(d);
+ } else {
+ if (d.isStale(dm.getStaleInterval())) {
+ if (staleDataNodes.add(d)) {
+ // the node is newly stale; include it in this check's report
+ staleNodes.add(d);
+ }
+ } else {
+ // remove the node if it is no longer stale
+ removeNodeFromStaleList(d, false);
+ }
}
DatanodeStorageInfo[] storageInfos = d.getStorageInfos();
for(DatanodeStorageInfo storageInfo : storageInfos) {
@@ -388,18 +470,21 @@ class HeartbeatManager implements DatanodeStatistics {
failedStorage = storageInfo;
}
}
-
}
// Set the number of stale nodes in the DatanodeManager
- dm.setNumStaleNodes(numOfStaleNodes);
+ dm.setNumStaleNodes(staleDataNodes.size());
dm.setNumStaleStorages(numOfStaleStorages);
}
+ // log nodes detected as stale since last heartBeat
+ dumpStaleNodes(staleNodes);
+
allAlive = dead == null && failedStorage == null;
if (!allAlive && namesystem.isInStartupSafeMode()) {
return;
}
+
if (dead != null) {
// acquire the fsnamesystem lock, and then remove the dead node.
namesystem.writeLock();
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
index 9c3ef5a..2fce9a5 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
@@ -1889,6 +1889,14 @@
</property>
<property>
+ <name>dfs.namenode.enable.log.stale.datanode</name>
+ <value>false</value>
+ <description>
+ Whether to log datanodes as they are added to or removed from the stale list. Disabled by default.
+ </description>
+</property>
+
+<property>
<name>dfs.namenode.stale.datanode.interval</name>
<value>30000</value>
<description>
---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org