Posted to common-commits@hadoop.apache.org by sh...@apache.org on 2017/09/06 22:27:10 UTC
hadoop git commit: HDFS-12131. Add some of the FSNamesystem JMX values as metrics. Contributed by Erik Krogen.
Repository: hadoop
Updated Branches:
refs/heads/branch-2.7 03892df21 -> 0dca198f0
HDFS-12131. Add some of the FSNamesystem JMX values as metrics. Contributed by Erik Krogen.
(cherry picked from commit f4c6b00a9f48ae7667db4035b641769efc3bb7cf)
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/0dca198f
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/0dca198f
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/0dca198f
Branch: refs/heads/branch-2.7
Commit: 0dca198f097276bbe32dcdcdc041417312608bd7
Parents: 03892df
Author: Andrew Wang <wa...@apache.org>
Authored: Thu Aug 3 15:45:47 2017 -0700
Committer: Konstantin V Shvachko <sh...@apache.org>
Committed: Wed Sep 6 15:18:28 2017 -0700
----------------------------------------------------------------------
 .../hadoop-common/src/site/markdown/Metrics.md  |   8 ++
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt     |   3 +
 .../hdfs/server/namenode/FSNamesystem.java      |  14 ++
 .../org/apache/hadoop/hdfs/MiniDFSCluster.java  |   6 +-
 .../namenode/metrics/TestNameNodeMetrics.java   | 128 ++++++++++++++++++-
 5 files changed, 157 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hadoop/blob/0dca198f/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
----------------------------------------------------------------------
diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
index 88ed6f6..81539a9 100644
--- a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
+++ b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md
@@ -230,7 +230,15 @@ Each metrics record contains tags such as HAState and Hostname as additional inf
 | `PendingDataNodeMessageCount` | (HA-only) Current number of pending block-related messages for later processing in the standby NameNode |
 | `MillisSinceLastLoadedEdits` | (HA-only) Time in milliseconds since the standby NameNode last loaded the edit log. In the active NameNode, set to 0 |
 | `BlockCapacity` | Current block capacity |
+| `NumLiveDataNodes` | Number of datanodes which are currently live |
+| `NumDeadDataNodes` | Number of datanodes which are currently dead |
+| `NumDecomLiveDataNodes` | Number of datanodes which have been decommissioned and are now live |
+| `NumDecomDeadDataNodes` | Number of datanodes which have been decommissioned and are now dead |
+| `NumDecommissioningDataNodes` | Number of datanodes in decommissioning state |
+| `VolumeFailuresTotal` | Total number of volume failures across all Datanodes |
+| `EstimatedCapacityLostTotal` | An estimate of the total capacity lost due to volume failures |
 | `StaleDataNodes` | Current number of DataNodes marked stale due to delayed heartbeat |
+| `NumStaleStorages` | Number of storages marked as content stale (after NameNode restart/failover before first block report is received) |
 | `TotalFiles` | Current number of files and directories (same as FilesTotal) |
 | `LockQueueLength` | Number of threads waiting to acquire FSNameSystem lock |
 
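
Note: these values were already visible through the NameNode's
"Hadoop:service=NameNode,name=FSNamesystemState" JMX bean; the change above
additionally publishes them through the metrics2 system so that configured
sinks receive them. A minimal JMX client sketch that reads two of the values
remotely (the host and port are illustrative and assume a remote JMX
connector is enabled on the NameNode):

    import javax.management.MBeanServerConnection;
    import javax.management.ObjectName;
    import javax.management.remote.JMXConnector;
    import javax.management.remote.JMXConnectorFactory;
    import javax.management.remote.JMXServiceURL;

    public class NameNodeGaugeReader {
      public static void main(String[] args) throws Exception {
        // Illustrative address; point this at a NameNode with remote JMX enabled.
        JMXServiceURL url = new JMXServiceURL(
            "service:jmx:rmi:///jndi/rmi://namenode.example.com:8004/jmxrmi");
        try (JMXConnector jmxc = JMXConnectorFactory.connect(url)) {
          MBeanServerConnection conn = jmxc.getMBeanServerConnection();
          ObjectName fsState =
              new ObjectName("Hadoop:service=NameNode,name=FSNamesystemState");
          // The same getters annotated in the patch back these attributes.
          System.out.println("NumLiveDataNodes = "
              + conn.getAttribute(fsState, "NumLiveDataNodes"));
          System.out.println("VolumeFailuresTotal = "
              + conn.getAttribute(fsState, "VolumeFailuresTotal"));
        }
      }
    }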
http://git-wip-us.apache.org/repos/asf/hadoop/blob/0dca198f/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index d1ebdcd..3ce47d7 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -12,6 +12,9 @@ Release 2.7.5 - UNRELEASED
 
     HDFS-8797. WebHdfsFileSystem creates too many connections for pread. (jing9)
 
+    HDFS-12131. Add some of the FSNamesystem JMX values as metrics.
+    (Erik Krogen via wang, shv)
+
   OPTIMIZATIONS
 
   BUG FIXES
http://git-wip-us.apache.org/repos/asf/hadoop/blob/0dca198f/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
index ce8878f..cb0c7a3 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
@@ -6030,16 +6030,20 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumLiveDataNodes", "Number of datanodes which are currently live"})
   public int getNumLiveDataNodes() {
     return getBlockManager().getDatanodeManager().getNumLiveDataNodes();
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumDeadDataNodes", "Number of datanodes which are currently dead"})
   public int getNumDeadDataNodes() {
     return getBlockManager().getDatanodeManager().getNumDeadDataNodes();
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumDecomLiveDataNodes",
+      "Number of datanodes which have been decommissioned and are now live"})
   public int getNumDecomLiveDataNodes() {
     final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
     getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
@@ -6051,6 +6055,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumDecomDeadDataNodes",
+      "Number of datanodes which have been decommissioned and are now dead"})
   public int getNumDecomDeadDataNodes() {
     final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
     getBlockManager().getDatanodeManager().fetchDatanodes(null, dead, false);
@@ -6062,6 +6068,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"VolumeFailuresTotal",
+      "Total number of volume failures across all Datanodes"})
   public int getVolumeFailuresTotal() {
     List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
     getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
@@ -6073,6 +6081,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"EstimatedCapacityLostTotal",
+      "An estimate of the total capacity lost due to volume failures"})
   public long getEstimatedCapacityLostTotal() {
     List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
     getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
@@ -6088,6 +6098,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
   }
 
   @Override // FSNamesystemMBean
+  @Metric({"NumDecommissioningDataNodes",
+      "Number of datanodes in decommissioning state"})
   public int getNumDecommissioningDataNodes() {
     return getBlockManager().getDatanodeManager().getDecommissioningNodes()
         .size();
@@ -6105,6 +6117,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
    * before NN receives the first Heartbeat followed by the first Blockreport.
    */
   @Override // FSNamesystemMBean
+  @Metric({"NumStaleStorages",
+      "Number of storages marked as content stale"})
   public int getNumStaleStorages() {
     return getBlockManager().getDatanodeManager().getNumStaleStorages();
   }
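
Note: the @Metric annotation does the heavy lifting here. FSNamesystem's
class carries @Metrics(context="dfs"), and when the object is registered with
the metrics system, metrics2 builds a source from the annotated getters by
reflection and reads them as gauges on every snapshot. A minimal standalone
sketch of the same mechanism (the class, daemon, and metric names are made up
for illustration):

    import org.apache.hadoop.metrics2.MetricsSystem;
    import org.apache.hadoop.metrics2.annotation.Metric;
    import org.apache.hadoop.metrics2.annotation.Metrics;
    import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;

    @Metrics(name = "ExampleSource", context = "dfs")
    public class ExampleSource {
      // Same shape as the annotations above: {name, description}.
      @Metric({"NumWidgets", "Number of widgets currently tracked"})
      public int getNumWidgets() {
        return 42; // FSNamesystem delegates to the DatanodeManager instead
      }

      public static void main(String[] args) {
        MetricsSystem ms = DefaultMetricsSystem.initialize("ExampleDaemon");
        // register(...) builds a MetricsSource from the annotations; each
        // metrics snapshot then invokes the annotated getter.
        ms.register("ExampleSource", "An example annotated source",
            new ExampleSource());
      }
    }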
http://git-wip-us.apache.org/repos/asf/hadoop/blob/0dca198f/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
index 1739821..15ecf0197 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
@@ -142,6 +142,8 @@ public class MiniDFSCluster {
   public static final String HDFS_MINIDFS_BASEDIR = "hdfs.minidfs.basedir";
   public static final String DFS_NAMENODE_SAFEMODE_EXTENSION_TESTING_KEY
       = DFS_NAMENODE_SAFEMODE_EXTENSION_KEY + ".testing";
+  public static final String DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY
+      = DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY + ".testing";
 
   // Changing this default may break some tests that assume it is 2.
   private static final int DEFAULT_STORAGES_PER_DATANODE = 2;
@@ -788,7 +790,9 @@ public class MiniDFSCluster {
     int safemodeExtension = conf.getInt(
         DFS_NAMENODE_SAFEMODE_EXTENSION_TESTING_KEY, 0);
     conf.setInt(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, safemodeExtension);
-    conf.setInt(DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY, 3); // 3 second
+    int decommissionInterval = conf.getInt(
+        DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY, 3);
+    conf.setInt(DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY, decommissionInterval);
     conf.setClass(NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY,
         StaticMapping.class, DNSToSwitchMapping.class);
 
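
Note: previously MiniDFSCluster unconditionally forced
dfs.namenode.decommission.interval to 3 seconds, so a test could not slow the
decommission monitor down; the new ".testing" key lets a test pick its own
value before the cluster starts. A short sketch of how it is used (mirroring
the 9999999 value that TestNameNodeMetrics sets below to effectively disable
the monitor and drive decommission checks by hand):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.HdfsConfiguration;
    import org.apache.hadoop.hdfs.MiniDFSCluster;

    public class DecomIntervalExample {
      public static void main(String[] args) throws Exception {
        Configuration conf = new HdfsConfiguration();
        // Keep the decommission monitor effectively idle for this cluster.
        conf.setInt(
            MiniDFSCluster.DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY,
            9999999);
        MiniDFSCluster cluster =
            new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
        try {
          cluster.waitActive();
        } finally {
          cluster.shutdown();
        }
      }
    }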
http://git-wip-us.apache.org/repos/asf/hadoop/blob/0dca198f/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
index ad4c171..8665834 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
@@ -24,10 +24,15 @@ import static org.apache.hadoop.test.MetricsAsserts.assertQuantileGauges;
 import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
 import static org.junit.Assert.assertTrue;
 
+import com.google.common.base.Joiner;
 import java.io.DataInputStream;
+import java.io.File;
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Random;
 
+import org.apache.commons.io.FileUtils;
 import org.apache.commons.logging.LogFactory;
 import org.apache.commons.logging.impl.Log4JLogger;
 import org.apache.hadoop.conf.Configuration;
@@ -46,6 +51,8 @@ import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
 import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
 import org.apache.hadoop.hdfs.server.datanode.DataNode;
 import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
+import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
+import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsVolumeImpl;
 import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
 import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
 import org.apache.hadoop.hdfs.server.namenode.top.TopAuditLogger;
@@ -84,6 +91,13 @@ public class TestNameNodeMetrics {
         DFS_REPLICATION_INTERVAL);
     CONF.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY,
         DFS_REPLICATION_INTERVAL);
+    // Set it long enough to essentially disable unless we manually call it
+    // Used for decommissioning DataNode metrics
+    CONF.setInt(MiniDFSCluster.DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY,
+        9999999);
+    // For checking failed volume metrics
+    CONF.setInt(DFSConfigKeys.DFS_DF_INTERVAL_KEY, 1000);
+    CONF.setInt(DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY, 1);
     CONF.set(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY,
         "" + PERCENTILES_INTERVAL);
     // Enable stale DataNodes checking
@@ -97,6 +111,8 @@ public class TestNameNodeMetrics {
   private final Random rand = new Random();
   private FSNamesystem namesystem;
   private BlockManager bm;
+  // List of temporary files on local FileSystem to be cleaned up
+  private List<Path> tempFiles;
 
   private static Path getTestPath(String fileName) {
     return new Path(TEST_ROOT_DIR_PATH, fileName);
@@ -109,6 +125,7 @@ public class TestNameNodeMetrics {
     namesystem = cluster.getNamesystem();
     bm = namesystem.getBlockManager();
     fs = cluster.getFileSystem();
+    tempFiles = new ArrayList<>();
   }
 
   @After
@@ -120,6 +137,9 @@ public class TestNameNodeMetrics {
       assertQuantileGauges("GetGroups1s", rb);
     }
     cluster.shutdown();
+    for (Path p : tempFiles) {
+      FileUtils.deleteQuietly(new File(p.toUri().getPath()));
+    }
   }
 
   /** create a file with a length of <code>fileLen</code> */
@@ -196,7 +216,113 @@
         .getBlockManager());
     assertGauge("StaleDataNodes", 0, getMetrics(NS_METRICS));
   }
-
+
+  /**
+   * Test metrics associated with volume failures.
+   */
+  @Test
+  public void testVolumeFailures() throws Exception {
+    assertGauge("VolumeFailuresTotal", 0, getMetrics(NS_METRICS));
+    assertGauge("EstimatedCapacityLostTotal", 0L, getMetrics(NS_METRICS));
+    DataNode dn = cluster.getDataNodes().get(0);
+    FsVolumeSpi fsVolume =
+        DataNodeTestUtils.getFSDataset(dn).getVolumes().get(0);
+    File dataDir = new File(fsVolume.getBasePath());
+    long capacity = ((FsVolumeImpl) fsVolume).getCapacity();
+    DataNodeTestUtils.injectDataDirFailure(dataDir);
+    long lastDiskErrorCheck = dn.getLastDiskErrorCheck();
+    dn.checkDiskErrorAsync();
+    while (dn.getLastDiskErrorCheck() == lastDiskErrorCheck) {
+      Thread.sleep(100);
+    }
+    DataNodeTestUtils.triggerHeartbeat(dn);
+    BlockManagerTestUtil.checkHeartbeat(bm);
+    assertGauge("VolumeFailuresTotal", 1, getMetrics(NS_METRICS));
+    assertGauge("EstimatedCapacityLostTotal", capacity, getMetrics(NS_METRICS));
+  }
+
+  /**
+   * Test metrics associated with liveness and decommission status of DataNodes.
+   */
+  @Test
+  public void testDataNodeLivenessAndDecom() throws Exception {
+    Path hostFileDir = new Path(MiniDFSCluster.getBaseDirectory(), "hosts");
+    FileSystem localFs = FileSystem.getLocal(CONF);
+    localFs.mkdirs(hostFileDir);
+    Path includeFile = new Path(hostFileDir, "include");
+    Path excludeFile = new Path(hostFileDir, "exclude");
+    tempFiles.add(includeFile);
+    tempFiles.add(excludeFile);
+    CONF.set(DFSConfigKeys.DFS_HOSTS, includeFile.toUri().getPath());
+    CONF.set(DFSConfigKeys.DFS_HOSTS_EXCLUDE, excludeFile.toUri().getPath());
+
+    List<DataNode> dataNodes = cluster.getDataNodes();
+    DatanodeDescriptor[] dnDescriptors = new DatanodeDescriptor[DATANODE_COUNT];
+    String[] dnAddresses = new String[DATANODE_COUNT];
+    for (int i = 0; i < DATANODE_COUNT; i++) {
+      dnDescriptors[i] = bm.getDatanodeManager()
+          .getDatanode(dataNodes.get(i).getDatanodeId());
+      dnAddresses[i] = dnDescriptors[i].getXferAddr();
+    }
+    // First put all DNs into include
+    DFSTestUtil.writeFile(localFs, includeFile,
+        Joiner.on("\n").join(dnAddresses));
+    DFSTestUtil.writeFile(localFs, excludeFile, "");
+    bm.getDatanodeManager().refreshNodes(CONF);
+    assertGauge("NumDecomLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumLiveDataNodes", DATANODE_COUNT, getMetrics(NS_METRICS));
+
+    // Now decommission one DN
+    DFSTestUtil.writeFile(localFs, excludeFile, dnAddresses[0]);
+    bm.getDatanodeManager().refreshNodes(CONF);
+    assertGauge("NumDecommissioningDataNodes", 1, getMetrics(NS_METRICS));
+    BlockManagerTestUtil.recheckDecommissionState(bm.getDatanodeManager());
+    assertGauge("NumDecommissioningDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDecomLiveDataNodes", 1, getMetrics(NS_METRICS));
+    assertGauge("NumLiveDataNodes", DATANODE_COUNT, getMetrics(NS_METRICS));
+
+    // Now kill all DNs by expiring their heartbeats
+    for (int i = 0; i < DATANODE_COUNT; i++) {
+      DataNodeTestUtils.setHeartbeatsDisabledForTests(dataNodes.get(i), true);
+      long expireInterval = CONF.getLong(
+          DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY,
+          DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_DEFAULT) * 2L
+          + CONF.getLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,
+          DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_DEFAULT) * 10 * 1000L;
+      DFSTestUtil.resetLastUpdatesWithOffset(dnDescriptors[i],
+          -(expireInterval + 1));
+    }
+    BlockManagerTestUtil.checkHeartbeat(bm);
+    assertGauge("NumDecomLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDecomDeadDataNodes", 1, getMetrics(NS_METRICS));
+    assertGauge("NumLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDeadDataNodes", DATANODE_COUNT, getMetrics(NS_METRICS));
+
+    // Now remove the decommissioned DN altogether
+    String[] includeHosts = new String[dnAddresses.length - 1];
+    for (int i = 0; i < includeHosts.length; i++) {
+      includeHosts[i] = dnAddresses[i + 1];
+    }
+    DFSTestUtil.writeFile(localFs, includeFile,
+        Joiner.on("\n").join(includeHosts));
+    // Just init to a nonexistent host to clear out the previous exclusion
+    DFSTestUtil.writeFile(localFs, excludeFile, "");
+    bm.getDatanodeManager().refreshNodes(CONF);
+    assertGauge("NumDecomLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDecomDeadDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDeadDataNodes", DATANODE_COUNT - 1, getMetrics(NS_METRICS));
+
+    // Finally mark the remaining DNs as live again
+    for (int i = 1; i < dataNodes.size(); i++) {
+      DataNodeTestUtils.setHeartbeatsDisabledForTests(dataNodes.get(i), false);
+      DFSTestUtil.resetLastUpdatesWithOffset(dnDescriptors[i], 0);
+    }
+    BlockManagerTestUtil.checkHeartbeat(bm);
+    assertGauge("NumLiveDataNodes", DATANODE_COUNT - 1, getMetrics(NS_METRICS));
+    assertGauge("NumDeadDataNodes", 0, getMetrics(NS_METRICS));
+  }
+
   /** Test metrics associated with addition of a file */
   @Test
   public void testFileAdd() throws Exception {
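
Note: once the gauges are registered, any configured metrics2 sink picks them
up along with the rest of the FSNamesystem record. For example, entries like
the following in hadoop-metrics2.properties (patterned on the sample file
shipped with Hadoop; the file name and period are illustrative) would write
the new gauges to a local file every 10 seconds:

    *.sink.file.class=org.apache.hadoop.metrics2.sink.FileSink
    *.period=10
    namenode.sink.file.filename=namenode-metrics.out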