You are viewing a plain text version of this content. The canonical link for it is here.
Posted to hdfs-commits@hadoop.apache.org by to...@apache.org on 2012/01/11 06:55:32 UTC
svn commit: r1229898 - in
/hadoop/common/branches/HDFS-1623/hadoop-hdfs-project/hadoop-hdfs: ./
src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/
src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/
Author: todd
Date: Wed Jan 11 05:55:32 2012
New Revision: 1229898
URL: http://svn.apache.org/viewvc?rev=1229898&view=rev
Log:
HDFS-2753. Fix standby getting stuck in safemode when blocks are written while SBN is down. Contributed by Hari Mankude and Todd Lipcon.
Modified:
hadoop/common/branches/HDFS-1623/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt
hadoop/common/branches/HDFS-1623/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
hadoop/common/branches/HDFS-1623/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java
hadoop/common/branches/HDFS-1623/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java
Modified: hadoop/common/branches/HDFS-1623/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1623/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt?rev=1229898&r1=1229897&r2=1229898&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1623/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt (original)
+++ hadoop/common/branches/HDFS-1623/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt Wed Jan 11 05:55:32 2012
@@ -93,3 +93,5 @@ HDFS-2730. Refactor shared HA-related te
HDFS-2762. Fix TestCheckpoint timing out on HA branch. (Uma Maheswara Rao G via todd)
HDFS-2724. NN web UI can throw NPE after startup, before standby state is entered. (todd)
+
+HDFS-2753. Fix standby getting stuck in safemode when blocks are written while SBN is down. (Hari Mankude and todd via todd)
Modified: hadoop/common/branches/HDFS-1623/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1623/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java?rev=1229898&r1=1229897&r2=1229898&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1623/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java (original)
+++ hadoop/common/branches/HDFS-1623/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java Wed Jan 11 05:55:32 2012
@@ -1361,7 +1361,7 @@ public class BlockManager {
// To minimize startup time, we discard any second (or later) block reports
// that we receive while still in startup phase.
- if (namesystem.isInStartupSafeMode() && node.numBlocks() > 0) {
+ if (namesystem.isInStartupSafeMode() && !node.isFirstBlockReport()) {
NameNode.stateChangeLog.info("BLOCK* processReport: "
+ "discarded non-initial block report from " + nodeID.getName()
+ " because namenode still in startup phase");
Modified: hadoop/common/branches/HDFS-1623/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1623/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java?rev=1229898&r1=1229897&r2=1229898&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1623/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java (original)
+++ hadoop/common/branches/HDFS-1623/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java Wed Jan 11 05:55:32 2012
@@ -151,6 +151,10 @@ public class DatanodeDescriptor extends
private long lastBlocksScheduledRollTime = 0;
private static final int BLOCKS_SCHEDULED_ROLL_INTERVAL = 600*1000; //10min
private int volumeFailures = 0;
+
+ /** Set to false after processing first block report */
+ private boolean firstBlockReport = true;
+
/**
* When set to true, the node is not in include list and is not allowed
* to communicate with the namenode
@@ -608,6 +612,11 @@ public class DatanodeDescriptor extends
if (heartbeatedSinceFailover) {
blockContentsStale = false;
}
+ firstBlockReport = false;
+ }
+
+ boolean isFirstBlockReport() {
+ return firstBlockReport;
}
@Override
Modified: hadoop/common/branches/HDFS-1623/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1623/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java?rev=1229898&r1=1229897&r2=1229898&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1623/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java (original)
+++ hadoop/common/branches/HDFS-1623/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java Wed Jan 11 05:55:32 2012
@@ -192,7 +192,7 @@ public class TestHASafeMode {
* knows there should only be 90 blocks, but it's still in safemode.
* 8. NN2 doesn't ever recheck whether it should leave safemode.
*
- * This is essentially the inverse of {@link #testBlocksAddedWhileStandbyShutdown()}
+ * This is essentially the inverse of {@link #testBlocksAddedBeforeStandbyRestart()}
*/
@Test
public void testBlocksRemovedBeforeStandbyRestart() throws Exception {
@@ -329,6 +329,39 @@ public class TestHASafeMode {
}
/**
+ * Regression test for HDFS-2753. In this bug, the following sequence was
+ * observed:
+ * - Some blocks are written to DNs while the SBN was down. This causes
+ * the blockReceived messages to get queued in the BPServiceActor on the
+ * DN.
+ * - When the SBN returns, the DN re-registers with the SBN, and then
+ * flushes its blockReceived queue to the SBN before it sends its
+ * first block report. This caused the first block report to be
+ * incorrectly ignored.
+ * - The SBN would become stuck in safemode.
+ */
+ @Test
+ public void testBlocksAddedWhileStandbyIsDown() throws Exception {
+ DFSTestUtil.createFile(fs, new Path("/test"), 3*BLOCK_SIZE, (short) 3, 1L);
+
+ banner("Stopping standby");
+ cluster.shutdownNameNode(1);
+
+ DFSTestUtil.createFile(fs, new Path("/test2"), 3*BLOCK_SIZE, (short) 3, 1L);
+
+ banner("Rolling edit log so standby gets all edits on restart");
+ nn0.getRpcServer().rollEditLog();
+
+ restartStandby();
+ String status = nn1.getNamesystem().getSafemode();
+ assertTrue("Bad safemode status: '" + status + "'",
+ status.startsWith(
+ "Safe mode is ON." +
+ "The reported blocks 6 has reached the threshold 0.9990 of " +
+ "total blocks 6. Safe mode will be turned off automatically"));
+ }
+
+ /**
* Print a big banner in the test log to make debug easier.
*/
static void banner(String string) {