You are viewing a plain text version of this content. The canonical link for it is here.
Posted to hdfs-commits@hadoop.apache.org by vi...@apache.org on 2014/08/04 10:43:52 UTC
svn commit: r1615504 - in
/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs: ./
src/main/java/org/apache/hadoop/hdfs/server/datanode/
src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/
src/test/java/org/apache/hadoop/hdfs/server/datanode/
Author: vinayakumarb
Date: Mon Aug 4 08:43:51 2014
New Revision: 1615504
URL: http://svn.apache.org/r1615504
Log:
HDFS-5185. DN fails to startup if one of the data dir is full. Contributed by Vinayakumar B.
Modified:
hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java
hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java
hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDiskError.java
Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt?rev=1615504&r1=1615503&r2=1615504&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt Mon Aug 4 08:43:51 2014
@@ -433,6 +433,8 @@ Release 2.6.0 - UNRELEASED
HDFS-5723. Append failed FINALIZED replica should not be accepted as valid
when that block is underconstruction (vinayakumarb)
+ HDFS-5185. DN fails to startup if one of the data dir is full. (vinayakumarb)
+
Release 2.5.0 - UNRELEASED
INCOMPATIBLE CHANGES
Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java?rev=1615504&r1=1615503&r2=1615504&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java Mon Aug 4 08:43:51 2014
@@ -253,7 +253,7 @@ class BlockReceiver implements Closeable
if (cause != null) { // possible disk error
ioe = cause;
- datanode.checkDiskError();
+ datanode.checkDiskErrorAsync();
}
throw ioe;
@@ -329,7 +329,7 @@ class BlockReceiver implements Closeable
}
// disk check
if(ioe != null) {
- datanode.checkDiskError();
+ datanode.checkDiskErrorAsync();
throw ioe;
}
}
@@ -639,7 +639,7 @@ class BlockReceiver implements Closeable
manageWriterOsCache(offsetInBlock);
}
} catch (IOException iex) {
- datanode.checkDiskError();
+ datanode.checkDiskErrorAsync();
throw iex;
}
}
@@ -1208,7 +1208,7 @@ class BlockReceiver implements Closeable
} catch (IOException e) {
LOG.warn("IOException in BlockReceiver.run(): ", e);
if (running) {
- datanode.checkDiskError();
+ datanode.checkDiskErrorAsync();
LOG.info(myString, e);
running = false;
if (!Thread.interrupted()) { // failure not caused by interruption
Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java?rev=1615504&r1=1615503&r2=1615504&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java Mon Aug 4 08:43:51 2014
@@ -1075,6 +1075,11 @@ public class DataNode extends Configured
// In the case that this is the first block pool to connect, initialize
// the dataset, block scanners, etc.
initStorage(nsInfo);
+
+ // Exclude failed disks before initializing the block pools to avoid startup
+ // failures.
+ checkDiskError();
+
initPeriodicScanners(conf);
data.addBlockPool(nsInfo.getBlockPoolID(), conf);
@@ -1510,9 +1515,9 @@ public class DataNode extends Configured
/**
- * Check if there is a disk failure and if so, handle the error
+ * Check if there is a disk failure asynchronously and if so, handle the error
*/
- public void checkDiskError() {
+ public void checkDiskErrorAsync() {
synchronized(checkDiskErrorMutex) {
checkDiskErrorFlag = true;
if(checkDiskErrorThread == null) {
@@ -1821,7 +1826,7 @@ public class DataNode extends Configured
LOG.warn(bpReg + ":Failed to transfer " + b + " to " +
targets[0] + " got ", ie);
// check if there are any disk problem
- checkDiskError();
+ checkDiskErrorAsync();
} finally {
xmitsInProgress.getAndDecrement();
IOUtils.closeStream(blockSender);
@@ -2759,7 +2764,18 @@ public class DataNode extends Configured
public ShortCircuitRegistry getShortCircuitRegistry() {
return shortCircuitRegistry;
}
-
+
+ /**
+ * Check the disk error
+ */
+ private void checkDiskError() {
+ try {
+ data.checkDataDir();
+ } catch (DiskErrorException de) {
+ handleDiskError(de.getMessage());
+ }
+ }
+
/**
* Starts a new thread which will check for disk error check request
* every 5 sec
@@ -2776,9 +2792,7 @@ public class DataNode extends Configured
}
if(tempFlag) {
try {
- data.checkDataDir();
- } catch (DiskErrorException de) {
- handleDiskError(de.getMessage());
+ checkDiskError();
} catch (Exception e) {
LOG.warn("Unexpected exception occurred while checking disk error " + e);
checkDiskErrorThread = null;
Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java?rev=1615504&r1=1615503&r2=1615504&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java Mon Aug 4 08:43:51 2014
@@ -1151,7 +1151,7 @@ class FsDatasetImpl implements FsDataset
return f;
// if file is not null, but doesn't exist - possibly disk failed
- datanode.checkDiskError();
+ datanode.checkDiskErrorAsync();
}
if (LOG.isDebugEnabled()) {
Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDiskError.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDiskError.java?rev=1615504&r1=1615503&r2=1615504&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDiskError.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDiskError.java Mon Aug 4 08:43:51 2014
@@ -201,7 +201,7 @@ public class TestDiskError {
}
/**
- * Checks whether {@link DataNode#checkDiskError()} is being called or not.
+ * Checks whether {@link DataNode#checkDiskErrorAsync()} is being called or not.
* Before refactoring the code the above function was not getting called
* @throws IOException, InterruptedException
*/
@@ -214,7 +214,7 @@ public class TestDiskError {
DataNode dataNode = cluster.getDataNodes().get(0);
long slackTime = dataNode.checkDiskErrorInterval/2;
//checking for disk error
- dataNode.checkDiskError();
+ dataNode.checkDiskErrorAsync();
Thread.sleep(dataNode.checkDiskErrorInterval);
long lastDiskErrorCheck = dataNode.getLastDiskErrorCheck();
assertTrue("Disk Error check is not performed within " + dataNode.checkDiskErrorInterval + " ms", ((Time.monotonicNow()-lastDiskErrorCheck) < (dataNode.checkDiskErrorInterval + slackTime)));