You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by mb...@apache.org on 2012/05/04 12:37:52 UTC
svn commit: r1333866 - in
/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master:
HMaster.java MetaScanner.java RetryableMetaOperation.java RootScanner.java
ServerManager.java
Author: mbautin
Date: Fri May 4 10:37:51 2012
New Revision: 1333866
URL: http://svn.apache.org/viewvc?rev=1333866&view=rev
Log:
[master] Make master's checkFileSystem() not kill the cluster
Summary:
Make checkFileSystem() simply LOG.warn when the file system is down, except when master is really going down.
Test Plan: Bring the master up and bring the namenode down right before one of the following is about to happen: removal of old log files, or log splitting (rename files on startup?). Then look in the log to make sure that a warning gets logged.
Reviewers: pkhemani
Reviewed By: pkhemani
CC: nspiegelberg, kannan, hbase-eng@, pchakka, tao-diffs@lists
Differential Revision: https://phabricator.fb.com/D464000
Task ID: 1044645
Modified:
hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/MetaScanner.java
hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RetryableMetaOperation.java
hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RootScanner.java
hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1333866&r1=1333865&r2=1333866&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Fri May 4 10:37:51 2012
@@ -582,14 +582,20 @@ public class HMaster extends Thread impl
* If not, sets closed
* @return false if file system is not available
*/
- protected boolean checkFileSystem() {
+ protected boolean checkFileSystem(boolean shutdownClusterOnFail) {
if (this.fsOk) {
try {
FSUtils.checkFileSystemAvailable(this.fs);
} catch (IOException e) {
- LOG.fatal("Shutting down HBase cluster: file system not available", e);
- shutdownClusterNow();
- this.fsOk = false;
+ if (shutdownClusterOnFail) {
+ LOG.fatal("Shutting down HBase cluster: file system not available", e);
+ shutdownClusterNow();
+ this.fsOk = false;
+ }
+ else {
+ LOG.warn("File system unavailable, but continuing anyway", e);
+ return false;
+ }
}
}
return this.fsOk;
@@ -756,10 +762,10 @@ public class HMaster extends Thread impl
// If FAILED op processing, bad. Exit.
break FINISHED;
case REQUEUED_BUT_PROBLEM:
- if (!checkFileSystem())
- // If bad filesystem, exit.
- break FINISHED;
- default:
+ // LOG if the file system is down, but don't do anything.
+ checkFileSystem(false);
+ break;
+ default:
// Continue run loop if conditions are PROCESSED, NOOP, REQUEUED
break;
}
@@ -985,7 +991,7 @@ public class HMaster extends Thread impl
/*
* Inspect the log directory to recover any log file without
- * ad active region server.
+ * an active region server.
*/
private void splitLogAfterStartup() {
boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors",
@@ -1031,10 +1037,11 @@ public class HMaster extends Thread impl
LOG.warn("Failed splitting of " + serverNames, ioe);
// reset serverNames
serverNames = new ArrayList<String>();
- if (!checkFileSystem()) {
- LOG.warn("Bad Filesystem, exiting");
- Runtime.getRuntime().halt(1);
- }
+
+ // if the file system is down, then just log it, and retry the log
+ // splitting after 30 seconds if retry splitting is turned on
+ checkFileSystem(false);
+
try {
if (retrySplitting) {
Thread.sleep(30000); //30s
Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/MetaScanner.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/MetaScanner.java?rev=1333866&r1=1333865&r2=1333866&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/MetaScanner.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/MetaScanner.java Fri May 4 10:37:51 2012
@@ -86,11 +86,13 @@ class MetaScanner extends BaseScanner {
return false;
}
// Make sure the file system is still available
- this.master.checkFileSystem();
+ this.master.checkFileSystem(false);
+ // TODO: Should we return false here? See RootScanner for more info.
} catch (Exception e) {
// If for some reason we get some other kind of exception,
// at least log it rather than go out silently.
LOG.error("Unexpected exception", e);
+ // TODO: Should we return false here? See RootScanner for more info.
}
return true;
}
Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RetryableMetaOperation.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RetryableMetaOperation.java?rev=1333866&r1=1333865&r2=1333866&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RetryableMetaOperation.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RetryableMetaOperation.java Fri May 4 10:37:51 2012
@@ -86,7 +86,7 @@ abstract class RetryableMetaOperation<T>
}
LOG.debug(message);
}
- this.master.checkFileSystem();
+ this.master.checkFileSystem(false);
throw e;
}
if (LOG.isDebugEnabled()) {
Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RootScanner.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RootScanner.java?rev=1333866&r1=1333865&r2=1333866&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RootScanner.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RootScanner.java Fri May 4 10:37:51 2012
@@ -58,12 +58,19 @@ class RootScanner extends BaseScanner {
} catch (IOException e) {
e = RemoteExceptionHandler.checkIOException(e);
LOG.warn("Scan ROOT region", e);
- // Make sure the file system is still available
- master.checkFileSystem();
+ // Make sure the file system is still available, but don't do anything
+ // if it's not available.
+ master.checkFileSystem(false);
+ // TODO: we used to ignore this. Now, we'll enter an infinite loop if
+ // this is an idempotent problem but the web ui will be up.
+ // Revisit this later
+ return false;
} catch (Exception e) {
// If for some reason we get some other kind of exception,
// at least log it rather than go out silently.
LOG.error("Unexpected exception", e);
+ // TODO: See above
+ return false;
}
return true;
}
Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=1333866&r1=1333865&r2=1333866&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Fri May 4 10:37:51 2012
@@ -926,7 +926,7 @@ public class ServerManager {
* a MSG_REGIONSERVER_STOP.
*/
void letRegionServersShutdown() {
- if (!master.checkFileSystem()) {
+ if (!master.checkFileSystem(true)) {
// Forget waiting for the region servers if the file system has gone
// away. Just exit as quickly as possible.
return;