You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by mb...@apache.org on 2012/05/04 12:37:52 UTC

svn commit: r1333866 - in /hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master: HMaster.java MetaScanner.java RetryableMetaOperation.java RootScanner.java ServerManager.java

Author: mbautin
Date: Fri May  4 10:37:51 2012
New Revision: 1333866

URL: http://svn.apache.org/viewvc?rev=1333866&view=rev
Log:
[master] Make master's checkFileSystem() not kill the cluster

Summary:
  Make checkFileSystem() simply LOG.warn when the file system is down, except when the master is really going down.

Test Plan: Bring the master up, then bring the namenode down right before one of the following is about to happen: removal of old log files, or log splitting (rename files on startup?). Then look in the log to make sure that a warning gets logged.

Reviewers: pkhemani

Reviewed By: pkhemani

CC: nspiegelberg, kannan, hbase-eng@, pchakka, tao-diffs@lists

Differential Revision: https://phabricator.fb.com/D464000

Task ID: 1044645

Modified:
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/MetaScanner.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RetryableMetaOperation.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RootScanner.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1333866&r1=1333865&r2=1333866&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Fri May  4 10:37:51 2012
@@ -582,14 +582,20 @@ public class HMaster extends Thread impl
    * If not, sets closed
    * @return false if file system is not available
    */
-  protected boolean checkFileSystem() {
+  protected boolean checkFileSystem(boolean shutdownClusterOnFail) {
     if (this.fsOk) {
       try {
         FSUtils.checkFileSystemAvailable(this.fs);
       } catch (IOException e) {
-        LOG.fatal("Shutting down HBase cluster: file system not available", e);
-        shutdownClusterNow();
-        this.fsOk = false;
+        if (shutdownClusterOnFail) {
+          LOG.fatal("Shutting down HBase cluster: file system not available", e);
+          shutdownClusterNow();
+          this.fsOk = false;
+        }
+        else {
+          LOG.warn("File system unavailable, but continuing anyway", e);
+          return false;
+        }
       }
     }
     return this.fsOk;
@@ -756,10 +762,10 @@ public class HMaster extends Thread impl
             // If FAILED op processing, bad. Exit.
           break FINISHED;
         case REQUEUED_BUT_PROBLEM:
-          if (!checkFileSystem())
-              // If bad filesystem, exit.
-            break FINISHED;
-          default:
+          // LOG if the file system is down, but don't do anything.
+          checkFileSystem(false);
+          break;
+        default:
             // Continue run loop if conditions are PROCESSED, NOOP, REQUEUED
           break;
         }
@@ -985,7 +991,7 @@ public class HMaster extends Thread impl
 
   /*
    * Inspect the log directory to recover any log file without
-   * ad active region server.
+   * an active region server.
    */
   private void splitLogAfterStartup() {
     boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors",
@@ -1031,10 +1037,11 @@ public class HMaster extends Thread impl
           LOG.warn("Failed splitting of " + serverNames, ioe);
           // reset serverNames
           serverNames = new ArrayList<String>();
-          if (!checkFileSystem()) {
-            LOG.warn("Bad Filesystem, exiting");
-            Runtime.getRuntime().halt(1);
-          }
+
+          // if the file system is down, then just log it, and retry the log
+          //  splitting after 30 seconds if retry splitting is turned on
+          checkFileSystem(false);
+
           try {
             if (retrySplitting) {
               Thread.sleep(30000); //30s

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/MetaScanner.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/MetaScanner.java?rev=1333866&r1=1333865&r2=1333866&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/MetaScanner.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/MetaScanner.java Fri May  4 10:37:51 2012
@@ -86,11 +86,13 @@ class MetaScanner extends BaseScanner {
         return false;
       }
       // Make sure the file system is still available
-      this.master.checkFileSystem();
+      this.master.checkFileSystem(false);
+      // TODO: Should we return false here? See RootScanner for more info.
     } catch (Exception e) {
       // If for some reason we get some other kind of exception,
       // at least log it rather than go out silently.
       LOG.error("Unexpected exception", e);
+      // TODO: Should we return false here? See RootScanner for more info.
     }
     return true;
   }

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RetryableMetaOperation.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RetryableMetaOperation.java?rev=1333866&r1=1333865&r2=1333866&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RetryableMetaOperation.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RetryableMetaOperation.java Fri May  4 10:37:51 2012
@@ -86,7 +86,7 @@ abstract class RetryableMetaOperation<T>
             }
             LOG.debug(message);
           }
-          this.master.checkFileSystem();
+          this.master.checkFileSystem(false);
           throw e;
         }
         if (LOG.isDebugEnabled()) {

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RootScanner.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RootScanner.java?rev=1333866&r1=1333865&r2=1333866&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RootScanner.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RootScanner.java Fri May  4 10:37:51 2012
@@ -58,12 +58,19 @@ class RootScanner extends BaseScanner {
     } catch (IOException e) {
       e = RemoteExceptionHandler.checkIOException(e);
       LOG.warn("Scan ROOT region", e);
-      // Make sure the file system is still available
-      master.checkFileSystem();
+      // Make sure the file system is still available, but don't do anything
+      //  if it's not available.
+      master.checkFileSystem(false);
+      // TODO: we used to ignore this. Now, we'll enter an infinite loop if
+      // this is a persistent problem, but the web ui will be up.
+      // Revisit this later.
+      return false;
     } catch (Exception e) {
       // If for some reason we get some other kind of exception,
       // at least log it rather than go out silently.
       LOG.error("Unexpected exception", e);
+      // TODO: See above
+      return false;
     }
     return true;
   }

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=1333866&r1=1333865&r2=1333866&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Fri May  4 10:37:51 2012
@@ -926,7 +926,7 @@ public class ServerManager {
    * a MSG_REGIONSERVER_STOP.
    */
   void letRegionServersShutdown() {
-    if (!master.checkFileSystem()) {
+    if (!master.checkFileSystem(true)) {
       // Forget waiting for the region servers if the file system has gone
       // away. Just exit as quickly as possible.
       return;