Posted to commits@hbase.apache.org by ji...@apache.org on 2008/12/11 22:53:35 UTC

svn commit: r725828 - in /hadoop/hbase/trunk: CHANGES.txt src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java

Author: jimk
Date: Thu Dec 11 13:53:35 2008
New Revision: 725828

URL: http://svn.apache.org/viewvc?rev=725828&view=rev
Log:
HBASE-1052  Stopping a HRegionServer with unflushed cache causes data loss from org.apache.hadoop.hbase.DroppedSnapshotException
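
The fix, in outline: HDFS's cached-FileSystem cleanup runs as its own JVM shutdown hook, concurrently with the region server's hook, so the filesystems could be closed out from under an in-progress flush, surfacing as DroppedSnapshotException. This commit removes that HDFS hook via reflection, holds onto it, and runs it by hand once the region server has finished its own orderly shutdown. A minimal standalone sketch of the idea (hypothetical class name; assumes the 0.19-era org.apache.hadoop.fs.FileSystem whose private static "clientFinalizer" thread is registered as a shutdown hook, which is what the code added by this commit relies on):

    import java.lang.reflect.Field;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;

    public class SuppressHdfsHookSketch {

      // Pull HDFS's "close all cached filesystems" shutdown hook out of the
      // JVM's hook list and hand it back to the caller to run later.
      static Thread suppressHdfsShutdownHook() throws Exception {
        Field field = FileSystem.class.getDeclaredField("clientFinalizer");
        field.setAccessible(true);
        Thread clientFinalizer = (Thread) field.get(null); // static field
        if (clientFinalizer == null) {
          throw new IllegalStateException("clientFinalizer not installed yet");
        }
        Runtime.getRuntime().removeShutdownHook(clientFinalizer);
        return clientFinalizer;
      }

      public static void main(String[] args) throws Exception {
        // Creating a FileSystem is what installs the HDFS hook, so do that first.
        FileSystem.get(new Configuration());
        Thread hdfsShutdown = suppressHdfsShutdownHook();

        // ... orderly shutdown work would happen here: flush caches,
        // close logs, stop service threads ...

        // Only now let HDFS close its cached filesystems.
        hdfsShutdown.start();
        hdfsShutdown.join();
      }
    }

This is the same ordering the diff below enforces inside HRegionServer: suppress the hook right after this.fs = FileSystem.get(this.conf), then start and join the saved thread after join() at the end of the server's shutdown path.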

Modified:
    hadoop/hbase/trunk/CHANGES.txt
    hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java

Modified: hadoop/hbase/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/CHANGES.txt?rev=725828&r1=725827&r2=725828&view=diff
==============================================================================
--- hadoop/hbase/trunk/CHANGES.txt (original)
+++ hadoop/hbase/trunk/CHANGES.txt Thu Dec 11 13:53:35 2008
@@ -104,6 +104,8 @@
    HBASE-900   Regionserver memory leak causing OOME during relatively
                modest bulk importing; part 1
    HBASE-1054  Index NPE on scanning (Clint Morgan via Andrew Purtell)
+   HBASE-1052  Stopping a HRegionServer with unflushed cache causes data loss
+               from org.apache.hadoop.hbase.DroppedSnapshotException
 
   IMPROVEMENTS
    HBASE-901   Add a limit to key length, check key and value length on client side

Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=725828&r1=725827&r2=725828&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original)
+++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Thu Dec 11 13:53:35 2008
@@ -24,6 +24,7 @@
 import java.lang.management.ManagementFactory;
 import java.lang.management.MemoryUsage;
 import java.lang.reflect.Constructor;
+import java.lang.reflect.Field;
 import java.net.InetSocketAddress;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -282,10 +283,6 @@
     for(int i = 0; i < nbBlocks; i++)  {
       reservedSpace.add(new byte[DEFAULT_SIZE_RESERVATION_BLOCK]);
     }
-    
-    // Register shutdown hook for HRegionServer, runs an orderly shutdown
-    // when a kill signal is recieved
-    Runtime.getRuntime().addShutdownHook(new ShutdownThread(this));
   }
 
   /**
@@ -522,6 +519,15 @@
       this.hbaseMaster = null;
     }
     join();
+
+    LOG.info("Running hdfs shutdown thread");
+    hdfsShutdownThread.start();
+    try {
+      hdfsShutdownThread.join();
+      LOG.info("Hdfs shutdown thread completed.");
+    } catch (InterruptedException e) {
+      LOG.warn("hdfsShutdownThread.join() was interrupted", e);
+    }
     LOG.info(Thread.currentThread().getName() + " exiting");
   }
 
@@ -552,6 +558,13 @@
       // to defaults).
       this.conf.set("fs.default.name", this.conf.get("hbase.rootdir"));
       this.fs = FileSystem.get(this.conf);
+
+      // Register shutdown hook for HRegionServer, runs an orderly shutdown
+      // when a kill signal is received
+      Runtime.getRuntime().addShutdownHook(new ShutdownThread(this,
+          Thread.currentThread()));
+      this.hdfsShutdownThread = suppressHdfsShutdownHook();
+
       this.rootDir = new Path(this.conf.get(HConstants.HBASE_DIR));
       this.log = setupHLog();
       this.logFlusher.setHLog(log);
@@ -693,25 +706,34 @@
    */
   private static class ShutdownThread extends Thread {
     private final HRegionServer instance;
+    private final Thread mainThread;
     
     /**
      * @param instance
+     * @param mainThread
      */
-    public ShutdownThread(HRegionServer instance) {
+    public ShutdownThread(HRegionServer instance, Thread mainThread) {
       this.instance = instance;
+      this.mainThread = mainThread;
     }
 
     @Override
     public void run() {
       LOG.info("Starting shutdown thread.");
       
-      // tell the region server to stop and wait for it to complete
+      // tell the region server to stop
       instance.stop();
-      instance.join();
+
+      // Wait for main thread to exit.
+      Threads.shutdown(mainThread);
+
       LOG.info("Shutdown thread complete");
     }    
   }
 
+  // We need to call HDFS shutdown when we are done shutting down
+  private Thread hdfsShutdownThread;
+
   /*
    * Inner class that runs on a long period checking if regions need major
    * compaction.
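
Note that the shutdown hook now waits on the main region server thread via Threads.shutdown(mainThread) instead of calling instance.join(), so the hook cannot return before the server has finished its cleanup. Presumably Threads.shutdown just joins the target thread until it dies; a rough, assumed equivalent (the helper name joinUntilDead is hypothetical):

    // Assumed rough equivalent of Threads.shutdown(t): keep joining until the
    // target thread has actually died, ignoring interrupts so a shutdown hook
    // cannot return while the main thread's cleanup is still in flight.
    static void joinUntilDead(Thread t) {
      while (t.isAlive()) {
        try {
          t.join(1000);
        } catch (InterruptedException e) {
          // keep waiting
        }
      }
    }
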
@@ -745,6 +767,43 @@
   }
   
   /**
+   * So, HDFS caches FileSystems so when you call FileSystem.get it's fast. In
+   * order to make sure things are cleaned up, it also creates a shutdown hook
+   * so that all filesystems can be closed when the process is terminated. This
+   * conveniently runs concurrently with our own shutdown handler, and
+   * therefore causes all the filesystems to be closed before the server can do
+   * all its necessary cleanup.
+   *
+   * The crazy dirty reflection in this method sneaks into the FileSystem cache
+   * and grabs the shutdown hook, removes it from the list of active shutdown
+   * hooks, and hangs onto it until later. Then, after we're properly done with
+   * our graceful shutdown, we can execute the hdfs hook manually to make sure
+   * loose ends are tied up.
+   *
+   * This seems quite fragile and susceptible to breaking if Hadoop changes
+   * anything about the way this cleanup is managed. Keep an eye on things.
+   */
+  private Thread suppressHdfsShutdownHook() {
+    try {
+      Field field = FileSystem.class.getDeclaredField("clientFinalizer");
+      field.setAccessible(true);
+      Thread hdfsClientFinalizer = (Thread)field.get(null);
+      if (hdfsClientFinalizer == null) {
+        throw new RuntimeException("client finalizer is null, can't suppress!");
+      }
+      Runtime.getRuntime().removeShutdownHook(hdfsClientFinalizer);
+      return hdfsClientFinalizer;
+      
+    } catch (NoSuchFieldException nsfe) {
+      LOG.fatal("Couldn't find field 'clientFinalizer' in FileSystem!", nsfe);
+      throw new RuntimeException("Failed to suppress HDFS shutdown hook");
+    } catch (IllegalAccessException iae) {
+      LOG.fatal("Couldn't access field 'clientFinalizer' in FileSystem!", iae);
+      throw new RuntimeException("Failed to suppress HDFS shutdown hook");
+    }
+  }
+
+  /**
    * Report the status of the server. A server is online once all the startup 
    * is completed (setting up filesystem, starting service threads, etc.). This
    * method is designed mostly to be useful in tests.