You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ji...@apache.org on 2008/12/11 22:53:35 UTC
svn commit: r725828 - in /hadoop/hbase/trunk: CHANGES.txt
src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
Author: jimk
Date: Thu Dec 11 13:53:35 2008
New Revision: 725828
URL: http://svn.apache.org/viewvc?rev=725828&view=rev
Log:
HBASE-1052 Stopping a HRegionServer with unflushed cache causes data loss from org.apache.hadoop.hbase.DroppedSnapshotException
Modified:
hadoop/hbase/trunk/CHANGES.txt
hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
Modified: hadoop/hbase/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/CHANGES.txt?rev=725828&r1=725827&r2=725828&view=diff
==============================================================================
--- hadoop/hbase/trunk/CHANGES.txt (original)
+++ hadoop/hbase/trunk/CHANGES.txt Thu Dec 11 13:53:35 2008
@@ -104,6 +104,8 @@
HBASE-900 Regionserver memory leak causing OOME during relatively
modest bulk importing; part 1
HBASE-1054 Index NPE on scanning (Clint Morgan via Andrew Purtell)
+ HBASE-1052 Stopping a HRegionServer with unflushed cache causes data loss
+ from org.apache.hadoop.hbase.DroppedSnapshotException
IMPROVEMENTS
HBASE-901 Add a limit to key length, check key and value length on client side
Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=725828&r1=725827&r2=725828&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original)
+++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Thu Dec 11 13:53:35 2008
@@ -24,6 +24,7 @@
import java.lang.management.ManagementFactory;
import java.lang.management.MemoryUsage;
import java.lang.reflect.Constructor;
+import java.lang.reflect.Field;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.Arrays;
@@ -282,10 +283,6 @@
for(int i = 0; i < nbBlocks; i++) {
reservedSpace.add(new byte[DEFAULT_SIZE_RESERVATION_BLOCK]);
}
-
- // Register shutdown hook for HRegionServer, runs an orderly shutdown
- // when a kill signal is received
- Runtime.getRuntime().addShutdownHook(new ShutdownThread(this));
}
/**
@@ -522,6 +519,15 @@
this.hbaseMaster = null;
}
join();
+
+ LOG.info("Running hdfs shutdown thread");
+ hdfsShutdownThread.start();
+ try {
+ hdfsShutdownThread.join();
+ LOG.info("Hdfs shutdown thread completed.");
+ } catch (InterruptedException e) {
+ LOG.warn("hdfsShutdownThread.join() was interrupted", e);
+ }
LOG.info(Thread.currentThread().getName() + " exiting");
}
@@ -552,6 +558,13 @@
// to defaults).
this.conf.set("fs.default.name", this.conf.get("hbase.rootdir"));
this.fs = FileSystem.get(this.conf);
+
+ // Register shutdown hook for HRegionServer, runs an orderly shutdown
+ // when a kill signal is received
+ Runtime.getRuntime().addShutdownHook(new ShutdownThread(this,
+ Thread.currentThread()));
+ this.hdfsShutdownThread = suppressHdfsShutdownHook();
+
this.rootDir = new Path(this.conf.get(HConstants.HBASE_DIR));
this.log = setupHLog();
this.logFlusher.setHLog(log);
@@ -693,25 +706,34 @@
*/
private static class ShutdownThread extends Thread {
private final HRegionServer instance;
+ private final Thread mainThread;
/**
* @param instance
+ * @param mainThread
*/
- public ShutdownThread(HRegionServer instance) {
+ public ShutdownThread(HRegionServer instance, Thread mainThread) {
this.instance = instance;
+ this.mainThread = mainThread;
}
@Override
public void run() {
LOG.info("Starting shutdown thread.");
- // tell the region server to stop and wait for it to complete
+ // tell the region server to stop
instance.stop();
- instance.join();
+
+ // Wait for main thread to exit.
+ Threads.shutdown(mainThread);
+
LOG.info("Shutdown thread complete");
}
}
+ // We need to call HDFS shutdown when we are done shutting down
+ private Thread hdfsShutdownThread;
+
/*
* Inner class that runs on a long period checking if regions need major
* compaction.
@@ -745,6 +767,43 @@
}
/**
+ * So, HDFS caches FileSystems so when you call FileSystem.get it's fast. In
+ * order to make sure things are cleaned up, it also creates a shutdown hook
+ * so that all filesystems can be closed when the process is terminated. This
+ * conveniently runs concurrently with our own shutdown handler, and
+ * therefore causes all the filesystems to be closed before the server can do
+ * all its necessary cleanup.
+ *
+ * The crazy dirty reflection in this method sneaks into the FileSystem cache
+ * and grabs the shutdown hook, removes it from the list of active shutdown
+ * hooks, and hangs onto it until later. Then, after we're properly done with
+ * our graceful shutdown, we can execute the hdfs hook manually to make sure
+ * loose ends are tied up.
+ *
+ * This seems quite fragile and susceptible to breaking if Hadoop changes
+ * anything about the way this cleanup is managed. Keep an eye on things.
+ */
+ private Thread suppressHdfsShutdownHook() {
+ try {
+ Field field = FileSystem.class.getDeclaredField ("clientFinalizer");
+ field.setAccessible(true); // presumably not public in FileSystem, hence the override
+ Thread hdfsClientFinalizer = (Thread)field.get(null); // null receiver: read as a static field
+ if (hdfsClientFinalizer == null) {
+ throw new RuntimeException("client finalizer is null, can't suppress!");
+ }
+ // Deregister the hook from the JVM and hand it back so the caller can run it later.
+ Runtime.getRuntime().removeShutdownHook(hdfsClientFinalizer);
+ return hdfsClientFinalizer;
+ } catch (NoSuchFieldException nsfe) {
+ LOG.fatal("Couldn't find field 'clientFinalizer' in FileSystem!", nsfe);
+ throw new RuntimeException("Failed to suppress HDFS shutdown hook", nsfe); // keep the cause
+ } catch (IllegalAccessException iae) {
+ LOG.fatal("Couldn't access field 'clientFinalizer' in FileSystem!", iae);
+ throw new RuntimeException("Failed to suppress HDFS shutdown hook", iae); // keep the cause
+ }
+ }
+
+ /**
* Report the status of the server. A server is online once all the startup
* is completed (setting up filesystem, starting service threads, etc.). This
* method is designed mostly to be useful in tests.