You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2011/07/14 02:27:15 UTC

svn commit: r1146526 - in /hbase/trunk: ./ src/main/java/org/apache/hadoop/hbase/master/ src/main/java/org/apache/hadoop/hbase/master/handler/ src/main/java/org/apache/hadoop/hbase/regionserver/ src/test/java/org/apache/hadoop/hbase/catalog/ src/test/j...

Author: stack
Date: Thu Jul 14 00:27:14 2011
New Revision: 1146526

URL: http://svn.apache.org/viewvc?rev=1146526&view=rev
Log:
HBASE-3872 Hole in split transaction rollback; edits to .META. need to be rolled back even if it seems like they didn't make it

Modified:
    hbase/trunk/CHANGES.txt
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/CatalogJanitor.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerServices.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/SplitTransaction.java
    hbase/trunk/src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTracker.java
    hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitor.java
    hbase/trunk/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransaction.java

Modified: hbase/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hbase/trunk/CHANGES.txt?rev=1146526&r1=1146525&r2=1146526&view=diff
==============================================================================
--- hbase/trunk/CHANGES.txt (original)
+++ hbase/trunk/CHANGES.txt Thu Jul 14 00:27:14 2011
@@ -409,6 +409,8 @@ Release 0.90.4 - Unreleased
    HBASE-4033  The shutdown RegionServer could be added to
                AssignmentManager.servers again (Jieshan Bean)
    HBASE-4088  npes in server shutdown
+   HBASE-3872  Hole in split transaction rollback; edits to .META. need
+               to be rolled back even if it seems like they didn't make it
 
   IMPROVEMENT
    HBASE-3882  hbase-config.sh needs to be updated so it can auto-detects the

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/CatalogJanitor.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/CatalogJanitor.java?rev=1146526&r1=1146525&r2=1146526&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/CatalogJanitor.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/CatalogJanitor.java Thu Jul 14 00:27:14 2011
@@ -44,6 +44,7 @@ import org.apache.hadoop.hbase.client.Re
 import org.apache.hadoop.hbase.regionserver.HRegion;
 import org.apache.hadoop.hbase.regionserver.Store;
 import org.apache.hadoop.hbase.regionserver.StoreFile;
+import org.apache.hadoop.hbase.util.Pair;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.Writables;
 
@@ -161,16 +162,15 @@ class CatalogJanitor extends Chore {
    * the filesystem.
    * @throws IOException
    */
-  boolean cleanParent(final HRegionInfo parent,
-    Result rowContent)
+  boolean cleanParent(final HRegionInfo parent, Result rowContent)
   throws IOException {
     boolean result = false;
     // Run checks on each daughter split.
-    boolean hasReferencesA =
+    Pair<Boolean, Boolean> a =
       checkDaughter(parent, rowContent, HConstants.SPLITA_QUALIFIER);
-    boolean hasReferencesB =
+    Pair<Boolean, Boolean> b =
       checkDaughter(parent, rowContent, HConstants.SPLITB_QUALIFIER);
-    if (!hasReferencesA && !hasReferencesB) {
+    if ((a.getFirst() && !a.getSecond()) && (b.getFirst() && !b.getSecond())) {
       LOG.debug("Deleting region " + parent.getRegionNameAsString() +
         " because daughter splits no longer hold references");
       // This latter regionOffline should not be necessary but is done for now
@@ -197,14 +197,26 @@ class CatalogJanitor extends Chore {
    * @param parent
    * @param rowContent
    * @param qualifier
-   * @return True if this daughter still has references to the parent.
+   * @return A pair where the first boolean says whether or not the daughter
+   * region directory exists in the filesystem and then the second boolean says
+   * whether the daughter has references to the parent.
    * @throws IOException
    */
-  boolean checkDaughter(final HRegionInfo parent,
+  Pair<Boolean, Boolean> checkDaughter(final HRegionInfo parent,
     final Result rowContent, final byte [] qualifier)
   throws IOException {
     HRegionInfo hri = getDaughterRegionInfo(rowContent, qualifier);
-    return hasReferences(parent, rowContent, hri, qualifier);
+    Pair<Boolean, Boolean> result =
+      checkDaughterInFs(parent, rowContent, hri, qualifier);
+    if (result.getFirst() && !result.getSecond()) {
+      // Remove daughter from the parent IFF the daughter region exists in FS.
+      // If there is no daughter region in the filesystem, must be because of
+      // a failed split.  The ServerShutdownHandler will do the fixup.  Don't
+      // do any deletes in here that could intefere with ServerShutdownHandler
+      // fixup
+      removeDaughterFromParent(parent, hri, qualifier);
+    }
+    return result;
   }
 
   /**
@@ -242,23 +254,35 @@ class CatalogJanitor extends Chore {
   /**
    * Checks if a daughter region -- either splitA or splitB -- still holds
    * references to parent.  If not, removes reference to the split from
-   * the parent meta region row so we don't check it any more.
+   * the parent meta region row so we don't check it any more.  Also checks
+   * daughter region exists in the filesytem.
    * @param parent Parent region name. 
    * @param rowContent Keyed content of the parent row in meta region.
    * @param split Which column family.
    * @param qualifier Which of the daughters to look at, splitA or splitB.
-   * @return True if still has references to parent.
+   * @return A pair where the first boolean says whether or not the daughter
+   * region directory exists in the filesystem and then the second boolean says
+   * whether the daughter has references to the parent.
    * @throws IOException
    */
-  boolean hasReferences(final HRegionInfo parent,
+  Pair<Boolean, Boolean> checkDaughterInFs(final HRegionInfo parent,
     final Result rowContent, final HRegionInfo split,
     final byte [] qualifier)
   throws IOException {
-    boolean result = false;
-    if (split == null)  return result;
+    boolean references = false;
+    boolean exists = false;
+    if (split == null)  {
+      return new Pair<Boolean, Boolean>(Boolean.FALSE, Boolean.FALSE);
+    }
     FileSystem fs = this.services.getMasterFileSystem().getFileSystem();
     Path rootdir = this.services.getMasterFileSystem().getRootDir();
     Path tabledir = new Path(rootdir, split.getTableNameAsString());
+    Path regiondir = new Path(tabledir, split.getEncodedName());
+    exists = fs.exists(regiondir);
+    if (!exists) {
+      LOG.warn("Daughter regiondir does not exist: " + regiondir.toString());
+      return new Pair<Boolean, Boolean>(exists, Boolean.FALSE);
+    }
     HTableDescriptor parentDescriptor = getTableDescriptor(parent.getTableName());
 
     for (HColumnDescriptor family: parentDescriptor.getFamilies()) {
@@ -275,18 +299,16 @@ class CatalogJanitor extends Chore {
       );
 
       if (ps != null && ps.length > 0) {
-        result = true;
+        references = true;
         break;
       }
     }
-    if (!result) {
-      removeDaughterFromParent(parent, split, qualifier);
-    }
-    return result;
+    return new Pair<Boolean, Boolean>(Boolean.valueOf(exists),
+      Boolean.valueOf(references));
   }
 
   private HTableDescriptor getTableDescriptor(byte[] tableName)
   throws TableExistsException, FileNotFoundException, IOException {
     return this.services.getTableDescriptors().get(Bytes.toString(tableName));
   }
-}
\ No newline at end of file
+}

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java?rev=1146526&r1=1146525&r2=1146526&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java Thu Jul 14 00:27:14 2011
@@ -237,11 +237,12 @@ public class ServerShutdownHandler exten
    */
   static void fixupDaughters(final Result result,
       final AssignmentManager assignmentManager,
-      final CatalogTracker catalogTracker) throws IOException {
+      final CatalogTracker catalogTracker)
+  throws IOException {
     fixupDaughter(result, HConstants.SPLITA_QUALIFIER, assignmentManager,
-        catalogTracker);
+      catalogTracker);
     fixupDaughter(result, HConstants.SPLITB_QUALIFIER, assignmentManager,
-        catalogTracker);
+      catalogTracker);
   }
 
   /**
@@ -282,8 +283,8 @@ public class ServerShutdownHandler exten
   }
 
   /**
-   * Look for presence of the daughter OR of a split of the daughter. Daughter
-   * could have been split over on regionserver before a run of the
+   * Look for presence of the daughter OR of a split of the daughter in .META.
+   * Daughter could have been split over on regionserver before a run of the
    * catalogJanitor had chance to clear reference from parent.
    * @param daughter Daughter region to search for.
    * @throws IOException 
@@ -350,4 +351,4 @@ public class ServerShutdownHandler exten
       return false;
     }
   }
-}
+}
\ No newline at end of file

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=1146526&r1=1146525&r2=1146526&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Thu Jul 14 00:27:14 2011
@@ -1380,7 +1380,8 @@ public class HRegionServer implements HR
   public void postOpenDeployTasks(final HRegion r, final CatalogTracker ct,
       final boolean daughter)
   throws KeeperException, IOException {
-    LOG.info("HRS.PostOpenDeployTasks");
+    LOG.info("Post open deploy tasks for region=" + r.getRegionNameAsString() +
+      ", daughter=" + daughter);
     // Do checks to see if we need to compact (references or too many files)
     for (Store s : r.getStores().values()) {
       if (s.hasReferences() || s.needsCompaction()) {
@@ -1393,32 +1394,23 @@ public class HRegionServer implements HR
     LOG.info("addToOnlineRegions is done" + r.getRegionInfo());
     // Update ZK, ROOT or META
     if (r.getRegionInfo().isRootRegion()) {
-
-      LOG.info("setRootLocation");
       RootLocationEditor.setRootLocation(getZooKeeper(),
        this.serverNameFromMasterPOV);
     } else if (r.getRegionInfo().isMetaRegion()) {
-      LOG.info("updateMetaLocation");
-
       MetaEditor.updateMetaLocation(ct, r.getRegionInfo(),
         this.serverNameFromMasterPOV);
     } else {
-      LOG.info("updateMetaLocation 111");
-
       if (daughter) {
-        LOG.info("updateMetaLocation 22");
-
         // If daughter of a split, update whole row, not just location.
         MetaEditor.addDaughter(ct, r.getRegionInfo(),
           this.serverNameFromMasterPOV);
       } else {
-        LOG.info("updateMetaLocation 33");
-
         MetaEditor.updateRegionLocation(ct, r.getRegionInfo(),
           this.serverNameFromMasterPOV);
       }
     }
-    LOG.info("END HRS.PostOpenDeployTasks");
+    LOG.info("Done with post open deploy taks for region=" +
+      r.getRegionNameAsString() + ", daughter=" + daughter);
 
   }
 

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerServices.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerServices.java?rev=1146526&r1=1146525&r2=1146526&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerServices.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerServices.java Thu Jul 14 00:27:14 2011
@@ -24,7 +24,6 @@ import java.io.IOException;
 import org.apache.hadoop.hbase.catalog.CatalogTracker;
 import org.apache.hadoop.hbase.ipc.HBaseRpcMetrics;
 import org.apache.hadoop.hbase.regionserver.wal.HLog;
-import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
 import org.apache.zookeeper.KeeperException;
 import java.util.Set;
 

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java?rev=1146526&r1=1146525&r2=1146526&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java Thu Jul 14 00:27:14 2011
@@ -62,15 +62,20 @@ class SplitRequest implements Runnable {
         st.execute(this.server, this.server);
       } catch (Exception e) {
         try {
-          LOG.info("Running rollback of failed split of " + parent + "; "
-              + e.getMessage());
-          st.rollback(this.server, this.server);
-          LOG.info("Successful rollback of failed split of " + parent);
+          LOG.info("Running rollback/cleanup of failed split of " +
+            parent.getRegionNameAsString() + "; " + e.getMessage());
+          if (st.rollback(this.server, this.server)) {
+            LOG.info("Successful rollback of failed split of " +
+              parent.getRegionNameAsString());
+          } else {
+            this.server.abort("Abort; we got an error after point-of-no-return");
+          }
         } catch (RuntimeException ee) {
-          // If failed rollback, kill server to avoid having a hole in table.
-          LOG.info("Failed rollback of failed split of "
-              + parent.getRegionNameAsString() + " -- aborting server", ee);
-          this.server.abort("Failed split");
+          String msg = "Failed rollback of failed split of " +
+            parent.getRegionNameAsString() + " -- aborting server";
+          // If failed rollback, kill this server to avoid having a hole in table.
+          LOG.info(msg, ee);
+          this.server.abort(msg);
         }
         return;
       }

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/SplitTransaction.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/SplitTransaction.java?rev=1146526&r1=1146525&r2=1146526&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/SplitTransaction.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/SplitTransaction.java Thu Jul 14 00:27:14 2011
@@ -60,18 +60,18 @@ import com.google.common.util.concurrent
 
 /**
  * Executes region split as a "transaction".  Call {@link #prepare()} to setup
- * the transaction, {@link #execute(OnlineRegions)} to run the transaction and
- * {@link #rollback(OnlineRegions)} to cleanup if execute fails.
+ * the transaction, {@link #execute(Server, RegionServerServices)} to run the
+ * transaction and {@link #rollback(OnlineRegions)} to cleanup if execute fails.
  *
  * <p>Here is an example of how you would use this class:
  * <pre>
  *  SplitTransaction st = new SplitTransaction(this.conf, parent, midKey)
  *  if (!st.prepare()) return;
  *  try {
- *    st.execute(myOnlineRegions);
+ *    st.execute(server, services);
  *  } catch (IOException ioe) {
  *    try {
- *      st.rollback(myOnlineRegions);
+ *      st.rollback(server, services);
  *      return;
  *    } catch (RuntimeException e) {
  *      myAbortable.abort("Failed split, abort");
@@ -101,7 +101,9 @@ public class SplitTransaction {
   private final byte [] splitrow;
 
   /**
-   * Types to add to the transaction journal
+   * Types to add to the transaction journal.
+   * Each enum is a step in the split transaction. Used to figure how much
+   * we need to rollback.
    */
   enum JournalEntry {
     /**
@@ -127,7 +129,13 @@ public class SplitTransaction {
     /**
      * Started in on the creation of the second daughter region.
      */
-    STARTED_REGION_B_CREATION
+    STARTED_REGION_B_CREATION,
+    /**
+     * Point of no return.
+     * If we got here, then transaction is not recoverable other than by
+     * crashing out the regionserver.
+     */
+    PONR
   }
 
   /*
@@ -137,7 +145,7 @@ public class SplitTransaction {
 
   /**
    * Constructor
-   * @param services So we can online new servces.  If null, we'll skip onlining
+   * @param services So we can online new regions.  If null, we'll skip onlining
    * (Useful testing).
    * @param c Configuration to use running split
    * @param r Region to split
@@ -156,7 +164,7 @@ public class SplitTransaction {
    */
   public boolean prepare() {
     if (this.parent.isClosed() || this.parent.isClosing()) return false;
-    // Split key can be false if this region is unsplittable; i.e. has refs.
+    // Split key can be null if this region is unsplittable; i.e. has refs.
     if (this.splitrow == null) return false;
     HRegionInfo hri = this.parent.getRegionInfo();
     parent.prepareToSplit();
@@ -196,13 +204,14 @@ public class SplitTransaction {
 
   /**
    * Run the transaction.
-   * @param server Hosting server instance.
+   * @param server Hosting server instance.  Can be null when testing (won't try
+   * and update in zk if a null server)
    * @param services Used to online/offline regions.
-   * @throws IOException If thrown, transaction failed. Call {@link #rollback(OnlineRegions)}
+   * @throws IOException If thrown, transaction failed. Call {@link #rollback(Server, RegionServerServices)}
    * @return Regions created
    * @throws KeeperException
    * @throws NodeExistsException 
-   * @see #rollback(OnlineRegions)
+   * @see #rollback(Server, RegionServerServices)
    */
   public PairOfSameType<HRegion> execute(final Server server,
       final RegionServerServices services)
@@ -227,7 +236,7 @@ public class SplitTransaction {
           this.fileSplitTimeout);
 
     // Set ephemeral SPLITTING znode up in zk.  Mocked servers sometimes don't
-    // have zookeeper so don't do zk stuff if zookeeper is null
+    // have zookeeper so don't do zk stuff if server or zookeeper is null
     if (server != null && server.getZooKeeper() != null) {
       try {
         this.znodeVersion = createNodeSplitting(server.getZooKeeper(),
@@ -259,13 +268,13 @@ public class SplitTransaction {
     }
     this.journal.add(JournalEntry.OFFLINED_PARENT);
 
-    // TODO: If the below were multithreaded would we complete steps in less
-    // elapsed time?  St.Ack 20100920
-
-    splitStoreFiles(this.splitdir, hstoreFilesToSplit);
+    // TODO: If splitStoreFiles were multithreaded would we complete steps in
+    // less elapsed time?  St.Ack 20100920
+    //
     // splitStoreFiles creates daughter region dirs under the parent splits dir
     // Nothing to unroll here if failure -- clean up of CREATE_SPLIT_DIR will
     // clean this up.
+    splitStoreFiles(this.splitdir, hstoreFilesToSplit);
 
     // Log to the journal that we are creating region A, the first daughter
     // region.  We could fail halfway through.  If we do, we could have left
@@ -278,27 +287,38 @@ public class SplitTransaction {
     this.journal.add(JournalEntry.STARTED_REGION_B_CREATION);
     HRegion b = createDaughterRegion(this.hri_b, this.parent.rsServices);
 
-    // Edit parent in meta
+    // Edit parent in meta.  Offlines parent region and adds splita and splitb.
     if (!testing) {
       MetaEditor.offlineParentInMeta(server.getCatalogTracker(),
         this.parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo());
     }
 
-    // This is the point of no return.  We are committed to the split now.  We
-    // have still the daughter regions to open but meta has been changed.
-    // If we fail from here on out, we cannot rollback so, we'll just abort.
-    if (!testing) {
+    // This is the point of no return.  Adding subsequent edits to .META. as we
+    // do below when we do the daugther opens adding each to .META. can fail in
+    // various interesting ways the most interesting of which is a timeout
+    // BUT the edits all go through (See HBASE-3872).  IF we reach the POWR
+    // then subsequent failures need to crash out this regionserver; the
+    // server shutdown processing should be able to fix-up the incomplete split.
+    this.journal.add(JournalEntry.PONR);
       // Open daughters in parallel.
-      DaughterOpener aOpener = new DaughterOpener(server, services, a);
-      DaughterOpener bOpener = new DaughterOpener(server, services, b);
-      aOpener.start();
-      bOpener.start();
-      try {
-        aOpener.join();
-        bOpener.join();
-      } catch (InterruptedException e) {
-        server.abort("Exception running daughter opens", e);
-      }
+    DaughterOpener aOpener = new DaughterOpener(server, services, a);
+    DaughterOpener bOpener = new DaughterOpener(server, services, b);
+    aOpener.start();
+    bOpener.start();
+    try {
+      aOpener.join();
+      bOpener.join();
+    } catch (InterruptedException e) {
+      Thread.currentThread().interrupt();
+      throw new IOException("Interrupted " + e.getMessage());
+    }
+    if (aOpener.getException() != null) {
+      throw new IOException("Failed " +
+        aOpener.getName(), aOpener.getException());
+    }
+    if (bOpener.getException() != null) {
+      throw new IOException("Failed " +
+        bOpener.Name(), bOpener.getException());
     }
 
     // Tell master about split by updating zk.  If we fail, abort.
@@ -328,12 +348,10 @@ public class SplitTransaction {
         if (e instanceof InterruptedException) {
           Thread.currentThread().interrupt();
         }
-        server.abort("Failed telling master about split", e);
+        throw new IOException("Failed telling master about split", e);
       }
     }
 
-
-
     // Coprocessor callback
     if (this.parent.getCoprocessorHost() != null) {
       this.parent.getCoprocessorHost().postSplit(a,b);
@@ -345,10 +363,15 @@ public class SplitTransaction {
     return new PairOfSameType<HRegion>(a, b);
   }
 
+  /*
+   * Open daughter region in its own thread.
+   * If we fail, abort this hosting server.
+   */
   class DaughterOpener extends Thread {
     private final RegionServerServices services;
     private final Server server;
     private final HRegion r;
+    private Throwable t = null;
 
     DaughterOpener(final Server s, final RegionServerServices services,
         final HRegion r) {
@@ -359,13 +382,20 @@ public class SplitTransaction {
       this.r = r;
     }
 
+    /**
+     * @return Null if open succeeded else exception that causes us fail open.
+     * Call it after this thread exits else you may get wrong view on result.
+     */
+    Throwable getException() {
+      return this.t;
+    }
+
     @Override
     public void run() {
       try {
         openDaughterRegion(this.server, this.services, r);
       } catch (Throwable t) {
-        this.server.abort("Failed open of daughter " +
-          this.r.getRegionInfo().getRegionNameAsString(), t);
+        this.t = t;
       }
     }
   }
@@ -373,7 +403,7 @@ public class SplitTransaction {
   /**
    * Open daughter regions, add them to online list and update meta.
    * @param server
-   * @param services
+   * @param services Can be null when testing.
    * @param daughter
    * @throws IOException
    * @throws KeeperException
@@ -381,20 +411,22 @@ public class SplitTransaction {
   void openDaughterRegion(final Server server,
       final RegionServerServices services, final HRegion daughter)
   throws IOException, KeeperException {
-    if (server.isStopped() || services.isStopping()) {
+    boolean stopping = services != null && services.isStopping();
+    if (server.isStopped() || stopping) {
       MetaEditor.addDaughter(server.getCatalogTracker(),
         daughter.getRegionInfo(), null);
       LOG.info("Not opening daughter " +
         daughter.getRegionInfo().getRegionNameAsString() +
-        " because stopping=" + services.isStopping() + ", stopped=" +
-        server.isStopped());
+        " because stopping=" + stopping + ", stopped=" + server.isStopped());
       return;
     }
     HRegionInfo hri = daughter.getRegionInfo();
     LoggingProgressable reporter =
       new LoggingProgressable(hri, server.getConfiguration());
     HRegion r = daughter.openHRegion(reporter);
-    services.postOpenDeployTasks(r, server.getCatalogTracker(), true);
+    if (services != null) {
+      services.postOpenDeployTasks(r, server.getCatalogTracker(), true);
+    }
   }
 
   static class LoggingProgressable implements CancelableProgressable {
@@ -598,16 +630,19 @@ public class SplitTransaction {
   }
 
   /**
-   * @param or Object that can online/offline parent region.  Can be passed null
-   * by unit tests.
-   * @return The region we were splitting
+   * @param server Hosting server instance (May be null when testing).
+   * @param services
    * @throws IOException If thrown, rollback failed.  Take drastic action.
+   * @return True if we successfully rolled back, false if we got to the point
+   * of no return and so now need to abort the server to minimize damage.
    */
-  public void rollback(final Server server, final OnlineRegions or)
+  public boolean rollback(final Server server, final RegionServerServices services)
   throws IOException {
+    boolean result = true;
     FileSystem fs = this.parent.getFilesystem();
     ListIterator<JournalEntry> iterator =
       this.journal.listIterator(this.journal.size());
+    // Iterate in reverse.
     while (iterator.hasPrevious()) {
       JournalEntry je = iterator.previous();
       switch(je) {
@@ -642,13 +677,19 @@ public class SplitTransaction {
         break;
 
       case OFFLINED_PARENT:
-        if (or != null) or.addToOnlineRegions(this.parent);
+        if (services != null) services.addToOnlineRegions(this.parent);
         break;
 
+      case PONR:
+        // We got to the point-of-no-return so we need to just abort. Return
+        // immediately.
+        return false;
+
       default:
         throw new RuntimeException("Unhandled journal entry: " + je);
       }
     }
+    return result;
   }
 
   HRegionInfo getFirstDaughter() {

Modified: hbase/trunk/src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTracker.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTracker.java?rev=1146526&r1=1146525&r2=1146526&view=diff
==============================================================================
--- hbase/trunk/src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTracker.java (original)
+++ hbase/trunk/src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTracker.java Thu Jul 14 00:27:14 2011
@@ -113,7 +113,7 @@ public class TestCatalogTracker {
   @Test public void testThatIfMETAMovesWeAreNotified()
   throws IOException, InterruptedException, KeeperException {
     HConnection connection = Mockito.mock(HConnection.class);
-    final CatalogTracker ct = constructAndStartCatalogTracker(connection);
+    constructAndStartCatalogTracker(connection);
     try {
       RootLocationEditor.setRootLocation(this.watcher,
         new ServerName("example.com", 1234, System.currentTimeMillis()));

Modified: hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitor.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitor.java?rev=1146526&r1=1146525&r2=1146526&view=diff
==============================================================================
--- hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitor.java (original)
+++ hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitor.java Thu Jul 14 00:27:14 2011
@@ -293,6 +293,14 @@ public class TestCatalogJanitor {
     assertFalse(janitor.cleanParent(parent, r));
     // Remove the reference file and try again.
     assertTrue(fs.delete(p, true));
+    // We will fail!!! Because split b is empty, which is right... we should
+    // not remove parent if daughters do not exist in fs.
+    assertFalse(janitor.cleanParent(parent, r));
+    // Put in place daughter dir for b... that should make it so parent gets
+    // cleaned up.
+    storedir = Store.getStoreHomedir(tabledir, splitb.getEncodedName(),
+      htd.getColumnFamilies()[0].getName());
+    assertTrue(fs.mkdirs(storedir));
     assertTrue(janitor.cleanParent(parent, r));
   }
 }

Modified: hbase/trunk/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransaction.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransaction.java?rev=1146526&r1=1146525&r2=1146526&view=diff
==============================================================================
--- hbase/trunk/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransaction.java (original)
+++ hbase/trunk/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransaction.java Thu Jul 14 00:27:14 2011
@@ -42,6 +42,7 @@ import org.apache.hadoop.hbase.client.Sc
 import org.apache.hadoop.hbase.regionserver.wal.HLog;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.PairOfSameType;
+import org.apache.zookeeper.KeeperException;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -84,6 +85,46 @@ public class TestSplitTransaction {
     this.fs.delete(this.testdir, true);
   }
 
+  @Test public void testFailAfterPONR() throws IOException, KeeperException {
+    final int rowcount = TEST_UTIL.loadRegion(this.parent, CF);
+    assertTrue(rowcount > 0);
+    int parentRowCount = countRows(this.parent);
+    assertEquals(rowcount, parentRowCount);
+
+    // Start transaction.
+    SplitTransaction st = prepareGOOD_SPLIT_ROW();
+    SplitTransaction spiedUponSt = spy(st);
+    Mockito.doThrow(new MockedFailedDaughterOpen()).
+      when(spiedUponSt).openDaughterRegion((Server)Mockito.anyObject(),
+        (RegionServerServices)Mockito.anyObject(), (HRegion)Mockito.anyObject());
+
+    // Run the execute.  Look at what it returns.
+    boolean expectedException = false;
+    Server mockServer = Mockito.mock(Server.class);
+    when(mockServer.getConfiguration()).thenReturn(TEST_UTIL.getConfiguration());
+    try {
+      spiedUponSt.execute(mockServer, null);
+    } catch (IOException e) {
+      if (e.getCause() != null &&
+          e.getCause() instanceof MockedFailedDaughterOpen) {
+        expectedException = true;
+      }
+    }
+    assertTrue(expectedException);
+    // Run rollback returns that we should restart.
+    assertFalse(spiedUponSt.rollback(null, null));
+    // Make sure that region a and region b are still in the filesystem, that
+    // they have not been removed; this is supposed to be the case if we go
+    // past point of no return.
+    Path tableDir =  this.parent.getRegionDir().getParent();
+    Path daughterADir =
+      new Path(tableDir, spiedUponSt.getFirstDaughter().getEncodedName());
+    Path daughterBDir =
+      new Path(tableDir, spiedUponSt.getSecondDaughter().getEncodedName());
+    assertTrue(TEST_UTIL.getTestFileSystem().exists(daughterADir));
+    assertTrue(TEST_UTIL.getTestFileSystem().exists(daughterBDir));
+  }
+
   /**
    * Test straight prepare works.  Tries to split on {@link #GOOD_SPLIT_ROW}
    * @throws IOException
@@ -190,7 +231,7 @@ public class TestSplitTransaction {
     }
     assertTrue(expectedException);
     // Run rollback
-    spiedUponSt.rollback(null, null);
+    assertTrue(spiedUponSt.rollback(null, null));
 
     // Assert I can scan parent.
     int parentRowCount2 = countRows(this.parent);
@@ -228,6 +269,7 @@ public class TestSplitTransaction {
    */
   @SuppressWarnings("serial")
   private class MockedFailedDaughterCreation extends IOException {}
+  private class MockedFailedDaughterOpen extends IOException {}
 
   private int countRows(final HRegion r) throws IOException {
     int rowcount = 0;