You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by en...@apache.org on 2013/10/04 02:56:10 UTC

svn commit: r1529046 - in /hbase/branches/0.96: hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java

Author: enis
Date: Fri Oct  4 00:56:09 2013
New Revision: 1529046

URL: http://svn.apache.org/r1529046
Log:
HBASE-9703 DistributedHBaseCluster should not throw exceptions, but do a best effort restore

Modified:
    hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
    hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java

Modified: hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java?rev=1529046&r1=1529045&r2=1529046&view=diff
==============================================================================
--- hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java (original)
+++ hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java Fri Oct  4 00:56:09 2013
@@ -18,9 +18,9 @@
 package org.apache.hadoop.hbase;
 
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.HashMap;
-
-import com.google.common.collect.Sets;
+import java.util.List;
 
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.conf.Configuration;
@@ -36,6 +36,8 @@ import org.apache.hadoop.hbase.protobuf.
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.Threads;
 
+import com.google.common.collect.Sets;
+
 /**
  * Manages the interactions with an already deployed distributed cluster (as opposed to
  * a pseudo-distributed, or mini/local cluster). This is used by integration and system tests.
@@ -215,38 +217,64 @@ public class DistributedHBaseCluster ext
   }
 
   @Override
-  public void restoreClusterStatus(ClusterStatus initial) throws IOException {
-    //TODO: caution: not tested throughly
+  public boolean restoreClusterStatus(ClusterStatus initial) throws IOException {
     ClusterStatus current = getClusterStatus();
 
-    //restore masters
+    LOG.info("Restoring cluster - started");
+
+    // do a best effort restore
+    boolean success = true;
+    success = restoreMasters(initial, current) & success;
+    success = restoreRegionServers(initial, current) & success;
+    success = restoreAdmin() & success;
+
+    LOG.info("Restoring cluster - done");
+    return success;
+  }
 
+  protected boolean restoreMasters(ClusterStatus initial, ClusterStatus current) {
+    List<IOException> deferred = new ArrayList<IOException>();
     //check whether current master has changed
     if (!ServerName.isSameHostnameAndPort(initial.getMaster(), current.getMaster())) {
-      LOG.info("Initial active master : " + initial.getMaster().getHostname()
+      LOG.info("Restoring cluster - Initial active master : " + initial.getMaster().getHostname()
           + " has changed to : " + current.getMaster().getHostname());
       // If initial master is stopped, start it, before restoring the state.
       // It will come up as a backup master, if there is already an active master.
-      if (!clusterManager.isRunning(ServiceType.HBASE_MASTER, initial.getMaster().getHostname())) {
-        startMaster(initial.getMaster().getHostname());
-      }
+      try {
+        if (!clusterManager.isRunning(ServiceType.HBASE_MASTER, initial.getMaster().getHostname())) {
+          LOG.info("Restoring cluster - starting initial active master at:" + initial.getMaster().getHostname());
+          startMaster(initial.getMaster().getHostname());
+        }
 
-      //master has changed, we would like to undo this.
-      //1. Kill the current backups
-      //2. Stop current master
-      //3. Start backup masters
-      for (ServerName currentBackup : current.getBackupMasters()) {
-        if (!ServerName.isSameHostnameAndPort(currentBackup, initial.getMaster())) {
-          stopMaster(currentBackup);
+        //master has changed, we would like to undo this.
+        //1. Kill the current backups
+        //2. Stop current master
+        //3. Start backup masters
+        for (ServerName currentBackup : current.getBackupMasters()) {
+          if (!ServerName.isSameHostnameAndPort(currentBackup, initial.getMaster())) {
+            LOG.info("Restoring cluster - stopping backup master: " + currentBackup);
+            stopMaster(currentBackup);
+          }
         }
+        LOG.info("Restoring cluster - stopping active master: " + current.getMaster());
+        stopMaster(current.getMaster());
+        waitForActiveAndReadyMaster(); //wait so that active master takes over
+      } catch (IOException ex) {
+        // if we fail to start the initial active master, we do not want to continue stopping
+        // backup masters. Just keep what we have now
+        deferred.add(ex);
       }
-      stopMaster(current.getMaster());
-      waitForActiveAndReadyMaster(); //wait so that active master takes over
+
       //start backup masters
       for (ServerName backup : initial.getBackupMasters()) {
-        //these are not started in backup mode, but we should already have an active master
-        if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, backup.getHostname())) {
-          startMaster(backup.getHostname());
+        try {
+          //these are not started in backup mode, but we should already have an active master
+          if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, backup.getHostname())) {
+            LOG.info("Restoring cluster - starting initial backup master: " + backup.getHostname());
+            startMaster(backup.getHostname());
+          }
+        } catch (IOException ex) {
+          deferred.add(ex);
         }
       }
     } else {
@@ -262,19 +290,38 @@ public class DistributedHBaseCluster ext
       }
 
       for (String hostname : Sets.difference(initialBackups.keySet(), currentBackups.keySet())) {
-        if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, hostname)) {
-          startMaster(hostname);
+        try {
+          if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, hostname)) {
+            LOG.info("Restoring cluster - starting initial backup master: " + hostname);
+            startMaster(hostname);
+          }
+        } catch (IOException ex) {
+          deferred.add(ex);
         }
       }
 
       for (String hostname : Sets.difference(currentBackups.keySet(), initialBackups.keySet())) {
-        if(clusterManager.isRunning(ServiceType.HBASE_MASTER, hostname)) {
-          stopMaster(currentBackups.get(hostname));
+        try {
+          if(clusterManager.isRunning(ServiceType.HBASE_MASTER, hostname)) {
+            LOG.info("Restoring cluster - stopping backup master: " + hostname);
+            stopMaster(currentBackups.get(hostname));
+          }
+        } catch (IOException ex) {
+          deferred.add(ex);
         }
       }
     }
+    if (!deferred.isEmpty()) {
+      LOG.warn("Restoring cluster - restoring region servers reported " + deferred.size() + " errors:");
+      for (int i=0; i<deferred.size() && i < 3; i++) {
+        LOG.warn(deferred.get(i));
+      }
+    }
 
-    //restore region servers
+    return deferred.isEmpty();
+  }
+
+  protected boolean restoreRegionServers(ClusterStatus initial, ClusterStatus current) {
     HashMap<String, ServerName> initialServers = new HashMap<String, ServerName>();
     HashMap<String, ServerName> currentServers = new HashMap<String, ServerName>();
 
@@ -285,17 +332,39 @@ public class DistributedHBaseCluster ext
       currentServers.put(server.getHostname(), server);
     }
 
+    List<IOException> deferred = new ArrayList<IOException>();
     for (String hostname : Sets.difference(initialServers.keySet(), currentServers.keySet())) {
-      if(!clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER, hostname)) {
-        startRegionServer(hostname);
+      try {
+        if(!clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER, hostname)) {
+          LOG.info("Restoring cluster - starting initial region server: " + hostname);
+          startRegionServer(hostname);
+        }
+      } catch (IOException ex) {
+        deferred.add(ex);
       }
     }
 
     for (String hostname : Sets.difference(currentServers.keySet(), initialServers.keySet())) {
-      if(clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER, hostname)) {
-        stopRegionServer(currentServers.get(hostname));
+      try {
+        if(clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER, hostname)) {
+          LOG.info("Restoring cluster - stopping initial region server: " + hostname);
+          stopRegionServer(currentServers.get(hostname));
+        }
+      } catch (IOException ex) {
+        deferred.add(ex);
       }
     }
+    if (!deferred.isEmpty()) {
+      LOG.warn("Restoring cluster - restoring region servers reported " + deferred.size() + " errors:");
+      for (int i=0; i<deferred.size() && i < 3; i++) {
+        LOG.warn(deferred.get(i));
+      }
+    }
+
+    return deferred.isEmpty();
+  }
+
+  protected boolean restoreAdmin() throws IOException {
     // While restoring above, if the HBase Master which was initially the Active one, was down
     // and the restore put the cluster back to Initial configuration, HAdmin instance will need
     // to refresh its connections (otherwise it will return incorrect information) or we can
@@ -303,9 +372,10 @@ public class DistributedHBaseCluster ext
     try {
       admin.close();
     } catch (IOException ioe) {
-      LOG.info("While closing the old connection", ioe);
+      LOG.warn("While closing the old connection", ioe);
     }
     this.admin = new HBaseAdmin(conf);
     LOG.info("Added new HBaseAdmin");
+    return true;
   }
 }

Modified: hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java?rev=1529046&r1=1529045&r2=1529046&view=diff
==============================================================================
--- hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java (original)
+++ hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java Fri Oct  4 00:56:09 2013
@@ -226,16 +226,23 @@ public abstract class HBaseCluster imple
   /**
    * Restores the cluster to its initial state if this is a real cluster,
    * otherwise does nothing.
+   * This is a best-effort restore. If the servers are not reachable, or permissions are
+   * insufficient, etc., restoration might be partial.
+   * @return whether restoration is complete
    */
-  public void restoreInitialStatus() throws IOException {
-    restoreClusterStatus(getInitialClusterStatus());
+  public boolean restoreInitialStatus() throws IOException {
+    return restoreClusterStatus(getInitialClusterStatus());
   }
 
   /**
    * Restores the cluster to given state if this is a real cluster,
    * otherwise does nothing.
+   * This is a best-effort restore. If the servers are not reachable, or permissions are
+   * insufficient, etc., restoration might be partial.
+   * @return whether restoration is complete
    */
-  public void restoreClusterStatus(ClusterStatus desiredStatus) throws IOException {
+  public boolean restoreClusterStatus(ClusterStatus desiredStatus) throws IOException {
+    return true;
   }
 
   /**