You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by en...@apache.org on 2013/10/04 02:56:10 UTC
svn commit: r1529046 - in /hbase/branches/0.96:
hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java
Author: enis
Date: Fri Oct 4 00:56:09 2013
New Revision: 1529046
URL: http://svn.apache.org/r1529046
Log:
HBASE-9703 DistributedHBaseCluster should not throw exceptions, but do a best effort restore
Modified:
hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java
Modified: hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java?rev=1529046&r1=1529045&r2=1529046&view=diff
==============================================================================
--- hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java (original)
+++ hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java Fri Oct 4 00:56:09 2013
@@ -18,9 +18,9 @@
package org.apache.hadoop.hbase;
import java.io.IOException;
+import java.util.ArrayList;
import java.util.HashMap;
-
-import com.google.common.collect.Sets;
+import java.util.List;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
@@ -36,6 +36,8 @@ import org.apache.hadoop.hbase.protobuf.
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Threads;
+import com.google.common.collect.Sets;
+
/**
* Manages the interactions with an already deployed distributed cluster (as opposed to
* a pseudo-distributed, or mini/local cluster). This is used by integration and system tests.
@@ -215,38 +217,64 @@ public class DistributedHBaseCluster ext
}
@Override
- public void restoreClusterStatus(ClusterStatus initial) throws IOException {
- //TODO: caution: not tested throughly
+ public boolean restoreClusterStatus(ClusterStatus initial) throws IOException {
ClusterStatus current = getClusterStatus();
- //restore masters
+ LOG.info("Restoring cluster - started");
+
+ // do a best effort restore
+ boolean success = true;
+ success = restoreMasters(initial, current) & success;
+ success = restoreRegionServers(initial, current) & success;
+ success = restoreAdmin() & success;
+
+ LOG.info("Restoring cluster - done");
+ return success;
+ }
+ protected boolean restoreMasters(ClusterStatus initial, ClusterStatus current) {
+ List<IOException> deferred = new ArrayList<IOException>();
//check whether current master has changed
if (!ServerName.isSameHostnameAndPort(initial.getMaster(), current.getMaster())) {
- LOG.info("Initial active master : " + initial.getMaster().getHostname()
+ LOG.info("Restoring cluster - Initial active master : " + initial.getMaster().getHostname()
+ " has changed to : " + current.getMaster().getHostname());
// If initial master is stopped, start it, before restoring the state.
// It will come up as a backup master, if there is already an active master.
- if (!clusterManager.isRunning(ServiceType.HBASE_MASTER, initial.getMaster().getHostname())) {
- startMaster(initial.getMaster().getHostname());
- }
+ try {
+ if (!clusterManager.isRunning(ServiceType.HBASE_MASTER, initial.getMaster().getHostname())) {
+ LOG.info("Restoring cluster - starting initial active master at:" + initial.getMaster().getHostname());
+ startMaster(initial.getMaster().getHostname());
+ }
- //master has changed, we would like to undo this.
- //1. Kill the current backups
- //2. Stop current master
- //3. Start backup masters
- for (ServerName currentBackup : current.getBackupMasters()) {
- if (!ServerName.isSameHostnameAndPort(currentBackup, initial.getMaster())) {
- stopMaster(currentBackup);
+ //master has changed, we would like to undo this.
+ //1. Kill the current backups
+ //2. Stop current master
+ //3. Start backup masters
+ for (ServerName currentBackup : current.getBackupMasters()) {
+ if (!ServerName.isSameHostnameAndPort(currentBackup, initial.getMaster())) {
+ LOG.info("Restoring cluster - stopping backup master: " + currentBackup);
+ stopMaster(currentBackup);
+ }
}
+ LOG.info("Restoring cluster - stopping active master: " + current.getMaster());
+ stopMaster(current.getMaster());
+ waitForActiveAndReadyMaster(); //wait so that active master takes over
+ } catch (IOException ex) {
+ // if we fail to start the initial active master, we do not want to continue stopping
+ // backup masters. Just keep what we have now
+ deferred.add(ex);
}
- stopMaster(current.getMaster());
- waitForActiveAndReadyMaster(); //wait so that active master takes over
+
//start backup masters
for (ServerName backup : initial.getBackupMasters()) {
- //these are not started in backup mode, but we should already have an active master
- if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, backup.getHostname())) {
- startMaster(backup.getHostname());
+ try {
+ //these are not started in backup mode, but we should already have an active master
+ if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, backup.getHostname())) {
+ LOG.info("Restoring cluster - starting initial backup master: " + backup.getHostname());
+ startMaster(backup.getHostname());
+ }
+ } catch (IOException ex) {
+ deferred.add(ex);
}
}
} else {
@@ -262,19 +290,38 @@ public class DistributedHBaseCluster ext
}
for (String hostname : Sets.difference(initialBackups.keySet(), currentBackups.keySet())) {
- if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, hostname)) {
- startMaster(hostname);
+ try {
+ if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, hostname)) {
+ LOG.info("Restoring cluster - starting initial backup master: " + hostname);
+ startMaster(hostname);
+ }
+ } catch (IOException ex) {
+ deferred.add(ex);
}
}
for (String hostname : Sets.difference(currentBackups.keySet(), initialBackups.keySet())) {
- if(clusterManager.isRunning(ServiceType.HBASE_MASTER, hostname)) {
- stopMaster(currentBackups.get(hostname));
+ try {
+ if(clusterManager.isRunning(ServiceType.HBASE_MASTER, hostname)) {
+ LOG.info("Restoring cluster - stopping backup master: " + hostname);
+ stopMaster(currentBackups.get(hostname));
+ }
+ } catch (IOException ex) {
+ deferred.add(ex);
}
}
}
+ if (!deferred.isEmpty()) {
+ LOG.warn("Restoring cluster - restoring region servers reported " + deferred.size() + " errors:");
+ for (int i=0; i<deferred.size() && i < 3; i++) {
+ LOG.warn(deferred.get(i));
+ }
+ }
- //restore region servers
+ return deferred.isEmpty();
+ }
+
+ protected boolean restoreRegionServers(ClusterStatus initial, ClusterStatus current) {
HashMap<String, ServerName> initialServers = new HashMap<String, ServerName>();
HashMap<String, ServerName> currentServers = new HashMap<String, ServerName>();
@@ -285,17 +332,39 @@ public class DistributedHBaseCluster ext
currentServers.put(server.getHostname(), server);
}
+ List<IOException> deferred = new ArrayList<IOException>();
for (String hostname : Sets.difference(initialServers.keySet(), currentServers.keySet())) {
- if(!clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER, hostname)) {
- startRegionServer(hostname);
+ try {
+ if(!clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER, hostname)) {
+ LOG.info("Restoring cluster - starting initial region server: " + hostname);
+ startRegionServer(hostname);
+ }
+ } catch (IOException ex) {
+ deferred.add(ex);
}
}
for (String hostname : Sets.difference(currentServers.keySet(), initialServers.keySet())) {
- if(clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER, hostname)) {
- stopRegionServer(currentServers.get(hostname));
+ try {
+ if(clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER, hostname)) {
+ LOG.info("Restoring cluster - stopping initial region server: " + hostname);
+ stopRegionServer(currentServers.get(hostname));
+ }
+ } catch (IOException ex) {
+ deferred.add(ex);
}
}
+ if (!deferred.isEmpty()) {
+ LOG.warn("Restoring cluster - restoring region servers reported " + deferred.size() + " errors:");
+ for (int i=0; i<deferred.size() && i < 3; i++) {
+ LOG.warn(deferred.get(i));
+ }
+ }
+
+ return deferred.isEmpty();
+ }
+
+ protected boolean restoreAdmin() throws IOException {
// While restoring above, if the HBase Master which was initially the Active one, was down
// and the restore put the cluster back to Initial configuration, HAdmin instance will need
// to refresh its connections (otherwise it will return incorrect information) or we can
@@ -303,9 +372,10 @@ public class DistributedHBaseCluster ext
try {
admin.close();
} catch (IOException ioe) {
- LOG.info("While closing the old connection", ioe);
+ LOG.warn("While closing the old connection", ioe);
}
this.admin = new HBaseAdmin(conf);
LOG.info("Added new HBaseAdmin");
+ return true;
}
}
Modified: hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java?rev=1529046&r1=1529045&r2=1529046&view=diff
==============================================================================
--- hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java (original)
+++ hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java Fri Oct 4 00:56:09 2013
@@ -226,16 +226,23 @@ public abstract class HBaseCluster imple
/**
* Restores the cluster to it's initial state if this is a real cluster,
* otherwise does nothing.
+ * This is a best-effort restore. If the servers are not reachable, permissions are
+ * insufficient, etc., the restoration might be partial.
+ * @return whether restoration is complete
*/
- public void restoreInitialStatus() throws IOException {
- restoreClusterStatus(getInitialClusterStatus());
+ public boolean restoreInitialStatus() throws IOException {
+ return restoreClusterStatus(getInitialClusterStatus());
}
/**
* Restores the cluster to given state if this is a real cluster,
* otherwise does nothing.
+ * This is a best-effort restore. If the servers are not reachable, permissions are
+ * insufficient, etc., the restoration might be partial.
+ * @return whether restoration is complete
*/
- public void restoreClusterStatus(ClusterStatus desiredStatus) throws IOException {
+ public boolean restoreClusterStatus(ClusterStatus desiredStatus) throws IOException {
+ return true;
}
/**