You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2010/12/05 05:51:47 UTC
svn commit: r1042283 - in /hbase/branches/0.90/src:
main/java/org/apache/hadoop/hbase/master/
main/java/org/apache/hadoop/hbase/zookeeper/
test/java/org/apache/hadoop/hbase/master/
Author: stack
Date: Sun Dec 5 04:51:46 2010
New Revision: 1042283
URL: http://svn.apache.org/viewvc?rev=1042283&view=rev
Log:
HBASE-3311 Addendum patch on HBASE-3298 broke build
Modified:
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java
hbase/branches/0.90/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java
Modified: hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java?rev=1042283&r1=1042282&r2=1042283&view=diff
==============================================================================
--- hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java (original)
+++ hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java Sun Dec 5 04:51:46 2010
@@ -126,13 +126,14 @@ class ActiveMasterManager extends ZooKee
this.watcher.masterAddressZNode, this.address)) {
// We are the master, return
this.clusterHasActiveMaster.set(true);
+ LOG.info("Master=" + this.address);
return cleanSetOfActiveMaster;
}
+ cleanSetOfActiveMaster = false;
// There is another active master running elsewhere or this is a restart
// and the master ephemeral node has not expired yet.
this.clusterHasActiveMaster.set(true);
- cleanSetOfActiveMaster = false;
HServerAddress currentMaster =
ZKUtil.getDataAsAddress(this.watcher, this.watcher.masterAddressZNode);
if (currentMaster != null && currentMaster.equals(this.address)) {
Modified: hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1042283&r1=1042282&r2=1042283&view=diff
==============================================================================
--- hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original)
+++ hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Sun Dec 5 04:51:46 2010
@@ -25,7 +25,6 @@ import java.io.IOException;
import java.net.ConnectException;
import java.util.ArrayList;
import java.util.HashMap;
-import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -55,6 +54,7 @@ import org.apache.hadoop.hbase.client.Re
import org.apache.hadoop.hbase.executor.EventHandler.EventType;
import org.apache.hadoop.hbase.executor.ExecutorService;
import org.apache.hadoop.hbase.executor.RegionTransitionData;
+import org.apache.hadoop.hbase.master.AssignmentManager.RegionState;
import org.apache.hadoop.hbase.master.LoadBalancer.RegionPlan;
import org.apache.hadoop.hbase.master.handler.ClosedRegionHandler;
import org.apache.hadoop.hbase.master.handler.OpenedRegionHandler;
@@ -333,6 +333,10 @@ public class AssignmentManager extends Z
*/
private void handleRegion(final RegionTransitionData data) {
synchronized(regionsInTransition) {
+ if (data == null || data.getServerName() == null) {
+ LOG.warn("Unexpected NULL input " + data);
+ return;
+ }
// Verify this is a known server
if (!serverManager.isServerOnline(data.getServerName()) &&
!this.master.getServerName().equals(data.getServerName())) {
@@ -1039,47 +1043,39 @@ public class AssignmentManager extends Z
region.getRegionNameAsString());
return;
}
- LOG.debug("Server " + server + " region CLOSE RPC returned false");
+ // This never happens. Currently regionserver close always return true.
+ LOG.debug("Server " + server + " region CLOSE RPC returned false for " +
+ region.getEncodedName());
} catch (NotServingRegionException nsre) {
- // Failed to close, so pass through and reassign
LOG.info("Server " + server + " returned " + nsre + " for " +
region.getEncodedName());
+ // Presume that master has stale data. Presume remote side just split.
+ // Presume that the split message when it comes in will fix up the master's
+ // in memory cluster state.
+ return;
} catch (ConnectException e) {
- // Failed to connect, so pass through and reassign
+ LOG.info("Failed connect to " + server + ", message=" + e.getMessage() +
+ ", region=" + region.getEncodedName());
+ // Presume that regionserver just failed and we haven't got expired
+ // server from zk yet. Let expired server deal with clean up.
+ } catch (java.net.SocketTimeoutException e) {
LOG.info("Server " + server + " returned " + e.getMessage() + " for " +
region.getEncodedName());
- } catch (java.net.SocketTimeoutException e) {
- // Failed to connect, so pass through and reassign
- LOG.info("Server " + server + " returned " + e.getMessage());
+ // Presume retry or server will expire.
} catch (RemoteException re) {
- if (re.unwrapRemoteException() instanceof NotServingRegionException) {
+ IOException ioe = re.unwrapRemoteException();
+ if (ioe instanceof NotServingRegionException) {
// Failed to close, so pass through and reassign
- LOG.debug("Server " + server + " returned NotServingRegionException");
+ LOG.debug("Server " + server + " returned " + ioe + " for " +
+ region.getEncodedName());
} else {
- this.master.abort("Remote unexpected exception",
- re.unwrapRemoteException());
+ this.master.abort("Remote unexpected exception", ioe);
}
} catch (Throwable t) {
// For now call abort if unexpected exception -- radical, but will get
// fellas attention. St.Ack 20101012
this.master.abort("Remote unexpected exception", t);
}
- /* This looks way wrong at least for the case where close failed because
- * it was being concurrently split. It also looks wrong for case where
- * we cannot connect to remote server. In that case, let the server
- * expiration do the fixup. I'm leaving this code here commented out for
- * the moment in case I've missed something and this code is actually needed.
- * St.Ack 12/04/2010.
- *
- // Did not CLOSE, so set region offline and assign it
- LOG.debug("Attempted to send CLOSE to " + server +
- " for region " + region.getRegionNameAsString() + " but failed, " +
- "setting region as OFFLINE and reassigning");
- synchronized (regionsInTransition) {
- forceRegionStateToOffline(region);
- }
- assign(region, true);
- */
}
/**
@@ -1555,8 +1551,8 @@ public class AssignmentManager extends Z
// Attempt to transition node into OFFLINE
try {
data = new RegionTransitionData(
- EventType.M_ZK_REGION_OFFLINE,
- regionInfo.getRegionName());
+ EventType.M_ZK_REGION_OFFLINE, regionInfo.getRegionName(),
+ master.getServerName());
if (ZKUtil.setData(watcher, node, data.getBytes(),
stat.getVersion())) {
// Node is now OFFLINE, let's trigger another assignment
Modified: hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1042283&r1=1042282&r2=1042283&view=diff
==============================================================================
--- hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Sun Dec 5 04:51:46 2010
@@ -155,7 +155,7 @@ implements HMasterInterface, HMasterRegi
// flag set after we become the active master (used for testing)
private volatile boolean isActiveMaster = false;
// flag set after we complete initialization once active (used for testing)
- private volatile boolean isInitialized = false;
+ private volatile boolean initialized = false;
// Instance of the hbase executor service.
ExecutorService executorService;
@@ -399,7 +399,7 @@ implements HMasterInterface, HMasterRegi
Threads.setDaemonThreadRunning(new CatalogJanitor(this, this));
LOG.info("Master has completed initialization");
- isInitialized = true;
+ initialized = true;
}
/**
@@ -997,7 +997,7 @@ implements HMasterInterface, HMasterRegi
* @return true if master is ready to go, false if not.
*/
public boolean isInitialized() {
- return isInitialized;
+ return initialized;
}
@Override
Modified: hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=1042283&r1=1042282&r2=1042283&view=diff
==============================================================================
--- hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Sun Dec 5 04:51:46 2010
@@ -578,17 +578,13 @@ public class ServerManager {
*/
public boolean sendRegionClose(HServerInfo server, HRegionInfo region)
throws IOException {
- if (server == null) {
- LOG.debug("Unable to send region close because server is null; region=" +
- region.getRegionNameAsString());
- return false;
- }
+ if (server == null) throw new NullPointerException("Passed server is null");
HRegionInterface hri = getServerConnection(server);
- if(hri == null) {
- LOG.warn("Attempting to send CLOSE RPC to server " +
- server.getServerName() + " for region " + region.getRegionNameAsString()
- + " failed because no RPC connection found to this server");
- return false;
+ if (hri == null) {
+ throw new IOException("Attempting to send CLOSE RPC to server " +
+ server.getServerName() + " for region " +
+ region.getRegionNameAsString() +
+ " failed because no RPC connection found to this server");
}
return hri.closeRegion(region);
}
Modified: hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java?rev=1042283&r1=1042282&r2=1042283&view=diff
==============================================================================
--- hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java (original)
+++ hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java Sun Dec 5 04:51:46 2010
@@ -294,8 +294,6 @@ public class ZooKeeperWatcher implements
private void connectionEvent(WatchedEvent event) {
switch(event.getState()) {
case SyncConnected:
- // Update our identifier. Otherwise ignore.
- LOG.debug(this.identifier + " connected");
// Now, this callback can be invoked before the this.zookeeper is set.
// Wait a little while.
long finished = System.currentTimeMillis() +
@@ -312,6 +310,8 @@ public class ZooKeeperWatcher implements
}
this.identifier = this.identifier + "-0x" +
Long.toHexString(this.zooKeeper.getSessionId());
+ // Update our identifier. Otherwise ignore.
+ LOG.debug(this.identifier + " connected");
break;
// Abort the server if Disconnected or Expired
Modified: hbase/branches/0.90/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java?rev=1042283&r1=1042282&r2=1042283&view=diff
==============================================================================
--- hbase/branches/0.90/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java (original)
+++ hbase/branches/0.90/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java Sun Dec 5 04:51:46 2010
@@ -230,12 +230,11 @@ public class TestMasterFailover {
// Create a ZKW to use in the test
ZooKeeperWatcher zkw = new ZooKeeperWatcher(TEST_UTIL.getConfiguration(),
- "unittest", new Abortable() {
- @Override
- public void abort(String why, Throwable e) {
- LOG.error("Fatal ZK Error: " + why, e);
- org.junit.Assert.assertFalse("Fatal ZK error", true);
- }
+ "unittest", new Abortable() {
+ @Override
+ public void abort(String why, Throwable e) {
+ throw new RuntimeException("Fatal ZK error, why=" + why, e);
+ }
});
// get all the master threads
@@ -820,22 +819,40 @@ public class TestMasterFailover {
master.assignmentManager.regionsInTransition.put(region.getEncodedName(),
new RegionState(region, RegionState.State.PENDING_OPEN, 0));
ZKAssign.createNodeOffline(zkw, region, master.getServerName());
+ // This test is bad. It puts up a PENDING_CLOSE but doesn't say what
+ // server we were PENDING_CLOSE against -- i.e. an entry in
+ // AssignmentManager#regions. W/o a server, we NPE trying to resend close.
+ // In past, there was wonky logic that had us reassign region if no server
+ // at tail of the unassign. This was removed. Commenting out for now.
+ // TODO: Remove completely.
+ /*
// PENDING_CLOSE and enabled
region = enabledRegions.remove(0);
+ LOG.info("Setting PENDING_CLOSE enabled " + region.getEncodedName());
regionsThatShouldBeOnline.add(region);
master.assignmentManager.regionsInTransition.put(region.getEncodedName(),
- new RegionState(region, RegionState.State.PENDING_CLOSE, 0));
+ new RegionState(region, RegionState.State.PENDING_CLOSE, 0));
// PENDING_CLOSE and disabled
region = disabledRegions.remove(0);
+ LOG.info("Setting PENDING_CLOSE disabled " + region.getEncodedName());
regionsThatShouldBeOffline.add(region);
master.assignmentManager.regionsInTransition.put(region.getEncodedName(),
- new RegionState(region, RegionState.State.PENDING_CLOSE, 0));
+ new RegionState(region, RegionState.State.PENDING_CLOSE, 0));
+ */
// Failover should be completed, now wait for no RIT
log("Waiting for no more RIT");
ZKAssign.blockUntilNoRIT(zkw);
log("No more RIT in ZK");
- master.assignmentManager.waitUntilNoRegionsInTransition(120000);
+ long now = System.currentTimeMillis();
+ final long maxTime = 120000;
+ boolean done = master.assignmentManager.waitUntilNoRegionsInTransition(maxTime);
+ if (!done) {
+ LOG.info("rit=" + master.assignmentManager.getRegionsInTransition());
+ }
+ long elapsed = System.currentTimeMillis() - now;
+ assertTrue("Elapsed=" + elapsed + ", maxTime=" + maxTime + ", done=" + done,
+ elapsed < maxTime);
log("No more RIT in RIT map, doing final test verification");
// Grab all the regions that are online across RSs
@@ -847,7 +864,7 @@ public class TestMasterFailover {
// Now, everything that should be online should be online
for (HRegionInfo hri : regionsThatShouldBeOnline) {
- assertTrue(onlineRegions.contains(hri));
+ assertTrue("region=" + hri.getRegionNameAsString(), onlineRegions.contains(hri));
}
// Everything that should be offline should not be online