You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2009/01/07 22:03:20 UTC
svn commit: r732491 - in /hadoop/hbase/trunk: ./
src/java/org/apache/hadoop/hbase/ src/java/org/apache/hadoop/hbase/master/
Author: stack
Date: Wed Jan 7 13:03:20 2009
New Revision: 732491
URL: http://svn.apache.org/viewvc?rev=732491&view=rev
Log:
HBASE-1099 Regions assigned while master is splitting logs of recently crashed server; regionserver tries to execute incomplete log
Modified:
hadoop/hbase/trunk/CHANGES.txt
hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/Leases.java
hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/BaseScanner.java
hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ChangeTableState.java
hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/HMaster.java
hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java
hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/RegionManager.java
hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ServerManager.java
Modified: hadoop/hbase/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/CHANGES.txt?rev=732491&r1=732490&r2=732491&view=diff
==============================================================================
--- hadoop/hbase/trunk/CHANGES.txt (original)
+++ hadoop/hbase/trunk/CHANGES.txt Wed Jan 7 13:03:20 2009
@@ -131,6 +131,8 @@
HBASE-1083 Will keep scheduling major compactions if last time one ran, we
didn't.
HBASE-1101 NPE in HConnectionManager$TableServers.processBatchOfRows
+ HBASE-1099 Regions assigned while master is splitting logs of recently
+ crashed server; regionserver tries to execute incomplete log
IMPROVEMENTS
HBASE-901 Add a limit to key length, check key and value length on client side
Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/Leases.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/Leases.java?rev=732491&r1=732490&r2=732491&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/Leases.java (original)
+++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/Leases.java Wed Jan 7 13:03:20 2009
@@ -55,7 +55,6 @@
private final int leasePeriod;
private final int leaseCheckFrequency;
private volatile DelayQueue<Lease> leaseQueue = new DelayQueue<Lease>();
-
protected final Map<String, Lease> leases = new HashMap<String, Lease>();
private volatile boolean stopRequested = false;
@@ -88,15 +87,16 @@
if (lease == null) {
continue;
}
- // A lease expired
+ // A lease expired. Run the expired code before removing from queue
+ // since its presence in queue is used to see if lease exists still.
+ if (lease.getListener() == null) {
+ LOG.error("lease listener is null for lease " + lease.getLeaseName());
+ } else {
+ lease.getListener().leaseExpired();
+ }
synchronized (leaseQueue) {
leases.remove(lease.getLeaseName());
- if (lease.getListener() == null) {
- LOG.error("lease listener is null for lease " + lease.getLeaseName());
- continue;
- }
}
- lease.getListener().leaseExpired();
}
close();
}
Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/BaseScanner.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/BaseScanner.java?rev=732491&r1=732490&r2=732491&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/BaseScanner.java (original)
+++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/BaseScanner.java Wed Jan 7 13:03:20 2009
@@ -338,17 +338,16 @@
throws IOException {
synchronized (regionManager) {
- // Skip region - if ...
- if(info.isOffline() // offline
- || regionManager.isOfflined(info.getRegionName())) { // queued for offline
-
+ // Skip region - if
+ if(info.isOffline() ||
+ regionManager.isOfflined(info.getRegionName())) { // queued for offline
regionManager.removeRegion(info);
return;
}
HServerInfo storedInfo = null;
+ boolean deadServerAndLogsSplit = false;
boolean deadServer = false;
if (serverName.length() != 0) {
-
if (regionManager.isOfflined(info.getRegionName())) {
// Skip if region is on kill list
if(LOG.isDebugEnabled()) {
@@ -357,31 +356,31 @@
}
return;
}
-
- storedInfo = master.serverManager.getServerInfo(serverName);
- deadServer = master.serverManager.isDead(serverName);
+ storedInfo = this.master.serverManager.getServerInfo(serverName);
+ deadServer = this.master.serverManager.isDead(serverName);
+ deadServerAndLogsSplit =
+ this.master.serverManager.isDeadServerLogsSplit(serverName);
}
/*
- * If the server is a dead server or its startcode is off -- either null
+ * If the server is a dead server and its logs have been split or its
+ * not on the dead server lists and its startcode is off -- either null
* or doesn't match the start code for the address -- then add it to the
* list of unassigned regions IF not already there (or pending open).
*/
- if ((deadServer ||
- (storedInfo == null || storedInfo.getStartCode() != startCode)) &&
- (!regionManager.isUnassigned(info) &&
- !regionManager.isPending(info.getRegionName()) &&
- !regionManager.isAssigned(info.getRegionName()))) {
-
+ if ((deadServerAndLogsSplit ||
+ (!deadServer && (storedInfo == null ||
+ (storedInfo.getStartCode() != startCode)))) &&
+ this.regionManager.assignable(info)) {
// The current assignment is invalid
-
if (LOG.isDebugEnabled()) {
- LOG.debug("Current assignment of " +
- info.getRegionNameAsString() +
- " is not valid." +
- (storedInfo == null ? " Server '" + serverName + "' unknown." :
+ LOG.debug("Current assignment of " + info.getRegionNameAsString() +
+ " is not valid; deadServerAndLogsSplit=" + deadServerAndLogsSplit +
+ ", deadServer=" + deadServer + ". " +
+ (storedInfo == null ? " Server '" + serverName + "' unknown." :
" serverInfo: " + storedInfo + ", passed startCode: " +
- startCode + ", storedInfo.startCode: " + storedInfo.getStartCode()) +
+ startCode + ", storedInfo.startCode: " +
+ storedInfo.getStartCode()) +
" Region is not unassigned, assigned or pending");
}
@@ -389,7 +388,6 @@
// This is only done from here if we are restarting and there is stale
// data in the meta region. Once we are on-line, dead server log
// recovery is handled by lease expiration and ProcessServerShutdown
-
if (!regionManager.isInitialMetaScanComplete() &&
serverName.length() != 0) {
StringBuilder dirName = new StringBuilder("log_");
@@ -418,7 +416,7 @@
}
}
}
-
+
/**
* Notify the thread to die at the end of its next run
*/
Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ChangeTableState.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ChangeTableState.java?rev=732491&r1=732490&r2=732491&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ChangeTableState.java (original)
+++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ChangeTableState.java Wed Jan 7 13:03:20 2009
@@ -91,9 +91,7 @@
synchronized (master.regionManager) {
if (online) {
// Bring offline regions on-line
- if (!master.regionManager.isUnassigned(i) &&
- !master.regionManager.isAssigned(i.getRegionName()) &&
- !master.regionManager.isPending(i.getRegionName())) {
+ if (!master.regionManager.assignable(i)) {
master.regionManager.setUnassigned(i, false);
}
} else {
Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/HMaster.java?rev=732491&r1=732490&r2=732491&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/HMaster.java Wed Jan 7 13:03:20 2009
@@ -532,15 +532,14 @@
/*
* HMasterRegionInterface
*/
- public MapWritable regionServerStartup(HServerInfo serverInfo) {
- // Set the address for now even tho it will not be persisted on
- // the HRS side.
+ public MapWritable regionServerStartup(final HServerInfo serverInfo) {
+ // Set the address for now even tho it will not be persisted on HRS side.
String rsAddress = HBaseServer.getRemoteAddress();
- serverInfo.setServerAddress(new HServerAddress
- (rsAddress, serverInfo.getServerAddress().getPort()));
- // register with server manager
- serverManager.regionServerStartup(serverInfo);
- // send back some config info
+ serverInfo.setServerAddress(new HServerAddress(rsAddress,
+ serverInfo.getServerAddress().getPort()));
+ // Register with server manager
+ this.serverManager.regionServerStartup(serverInfo);
+ // Send back some config info
return createConfigurationSubset();
}
Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java?rev=732491&r1=732490&r2=732491&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java (original)
+++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java Wed Jan 7 13:03:20 2009
@@ -45,11 +45,13 @@
*/
class ProcessServerShutdown extends RegionServerOperation {
private final HServerAddress deadServer;
- private final String deadServerName;
+ /*
+ * Cache of the server name.
+ */
+ private final String deadServerStr;
private final boolean rootRegionServer;
private boolean rootRegionReassigned = false;
private Path oldLogDir;
- private boolean logSplit;
private boolean rootRescanned;
@@ -74,9 +76,8 @@
boolean rootRegionServer) {
super(master);
this.deadServer = serverInfo.getServerAddress();
- this.deadServerName = this.deadServer.toString();
+ this.deadServerStr = this.deadServer.toString();
this.rootRegionServer = rootRegionServer;
- this.logSplit = false;
this.rootRescanned = false;
this.oldLogDir =
new Path(master.rootdir, HLog.getHLogDirectoryName(serverInfo));
@@ -84,13 +85,14 @@
@Override
public String toString() {
- return "ProcessServerShutdown of " + this.deadServer.toString();
+ return "ProcessServerShutdown of " + this.deadServerStr;
}
- /** Finds regions that the dead region server was serving */
+ /** Finds regions that the dead region server was serving
+ */
protected void scanMetaRegion(HRegionInterface server, long scannerId,
- byte [] regionName) throws IOException {
-
+ byte [] regionName)
+ throws IOException {
List<ToDoEntry> toDoList = new ArrayList<ToDoEntry>();
Set<HRegionInfo> regions = new HashSet<HRegionInfo>();
List<byte []> emptyRows = new ArrayList<byte []>();
@@ -107,14 +109,13 @@
if (values == null || values.size() == 0) {
break;
}
-
byte [] row = values.getRow();
- // Check server name. If null, be conservative and treat as though
- // region had been on shutdown server (could be null because we
- // missed edits in hlog because hdfs does not do write-append).
+ // Check server name. If null, skip (We used to consider it was on
+ // shutdown server but that would mean that we'd reassign regions that
+ // were already out being assigned, ones that were product of a split
+ // that happened while the shutdown was being processed.
String serverName = Writables.cellToString(values.get(COL_SERVER));
- if (serverName != null && serverName.length() > 0 &&
- deadServerName.compareTo(serverName) != 0) {
+ if (serverName == null || !deadServerStr.equals(serverName)) {
// This isn't the server you're looking for - move along
continue;
}
@@ -159,7 +160,7 @@
}
}
} finally {
- if(scannerId != -1L) {
+ if (scannerId != -1L) {
try {
server.close(scannerId);
} catch (IOException e) {
@@ -222,21 +223,22 @@
long scannerId =
server.openScanner(m.getRegionName(), COLUMN_FAMILY_ARRAY,
EMPTY_START_ROW, HConstants.LATEST_TIMESTAMP, null);
-
- scanMetaRegion(server, scannerId, m.getRegionName());
+ scanMetaRegion(server, scannerId, m.getRegionName());
return true;
}
}
@Override
protected boolean process() throws IOException {
- LOG.info("process shutdown of server " + deadServer + ": logSplit: " +
- this.logSplit + ", rootRescanned: " + rootRescanned +
+ boolean logSplit =
+ this.master.serverManager.isDeadServerLogsSplit(this.deadServerStr);
+ LOG.info("process shutdown of server " + this.deadServerStr +
+ ": logSplit: " +
+ logSplit + ", rootRescanned: " + rootRescanned +
", numberOfMetaRegions: " +
master.regionManager.numMetaRegions() +
", onlineMetaRegions.size(): " +
master.regionManager.numOnlineMetaRegions());
-
if (!logSplit) {
// Process the old log file
if (master.fs.exists(oldLogDir)) {
@@ -250,9 +252,9 @@
master.regionManager.splitLogLock.unlock();
}
}
- logSplit = true;
+ this.master.serverManager.setDeadServerLogsSplit(this.deadServerStr);
}
-
+
if (this.rootRegionServer && !this.rootRegionReassigned) {
// avoid multiple root region reassignment
this.rootRegionReassigned = true;
@@ -277,7 +279,6 @@
new MetaRegion(master.getRootRegionLocation(),
HRegionInfo.ROOT_REGIONINFO.getRegionName(),
HConstants.EMPTY_START_ROW), this.master).doWithRetries();
-
if (result == null) {
// Master is closing - give up
return true;
@@ -290,7 +291,6 @@
}
rootRescanned = true;
}
-
if (!metaTableAvailable()) {
// We can't proceed because not all meta regions are online.
// metaAvailable() has put this request on the delayedToDoQueue
@@ -309,7 +309,11 @@
Bytes.toString(r.getRegionName()) + " on " + r.getServer());
}
}
- master.serverManager.removeDeadServer(deadServerName);
+ // Remove this server from dead servers list. Finished splitting logs.
+ this.master.serverManager.removeDeadServer(deadServerStr);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Removed " + deadServerStr + " from deadservers Map");
+ }
return true;
}
-}
+}
\ No newline at end of file
Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/RegionManager.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/RegionManager.java?rev=732491&r1=732490&r2=732491&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/RegionManager.java (original)
+++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/RegionManager.java Wed Jan 7 13:03:20 2009
@@ -694,6 +694,17 @@
}
return false;
}
+
+ /**
+ * @param hri
+ * @return True if the passed region is assignable: i.e. not assigned, not
+ * pending and not unassigned.
+ */
+ public boolean assignable(final HRegionInfo hri) {
+ return !isUnassigned(hri) &&
+ !isPending(hri.getRegionName()) &&
+ !isAssigned(hri.getRegionName());
+ }
/**
* @param regionName
Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=732491&r1=732490&r2=732491&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/master/ServerManager.java Wed Jan 7 13:03:20 2009
@@ -59,9 +59,13 @@
final Map<String, HServerInfo> serversToServerInfo =
new ConcurrentHashMap<String, HServerInfo>();
- /** Set of known dead servers */
- final Set<String> deadServers =
- Collections.synchronizedSet(new HashSet<String>());
+ /**
+ * Set of known dead servers. On lease expiration, servers are added here.
+ * Boolean holds whether its logs have been split or not. Initially set to
+ * false.
+ */
+ private final Map<String, Boolean> deadServers =
+ new ConcurrentHashMap<String, Boolean>();
/** SortedMap server load -> Set of server names */
final SortedMap<HServerLoad, Set<String>> loadToServers =
@@ -89,24 +93,67 @@
this.loggingPeriodForAverageLoad = master.getConfiguration().
getLong("hbase.master.avgload.logging.period", 60000);
}
-
+
+ /*
+ * Look to see if we have ghost references to this regionserver such as
+ * still-existing leases or if regionserver is on the dead servers list
+ * getting its logs processed.
+ * @param serverInfo
+ * @return True if still ghost references and we have not been able to clear
+ * them or the server is shutting down.
+ */
+ private boolean checkForGhostReferences(final HServerInfo serverInfo) {
+ String s = serverInfo.getServerAddress().toString().trim();
+ boolean result = false;
+ boolean lease = false;
+ for (long sleepTime = -1; !master.closed.get() && !result;) {
+ if (sleepTime != -1) {
+ try {
+ Thread.sleep(sleepTime);
+ } catch (InterruptedException e) {
+ // Continue
+ }
+ }
+ if (!lease) {
+ try {
+ this.serverLeases.createLease(s, new ServerExpirer(s));
+ } catch (Leases.LeaseStillHeldException e) {
+ LOG.debug("Waiting on current lease to expire for " + e.getName());
+ sleepTime = this.master.leaseTimeout / 4;
+ continue;
+ }
+ lease = true;
+ }
+ // May be on list of dead servers. If so, wait till we've cleared it.
+ String addr = serverInfo.getServerAddress().toString();
+ if (isDead(addr) && !isDeadServerLogsSplit(addr)) {
+ LOG.debug("Waiting on " + addr + " removal from dead list before " +
+ "processing report-for-duty request");
+ sleepTime = this.master.threadWakeFrequency;
+ try {
+ // Keep up lease. May be here > lease expiration.
+ this.serverLeases.renewLease(s);
+ } catch (LeaseException e) {
+ LOG.warn("Failed renewal. Retrying.", e);
+ }
+ continue;
+ }
+ result = true;
+ }
+ return result;
+ }
+
/**
* Let the server manager know a new regionserver has come online
* @param serverInfo
*/
- public void regionServerStartup(HServerInfo serverInfo) {
+ public void regionServerStartup(final HServerInfo serverInfo) {
String s = serverInfo.getServerAddress().toString().trim();
LOG.info("Received start message from: " + s);
- // Do the lease check up here. There might already be one out on this
- // server expecially if it just shutdown and came back up near-immediately.
- if (!master.closed.get()) {
- try {
- serverLeases.createLease(s, new ServerExpirer(s));
- } catch (Leases.LeaseStillHeldException e) {
- LOG.debug("Lease still held on " + e.getName());
- return;
- }
+ if (!checkForGhostReferences(serverInfo)) {
+ return;
}
+ // Go on to process the regionserver registration.
HServerLoad load = serversToLoad.remove(s);
if (load != null) {
// The startup message was from a known server.
@@ -119,7 +166,6 @@
}
}
}
-
HServerInfo storedInfo = serversToServerInfo.remove(s);
if (storedInfo != null && !master.closed.get()) {
// The startup message was from a known server with the same name.
@@ -137,7 +183,6 @@
LOG.error("Insertion into toDoQueue was interrupted", e);
}
}
-
// record new server
load = new HServerLoad();
serverInfo.setLoad(load);
@@ -703,7 +748,7 @@
}
}
}
- deadServers.add(server);
+ deadServers.put(server, Boolean.FALSE);
try {
master.toDoQueue.put(
new ProcessServerShutdown(master, info, rootServer));
@@ -742,6 +787,23 @@
* @return true if server is dead
*/
public boolean isDead(String serverName) {
- return deadServers.contains(serverName);
+ return deadServers.containsKey(serverName);
+ }
+
+ /**
+ * @param serverName
+ * @return True if this is a dead server and it has had its logs split.
+ */
+ public boolean isDeadServerLogsSplit(final String serverName) {
+ Boolean b = this.deadServers.get(serverName);
+ return b == null? false: b.booleanValue();
+ }
+
+ /**
+ * Set that this deadserver has had its log split.
+ * @param serverName
+ */
+ public void setDeadServerLogsSplit(final String serverName) {
+ this.deadServers.put(serverName, Boolean.TRUE);
}
-}
\ No newline at end of file
+}