You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2009/09/22 18:32:41 UTC
svn commit: r817719 - in /hadoop/hbase/trunk: ./
src/java/org/apache/hadoop/hbase/client/
src/java/org/apache/hadoop/hbase/ipc/
src/java/org/apache/hadoop/hbase/regionserver/
Author: stack
Date: Tue Sep 22 16:32:41 2009
New Revision: 817719
URL: http://svn.apache.org/viewvc?rev=817719&view=rev
Log:
HBASE-1815 HBaseClient can get stuck in an infinite loop while attempting to contact a failed regionserver
Modified:
hadoop/hbase/trunk/CHANGES.txt
hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java
hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/client/RetriesExhaustedException.java
hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/ipc/HBaseClient.java
hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/ipc/HBaseRPC.java
hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
Modified: hadoop/hbase/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/CHANGES.txt?rev=817719&r1=817718&r2=817719&view=diff
==============================================================================
--- hadoop/hbase/trunk/CHANGES.txt (original)
+++ hadoop/hbase/trunk/CHANGES.txt Tue Sep 22 16:32:41 2009
@@ -36,6 +36,8 @@
HBASE-1850 src/examples/mapred do not compile after HBASE-1822
HBASE-1853 Each time around the regionserver core loop, we clear the
messages to pass master, even if we failed to deliver them
+ HBASE-1815 HBaseClient can get stuck in an infinite loop while attempting
+ to contact a failed regionserver
IMPROVEMENTS
HBASE-1760 Cleanup TODOs in HTable
Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java?rev=817719&r1=817718&r2=817719&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java (original)
+++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java Tue Sep 22 16:32:41 2009
@@ -329,6 +329,8 @@
} catch (IOException e) {
if (tries == numRetries - 1) {
// This was our last chance - don't bother sleeping
+ LOG.info("getMaster attempt " + tries + " of " + this.numRetries +
+ " failed; no more retrying.", e);
break;
}
LOG.info("getMaster attempt " + tries + " of " + this.numRetries +
@@ -847,7 +849,7 @@
public HRegionInterface getHRegionConnection(
HServerAddress regionServer, boolean getMaster)
throws IOException {
- if(getMaster) {
+ if (getMaster) {
getMaster();
}
HRegionInterface server;
@@ -925,9 +927,9 @@
"Timed out trying to locate root region");
}
- // get a connection to the region server
- HRegionInterface server = getHRegionConnection(rootRegionAddress);
try {
+ // Get a connection to the region server
+ HRegionInterface server = getHRegionConnection(rootRegionAddress);
// if this works, then we're good, and we have an acceptable address,
// so we can stop doing retries and return the result.
server.getRegionInfo(HRegionInfo.ROOT_REGIONINFO.getRegionName());
Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/client/RetriesExhaustedException.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/client/RetriesExhaustedException.java?rev=817719&r1=817718&r2=817719&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/client/RetriesExhaustedException.java (original)
+++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/client/RetriesExhaustedException.java Tue Sep 22 16:32:41 2009
@@ -26,6 +26,11 @@
*/
public class RetriesExhaustedException extends IOException {
private static final long serialVersionUID = 1876775844L;
+
+ public RetriesExhaustedException(final String msg) {
+ super(msg);
+ }
+
/**
* Create a new RetriesExhaustedException from the list of prior failures.
* @param serverName name of HRegionServer
@@ -35,11 +40,9 @@
* @param exceptions List of exceptions that failed before giving up
*/
public RetriesExhaustedException(String serverName, final byte [] regionName,
- final byte [] row,
- int numTries, List<Throwable> exceptions) {
+ final byte [] row, int numTries, List<Throwable> exceptions) {
super(getMessage(serverName, regionName, row, numTries, exceptions));
}
-
private static String getMessage(String serverName, final byte [] regionName,
final byte [] row,
@@ -59,4 +62,4 @@
}
return buffer.toString();
}
-}
+}
\ No newline at end of file
Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/ipc/HBaseClient.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/ipc/HBaseClient.java?rev=817719&r1=817718&r2=817719&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/ipc/HBaseClient.java (original)
+++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/ipc/HBaseClient.java Tue Sep 22 16:32:41 2009
@@ -76,6 +76,7 @@
final protected int maxIdleTime; //connections will be culled if it was idle for
//maxIdleTime msecs
final protected int maxRetries; //the max. no. of retries for socket connections
+ final protected long failureSleep; // Time to sleep before retry on failure.
protected boolean tcpNoDelay; // if T then disable Nagle's Algorithm
protected boolean tcpKeepAlive; // if T then use keepalives
protected int pingInterval; // how often sends ping to the server in msecs
@@ -308,10 +309,7 @@
this.socket.setSoTimeout(pingInterval);
break;
} catch (SocketTimeoutException toe) {
- /* The max number of retries is 45,
- * which amounts to 20s*45 = 15 minutes retries.
- */
- handleConnectionFailure(timeoutFailures++, 45, toe);
+ handleConnectionFailure(timeoutFailures++, maxRetries, toe);
} catch (IOException ie) {
handleConnectionFailure(ioFailures++, maxRetries, ie);
}
@@ -338,7 +336,7 @@
/* Handle connection failures
*
* If the current number of retries is equal to the max number of retries,
- * stop retrying and throw the exception; Otherwise backoff 1 second and
+ * stop retrying and throw the exception; Otherwise backoff N seconds and
* try connecting again.
*
* This Method is only called from inside setupIOstreams(), which is
@@ -368,11 +366,12 @@
// otherwise back off and retry
try {
- Thread.sleep(1000);
+ Thread.sleep(failureSleep);
} catch (InterruptedException ignored) {}
LOG.info("Retrying connect to server: " + remoteId.getAddress() +
- ". Already tried " + curRetries + " time(s).");
+ " after sleeping " + failureSleep + "ms. Already tried " + curRetries +
+ " time(s).");
}
/* Write the header for each connection
@@ -636,10 +635,11 @@
SocketFactory factory) {
this.valueClass = valueClass;
this.maxIdleTime =
- conf.getInt("ipc.client.connection.maxidletime", 10000); //10s
- this.maxRetries = conf.getInt("ipc.client.connect.max.retries", 10);
- this.tcpNoDelay = conf.getBoolean("ipc.client.tcpnodelay", false);
- this.tcpKeepAlive = conf.getBoolean("ipc.client.tcpkeepalive", true);
+ conf.getInt("hbase.ipc.client.connection.maxidletime", 10000); //10s
+ this.maxRetries = conf.getInt("hbase.ipc.client.connect.max.retries", 0);
+ this.failureSleep = conf.getInt("hbase.client.pause", 2000);
+ this.tcpNoDelay = conf.getBoolean("hbase.ipc.client.tcpnodelay", false);
+ this.tcpKeepAlive = conf.getBoolean("hbase.ipc.client.tcpkeepalive", true);
this.pingInterval = getPingInterval(conf);
if (LOG.isDebugEnabled()) {
LOG.debug("The ping interval is" + this.pingInterval + "ms.");
Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/ipc/HBaseRPC.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/ipc/HBaseRPC.java?rev=817719&r1=817718&r2=817719&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/ipc/HBaseRPC.java (original)
+++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/ipc/HBaseRPC.java Tue Sep 22 16:32:41 2009
@@ -417,14 +417,12 @@
try {
return getProxy(protocol, clientVersion, addr, conf);
} catch(ConnectException se) { // namenode has not been started
- LOG.info("Server at " + addr + " not available yet, Zzzzz...");
ioe = se;
if (maxAttempts >= 0 && ++reconnectAttempts >= maxAttempts) {
LOG.info("Server at " + addr + " could not be reached after " +
- reconnectAttempts + " tries, giving up.");
- throw new RetriesExhaustedException(addr.toString(), "unknown".getBytes(),
- "unknown".getBytes(), reconnectAttempts - 1,
- new ArrayList<Throwable>());
+ reconnectAttempts + " tries, giving up.");
+ throw new RetriesExhaustedException("Failed setting up proxy to " +
+ addr.toString() + " after attempts=" + reconnectAttempts);
}
} catch(SocketTimeoutException te) { // namenode is busy
LOG.info("Problem connecting to server: " + addr);
Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=817719&r1=817718&r2=817719&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original)
+++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Tue Sep 22 16:32:41 2009
@@ -567,24 +567,20 @@
if (e instanceof IOException) {
e = RemoteExceptionHandler.checkIOException((IOException) e);
}
- if (tries < this.numRetries) {
- LOG.warn("Processing message (Retry: " + tries + ")", e);
- tries++;
- } else {
- LOG.error("Exceeded max retries: " + this.numRetries, e);
- if (checkFileSystem()) {
- // Filesystem is OK. Something is up w/ ZK or master. Sleep
- // a little while if only to stop our logging many times a
- // millisecond.
- Thread.sleep(1000);
- }
+ tries++;
+ if (tries > 0 && (tries % this.numRetries) == 0) {
+ // Check filesystem every so often.
+ checkFileSystem();
}
if (this.stopRequested.get()) {
- LOG.info("Stop was requested, clearing the toDo " +
- "despite of the exception");
- toDo.clear();
- continue;
+ LOG.info("Stop requested, clearing toDo despite exception");
+ toDo.clear();
+ continue;
}
+ LOG.warn("Attempt=" + tries, e);
+ // No point retrying immediately; this is probably connection to
+ // master issue. Doing below will cause us to sleep.
+ lastMsg = System.currentTimeMillis();
}
}
// Do some housekeeping before going to sleep