You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2010/05/01 02:16:30 UTC
svn commit: r939885 - in /hadoop/hbase/branches/0.20: ./
src/java/org/apache/hadoop/hbase/client/
src/java/org/apache/hadoop/hbase/regionserver/
src/test/org/apache/hadoop/hbase/
Author: stack
Date: Sat May 1 00:16:29 2010
New Revision: 939885
URL: http://svn.apache.org/viewvc?rev=939885&view=rev
Log:
HBASE-2421 Put hangs for 10 retries on failed region servers
Modified:
hadoop/hbase/branches/0.20/CHANGES.txt
hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnection.java
hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java
hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HTable.java
hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
hadoop/hbase/branches/0.20/src/test/org/apache/hadoop/hbase/TestMultiParallelPut.java
Modified: hadoop/hbase/branches/0.20/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/CHANGES.txt?rev=939885&r1=939884&r2=939885&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/CHANGES.txt (original)
+++ hadoop/hbase/branches/0.20/CHANGES.txt Sat May 1 00:16:29 2010
@@ -108,6 +108,8 @@ Release 0.20.4 - Unreleased
HBASE-2497 ProcessServerShutdown throws NullPointerException for offline
regions (Miklos Kurucz via Stack)
HBASE-2499 Race condition when disabling a table leaves regions in transition
+ HBASE-2421 Put hangs for 10 retries on failed region servers
+ (Todd Lipcon via Stack)
IMPROVEMENTS
HBASE-2180 Bad read performance from synchronizing hfile.fddatainputstream
Modified: hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnection.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnection.java?rev=939885&r1=939884&r2=939885&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnection.java (original)
+++ hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnection.java Sat May 1 00:16:29 2010
@@ -192,7 +192,7 @@ public interface HConnection {
* @throws IOException
* @throws RuntimeException
*/
- public <T> T getRegionServerForWithoutRetries(ServerCallable<T> callable)
+ public <T> T getRegionServerWithoutRetries(ServerCallable<T> callable)
throws IOException, RuntimeException;
@@ -219,5 +219,5 @@ public interface HConnection {
public void processBatchOfPuts(List<Put> list,
final byte[] tableName, ExecutorService pool) throws IOException;
-
+
}
Modified: hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java?rev=939885&r1=939884&r2=939885&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java (original)
+++ hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java Sat May 1 00:16:29 2010
@@ -681,7 +681,7 @@ public class HConnectionManager implemen
// This block guards against two threads trying to load the meta
// region at the same time. The first will load the meta region and
// the second will use the value that the first one found.
- synchronized(regionLockObject) {
+ synchronized (regionLockObject) {
// Check the cache again for a hit in case some other thread made the
// same query while we were waiting on the lock. If not supposed to
// be using the cache, delete any existing cached location so it won't
@@ -1082,15 +1082,19 @@ public class HConnectionManager implemen
return null;
}
- public <T> T getRegionServerForWithoutRetries(ServerCallable<T> callable)
+ public <T> T getRegionServerWithoutRetries(ServerCallable<T> callable)
throws IOException, RuntimeException {
try {
callable.instantiateServer(false);
return callable.call();
} catch (Throwable t) {
- t = translateException(t);
+ Throwable t2 = translateException(t);
+ if (t2 instanceof IOException) {
+ throw (IOException)t2;
+ } else {
+ throw new RuntimeException(t2);
+ }
}
- return null;
}
private HRegionLocation
@@ -1305,8 +1309,28 @@ public class HConnectionManager implemen
}
}
+ /**
+ * Process a batch of Puts on the given executor service.
+ *
+ * @param list the puts to make - successful puts will be removed.
+ * @param pool thread pool to execute requests on
+ *
+ * In the case of an exception, we take different actions depending on the
+ * situation:
+ * - If the exception is a DoNotRetryException, we rethrow it and leave the
+ * 'list' parameter in an indeterminate state.
+ * - If the 'list' parameter is a singleton, we directly throw the specific
+ * exception for that put.
+ * - Otherwise, we throw a generic exception indicating that an error occurred.
+ * The 'list' parameter is mutated to contain those puts that did not succeed.
+ */
public void processBatchOfPuts(List<Put> list,
- final byte[] tableName, ExecutorService pool) throws IOException {
+ final byte[] tableName,
+ ExecutorService pool) throws IOException {
+ boolean singletonList = list.size() == 1;
+ Throwable singleRowCause = null;
+ List<Put> permFails = new ArrayList<Put>();
+
for ( int tries = 0 ; tries < numRetries && !list.isEmpty(); ++tries) {
Collections.sort(list);
Map<HServerAddress, MultiPut> regionPuts =
@@ -1372,10 +1396,20 @@ public class HConnectionManager implemen
LOG.debug("Failed all from " + request.address, e);
failed.addAll(request.allPuts());
} catch (ExecutionException e) {
- System.out.println(e);
// all go into the failed list.
LOG.debug("Failed all from " + request.address, e);
failed.addAll(request.allPuts());
+
+ // Just give up, leaving the batch put list in an untouched/semi-committed state
+ if (e.getCause() instanceof DoNotRetryIOException) {
+ throw (DoNotRetryIOException) e.getCause();
+ }
+
+ if (singletonList) {
+ // be richer for reporting in a 1 row case.
+ singleRowCause = e.getCause();
+ }
+
}
}
list.clear();
@@ -1391,15 +1425,20 @@ public class HConnectionManager implemen
" ms!");
try {
Thread.sleep(sleepTime);
- } catch (InterruptedException e) {
-
+ } catch (InterruptedException ignored) {
}
}
}
+
if (!list.isEmpty()) {
+ if (singletonList && singleRowCause != null) {
+ throw new IOException(singleRowCause);
+ }
+
+
// ran out of retries and didnt succeed everything!
throw new RetriesExhaustedException("Still had " + list.size() + " puts left after retrying " +
- numRetries + " times. Should have detail on which Regions failed the most");
+ numRetries + " times.");
}
}
@@ -1410,7 +1449,7 @@ public class HConnectionManager implemen
final HConnection connection = this;
return new Callable<MultiPutResponse>() {
public MultiPutResponse call() throws IOException {
- return getRegionServerWithRetries(
+ return getRegionServerWithoutRetries(
new ServerCallable<MultiPutResponse>(connection, tableName, null) {
public MultiPutResponse call() throws IOException {
MultiPutResponse resp = server.multiPut(puts);
Modified: hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HTable.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HTable.java?rev=939885&r1=939884&r2=939885&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HTable.java (original)
+++ hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HTable.java Sat May 1 00:16:29 2010
@@ -702,10 +702,10 @@ public class HTable {
connection.processBatchOfPuts(writeBuffer,
tableName, pool);
} finally {
- // the write buffer was adjsuted by processBatchOfPuts
+ // the write buffer was adjusted by processBatchOfPuts
currentWriteBufferSize = 0;
- for (int i = 0; i < writeBuffer.size(); i++) {
- currentWriteBufferSize += writeBuffer.get(i).heapSize();
+ for (Put aPut : writeBuffer) {
+ currentWriteBufferSize += aPut.heapSize();
}
}
}
Modified: hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=939885&r1=939884&r2=939885&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original)
+++ hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Sat May 1 00:16:29 2010
@@ -234,7 +234,7 @@ public class HRegionServer implements HC
// Run HDFS shutdown thread on exit if this is set. We clear this out when
// doing a restart() to prevent closing of HDFS.
- private final AtomicBoolean shutdownHDFS = new AtomicBoolean(true);
+ public final AtomicBoolean shutdownHDFS = new AtomicBoolean(true);
private final String machineName;
Modified: hadoop/hbase/branches/0.20/src/test/org/apache/hadoop/hbase/TestMultiParallelPut.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/test/org/apache/hadoop/hbase/TestMultiParallelPut.java?rev=939885&r1=939884&r2=939885&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/src/test/org/apache/hadoop/hbase/TestMultiParallelPut.java (original)
+++ hadoop/hbase/branches/0.20/src/test/org/apache/hadoop/hbase/TestMultiParallelPut.java Sat May 1 00:16:29 2010
@@ -20,15 +20,15 @@
package org.apache.hadoop.hbase;
+import org.apache.hadoop.hbase.client.Get;
+import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
-import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
-import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.util.Bytes;
-import java.util.List;
import java.util.ArrayList;
+import java.util.List;
public class TestMultiParallelPut extends MultiRegionTable {
private static final byte[] VALUE = Bytes.toBytes("value");
@@ -58,7 +58,14 @@ public class TestMultiParallelPut extend
List<byte[]> keys = new ArrayList<byte[]>();
- public void testMultiPut() throws Exception {
+ public void testParallelPut() throws Exception {
+ doATest(false);
+ }
+ public void testParallelPutWithRSAbort() throws Exception {
+ doATest(true);
+ }
+
+ public void doATest(boolean doAbort) throws Exception {
HTable table = new HTable(TEST_TABLE);
table.setAutoFlush(false);
@@ -73,6 +80,19 @@ public class TestMultiParallelPut extend
table.flushCommits();
+ if (doAbort) {
+ cluster.abortRegionServer(0);
+
+ // try putting more keys after the abort.
+ for ( byte [] k : keys ) {
+ Put put = new Put(k);
+ put.add(BYTES_FAMILY, QUALIFIER, VALUE);
+
+ table.put(put);
+ }
+ table.flushCommits();
+ }
+
for (byte [] k : keys ) {
Get get = new Get(k);
get.addColumn(BYTES_FAMILY, QUALIFIER);
@@ -88,10 +108,15 @@ public class TestMultiParallelPut extend
HBaseAdmin admin = new HBaseAdmin(conf);
ClusterStatus cs = admin.getClusterStatus();
- assertEquals(2, cs.getServers());
+ int expectedServerCount = 2;
+ if (doAbort)
+ expectedServerCount = 1;
+
+ assertEquals(expectedServerCount, cs.getServers());
for ( HServerInfo info : cs.getServerInfo()) {
System.out.println(info);
assertTrue( info.getLoad().getNumberOfRegions() > 10);
}
}
+
}