You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2010/05/01 02:16:30 UTC

svn commit: r939885 - in /hadoop/hbase/branches/0.20: ./ src/java/org/apache/hadoop/hbase/client/ src/java/org/apache/hadoop/hbase/regionserver/ src/test/org/apache/hadoop/hbase/

Author: stack
Date: Sat May  1 00:16:29 2010
New Revision: 939885

URL: http://svn.apache.org/viewvc?rev=939885&view=rev
Log:
HBASE-2421 Put hangs for 10 retries on failed region servers

Modified:
    hadoop/hbase/branches/0.20/CHANGES.txt
    hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnection.java
    hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java
    hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HTable.java
    hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
    hadoop/hbase/branches/0.20/src/test/org/apache/hadoop/hbase/TestMultiParallelPut.java

Modified: hadoop/hbase/branches/0.20/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/CHANGES.txt?rev=939885&r1=939884&r2=939885&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/CHANGES.txt (original)
+++ hadoop/hbase/branches/0.20/CHANGES.txt Sat May  1 00:16:29 2010
@@ -108,6 +108,8 @@ Release 0.20.4 - Unreleased
    HBASE-2497  ProcessServerShutdown throws NullPointerException for offline
                regions (Miklos Kurucz via Stack)
    HBASE-2499  Race condition when disabling a table leaves regions in transition
+   HBASE-2421  Put hangs for 10 retries on failed region servers
+               (Todd Lipcon via Stack)
 
   IMPROVEMENTS
    HBASE-2180  Bad read performance from synchronizing hfile.fddatainputstream

Modified: hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnection.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnection.java?rev=939885&r1=939884&r2=939885&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnection.java (original)
+++ hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnection.java Sat May  1 00:16:29 2010
@@ -192,7 +192,7 @@ public interface HConnection {
    * @throws IOException
    * @throws RuntimeException
    */
-  public <T> T getRegionServerForWithoutRetries(ServerCallable<T> callable) 
+  public <T> T getRegionServerWithoutRetries(ServerCallable<T> callable)
   throws IOException, RuntimeException;
   
     
@@ -219,5 +219,5 @@ public interface HConnection {
   public void processBatchOfPuts(List<Put> list,
                                  final byte[] tableName, ExecutorService pool) throws IOException;
 
-  
+
 }

Modified: hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java?rev=939885&r1=939884&r2=939885&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java (original)
+++ hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HConnectionManager.java Sat May  1 00:16:29 2010
@@ -681,7 +681,7 @@ public class HConnectionManager implemen
           // This block guards against two threads trying to load the meta
           // region at the same time. The first will load the meta region and
           // the second will use the value that the first one found.
-          synchronized(regionLockObject) {
+          synchronized (regionLockObject) {
             // Check the cache again for a hit in case some other thread made the
             // same query while we were waiting on the lock. If not supposed to
             // be using the cache, delete any existing cached location so it won't
@@ -1082,15 +1082,19 @@ public class HConnectionManager implemen
       return null;    
     }
     
-    public <T> T getRegionServerForWithoutRetries(ServerCallable<T> callable)
+    public <T> T getRegionServerWithoutRetries(ServerCallable<T> callable)
         throws IOException, RuntimeException {
       try {
         callable.instantiateServer(false);
         return callable.call();
       } catch (Throwable t) {
-        t = translateException(t);
+        Throwable t2 = translateException(t);
+        if (t2 instanceof IOException) {
+          throw (IOException)t2;
+        } else {
+          throw new RuntimeException(t2);
+        }
       }
-      return null;
     }
 
     private HRegionLocation
@@ -1305,8 +1309,28 @@ public class HConnectionManager implemen
       }
     }
 
+    /**
+     * Process a batch of Puts on the given executor service.
+     *
+     * @param list the puts to make - successful puts will be removed.
+     * @param pool thread pool to execute requests on
+     *
+     * In the case of an exception, we take different actions depending on the
+     * situation:
+     *  - If the exception is a DoNotRetryException, we rethrow it and leave the
+     *    'list' parameter in an indeterminate state.
+     *  - If the 'list' parameter is a singleton, we directly throw the specific
+     *    exception for that put.
+     *  - Otherwise, we throw a generic exception indicating that an error occurred.
+     *    The 'list' parameter is mutated to contain those puts that did not succeed.
+     */
     public void processBatchOfPuts(List<Put> list,
-                                   final byte[] tableName, ExecutorService pool) throws IOException {
+                                   final byte[] tableName,
+                                   ExecutorService pool) throws IOException {
+      boolean singletonList = list.size() == 1;
+      Throwable singleRowCause = null;
+      List<Put> permFails = new ArrayList<Put>();
+
       for ( int tries = 0 ; tries < numRetries && !list.isEmpty(); ++tries) {
         Collections.sort(list);
         Map<HServerAddress, MultiPut> regionPuts =
@@ -1372,10 +1396,20 @@ public class HConnectionManager implemen
             LOG.debug("Failed all from " + request.address, e);
             failed.addAll(request.allPuts());
           } catch (ExecutionException e) {
-            System.out.println(e);
             // all go into the failed list.
             LOG.debug("Failed all from " + request.address, e);
             failed.addAll(request.allPuts());
+
+            // Just give up, leaving the batch put list in an untouched/semi-committed state
+            if (e.getCause() instanceof DoNotRetryIOException) {
+              throw (DoNotRetryIOException) e.getCause();
+            }
+
+            if (singletonList) {
+              // be richer for reporting in a 1 row case.
+              singleRowCause = e.getCause();
+            }
+
           }
         }
         list.clear();
@@ -1391,15 +1425,20 @@ public class HConnectionManager implemen
               " ms!");
           try {
             Thread.sleep(sleepTime);
-          } catch (InterruptedException e) {
-
+          } catch (InterruptedException ignored) {
           }
         }
       }
+
       if (!list.isEmpty()) {
+        if (singletonList && singleRowCause != null) {
+          throw new IOException(singleRowCause);
+        }
+
+
         // ran out of retries and didnt succeed everything!
         throw new RetriesExhaustedException("Still had " + list.size() + " puts left after retrying " +
-            numRetries + " times. Should have detail on which Regions failed the most");
+            numRetries + " times.");
       }
     }
 
@@ -1410,7 +1449,7 @@ public class HConnectionManager implemen
       final HConnection connection = this;
       return new Callable<MultiPutResponse>() {
         public MultiPutResponse call() throws IOException {
-          return getRegionServerWithRetries(
+          return getRegionServerWithoutRetries(
               new ServerCallable<MultiPutResponse>(connection, tableName, null) {
                 public MultiPutResponse call() throws IOException {
                   MultiPutResponse resp = server.multiPut(puts);

Modified: hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HTable.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HTable.java?rev=939885&r1=939884&r2=939885&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HTable.java (original)
+++ hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/client/HTable.java Sat May  1 00:16:29 2010
@@ -702,10 +702,10 @@ public class HTable {
       connection.processBatchOfPuts(writeBuffer,
           tableName, pool);
     } finally {
-      // the write buffer was adjsuted by processBatchOfPuts
+      // the write buffer was adjusted by processBatchOfPuts
       currentWriteBufferSize = 0;
-      for (int i = 0; i < writeBuffer.size(); i++) {
-        currentWriteBufferSize += writeBuffer.get(i).heapSize();
+      for (Put aPut : writeBuffer) {
+        currentWriteBufferSize += aPut.heapSize();
       }
     }
   }

Modified: hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=939885&r1=939884&r2=939885&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original)
+++ hadoop/hbase/branches/0.20/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Sat May  1 00:16:29 2010
@@ -234,7 +234,7 @@ public class HRegionServer implements HC
 
   // Run HDFS shutdown thread on exit if this is set. We clear this out when
   // doing a restart() to prevent closing of HDFS.
-  private final AtomicBoolean shutdownHDFS = new AtomicBoolean(true);
+  public final AtomicBoolean shutdownHDFS = new AtomicBoolean(true);
 
   private final String machineName;
 

Modified: hadoop/hbase/branches/0.20/src/test/org/apache/hadoop/hbase/TestMultiParallelPut.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/branches/0.20/src/test/org/apache/hadoop/hbase/TestMultiParallelPut.java?rev=939885&r1=939884&r2=939885&view=diff
==============================================================================
--- hadoop/hbase/branches/0.20/src/test/org/apache/hadoop/hbase/TestMultiParallelPut.java (original)
+++ hadoop/hbase/branches/0.20/src/test/org/apache/hadoop/hbase/TestMultiParallelPut.java Sat May  1 00:16:29 2010
@@ -20,15 +20,15 @@
 
 package org.apache.hadoop.hbase;
 
+import org.apache.hadoop.hbase.client.Get;
+import org.apache.hadoop.hbase.client.HBaseAdmin;
 import org.apache.hadoop.hbase.client.HTable;
 import org.apache.hadoop.hbase.client.Put;
-import org.apache.hadoop.hbase.client.Get;
 import org.apache.hadoop.hbase.client.Result;
-import org.apache.hadoop.hbase.client.HBaseAdmin;
 import org.apache.hadoop.hbase.util.Bytes;
 
-import java.util.List;
 import java.util.ArrayList;
+import java.util.List;
 
 public class TestMultiParallelPut extends MultiRegionTable {
   private static final byte[] VALUE = Bytes.toBytes("value");
@@ -58,7 +58,14 @@ public class TestMultiParallelPut extend
 
   List<byte[]> keys = new ArrayList<byte[]>();
 
-  public void testMultiPut() throws Exception {
+  public void testParallelPut() throws Exception {
+    doATest(false);
+  }
+  public void testParallelPutWithRSAbort() throws Exception {
+    doATest(true);
+  }
+
+  public void doATest(boolean doAbort) throws Exception {
 
     HTable table = new HTable(TEST_TABLE);
     table.setAutoFlush(false);
@@ -73,6 +80,19 @@ public class TestMultiParallelPut extend
 
     table.flushCommits();
 
+    if (doAbort) {
+      cluster.abortRegionServer(0);
+
+      // try putting more keys after the abort.
+      for ( byte [] k : keys ) {
+        Put put = new Put(k);
+        put.add(BYTES_FAMILY, QUALIFIER, VALUE);
+
+        table.put(put);
+      }
+      table.flushCommits();
+    }
+
     for (byte [] k : keys ) {
       Get get = new Get(k);
       get.addColumn(BYTES_FAMILY, QUALIFIER);
@@ -88,10 +108,15 @@ public class TestMultiParallelPut extend
     HBaseAdmin admin = new HBaseAdmin(conf);
     ClusterStatus cs = admin.getClusterStatus();
 
-    assertEquals(2, cs.getServers());
+    int expectedServerCount = 2;
+    if (doAbort)
+      expectedServerCount = 1;
+
+    assertEquals(expectedServerCount, cs.getServers());
     for ( HServerInfo info : cs.getServerInfo()) {
       System.out.println(info);
       assertTrue( info.getLoad().getNumberOfRegions() > 10);
     }
   }
+
 }