You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ns...@apache.org on 2011/10/11 19:43:45 UTC

svn commit: r1181949 - /hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java

Author: nspiegelberg
Date: Tue Oct 11 17:43:44 2011
New Revision: 1181949

URL: http://svn.apache.org/viewvc?rev=1181949&view=rev
Log:
Improve recovery time of the HBase client when a region server dies.

Summary:
When a region server dies, the HBase client waits until the RPC times out before
learning that it needs to check META to find the new location of the region.
And it incurs this *timeout* cost for every region being served by the dead
region server.
This diff fixes this by clearing the entries in cache that have the dead region
server as their values.

Test Plan:
I wrote an interactive client program that inserts two rows in two different
regions served by the same region server. While this client is waiting for user
input after the insertions, I power off this region server from radium. After
the regions originally served by the dead server are re-distributed to other
region servers, I tell the client program to perform gets on the two rows.
I saw only one RPC timeout rather than two.

Reviewed By: kannan
Reviewers: kannan, liyintang
Commenters: liyintang
CC: liyintang, kannan, hbase@lists, itapai
Differential Revision: 305582
Task ID: 618176

Modified:
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java

Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java?rev=1181949&r1=1181948&r2=1181949&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java Tue Oct 11 17:43:44 2011
@@ -50,12 +50,16 @@ import org.apache.zookeeper.Watcher.Even
 
 import java.io.IOException;
 import java.lang.reflect.UndeclaredThrowableException;
+import java.net.ConnectException;
+import java.net.SocketTimeoutException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
 import java.util.Set;
 import java.util.TreeSet;
 import java.util.concurrent.Callable;
@@ -301,6 +305,13 @@ public class HConnectionManager {
       cachedRegionLocations =
         new HashMap<Integer, SoftValueSortedMap<byte [], HRegionLocation>>();
 
+    // The presence of a server in the map implies it's likely that there is an
+    // entry in cachedRegionLocations that map to this server; but the absence
+    // of a server in this map guarantees that there is no entry in cache that
+    // maps to the absent server.
+    private final Set<String> cachedServers =
+        new HashSet<String>();
+
     // region cache prefetch is enabled by default. this set contains all
     // tables whose region cache prefetch are disabled.
     private final Set<Integer> regionCachePrefetchDisabledTables =
@@ -977,6 +988,35 @@ public class HConnectionManager {
     }
 
     /*
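+     * Delete all cached entries, across every table, that map to a specific
+     * server. Does nothing if the server is not present in cachedServers.
+     *
+     * @param server string form of the server address whose entries are dropped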
+     */
+    private void clearCachedLocationForServer(
+        final String server) {
+      boolean deletedSomething = false;
+      synchronized (this.cachedRegionLocations) {
+        if (!cachedServers.contains(server)) {
+          return;
+        }
+        for (SoftValueSortedMap<byte[], HRegionLocation> tableLocations :
+            cachedRegionLocations.values()) {
+          for (Entry<byte[], HRegionLocation> e : tableLocations.entrySet()) {
+            if (e.getValue().getServerAddress().toString().equals(server)) {
+              tableLocations.remove(e.getKey());
+              deletedSomething = true;
+            }
+          }
+        }
+        cachedServers.remove(server);
+      }
+      if (deletedSomething && LOG.isDebugEnabled()) {
+        LOG.debug("Removed all cached region locations that map to " + server);
+      }
+    }
+
+    /*
      * @param tableName
      * @return Map of cached locations for passed <code>tableName</code>
      */
@@ -1001,7 +1041,10 @@ public class HConnectionManager {
      * Allows flushing the region cache.
      */
     public void clearRegionCache() {
-     cachedRegionLocations.clear();
+      synchronized (this.cachedRegionLocations) {
+        cachedRegionLocations.clear();
+        cachedServers.clear();
+      }
     }
 
     /*
@@ -1012,10 +1055,15 @@ public class HConnectionManager {
       byte [] startKey = location.getRegionInfo().getStartKey();
       SoftValueSortedMap<byte [], HRegionLocation> tableLocations =
         getTableLocations(tableName);
-      if (tableLocations.put(startKey, location) == null) {
+      boolean hasNewCache = false;
+      synchronized (this.cachedRegionLocations) {
+        cachedServers.add(location.getServerAddress().toString());
+        hasNewCache = (tableLocations.put(startKey, location) == null);
+      }
+      if (hasNewCache) {
         LOG.debug("Cached location for " +
             location.getRegionInfo().getRegionNameAsString() +
-            " is " + location.getServerAddress());
+            " is " + location.getServerAddress().toString());
       }
     }
 
@@ -1162,6 +1210,17 @@ public class HConnectionManager {
           return callable.call();
         } catch (Throwable t) {
           t = translateException(t);
+          if (t instanceof SocketTimeoutException ||
+              t instanceof ConnectException ||
+              t instanceof RetriesExhaustedException) {
+            // if thrown these exceptions, we clear all the cache entries that
+            // map to that slow/dead server; otherwise, let cache miss and ask
+            // .META. again to find the new location
+            HRegionLocation hrl = callable.location;
+            if (hrl != null) {
+              clearCachedLocationForServer(hrl.getServerAddress().toString());
+            }
+          }
           exceptions.add(t);
           if (tries == numRetries - 1) {
             throw new RetriesExhaustedException(callable.getServerName(),