You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2016/09/26 15:59:40 UTC

[6/6] kudu git commit: [java client] Reinstate KUDU-1364's behavior, fix NPE

[java client] Reinstate KUDU-1364's behavior, fix NPE

When d5082d8 tried to fix the client2tablets leak, it also undid the work
from KUDU-1364, while also adding new problems.

This patch brings back the caching of replica locations even when getting
TS disconnections by not purging the RemoteTablet caches on disconnection.
Instead, it is now done by the retried RPCs themselves after TabletClient
detects an uncaughtException, similarly to how it was calling
demoteAsLeaderForAllTablets before.

The NPE is fixed with a null check, it's an unfortunate race. I spent some
time trying to come up with a simple test but failed. ITClient has found
the issue before so we know we have _some_ coverage.

Change-Id: I8e0ed23fbf4c655037b77173a187c3fa11de4f63
Reviewed-on: http://gerrit.cloudera.org:8080/4501
Tested-by: Kudu Jenkins
Reviewed-by: David Ribeiro Alves <dr...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/e14bb60c
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/e14bb60c
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/e14bb60c

Branch: refs/heads/master
Commit: e14bb60ccd8290b0162bc701837f4d137625d912
Parents: 92f7c19
Author: Jean-Daniel Cryans <jd...@apache.org>
Authored: Wed Sep 21 17:20:37 2016 -0700
Committer: Jean-Daniel Cryans <jd...@apache.org>
Committed: Mon Sep 26 00:05:10 2016 +0000

----------------------------------------------------------------------
 .../org/apache/kudu/client/AsyncKuduClient.java | 45 ++++----------------
 .../org/apache/kudu/client/TabletClient.java    | 18 ++++++--
 2 files changed, 23 insertions(+), 40 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/e14bb60c/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java
----------------------------------------------------------------------
diff --git a/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java b/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java
index c478714..7bbfdb9 100644
--- a/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java
+++ b/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java
@@ -1775,12 +1775,11 @@ public class AsyncKuduClient implements AutoCloseable {
     }
 
     TabletClient old;
-    ArrayList<RemoteTablet> tablets = null;
     synchronized (ip2client) {
-      if ((old = ip2client.remove(hostport)) != null) {
-        tablets = client2tablets.remove(client);
-      }
+      old = ip2client.remove(hostport);
+      client2tablets.remove(client);
     }
+
     LOG.debug("Removed from IP cache: {" + hostport + "} -> {" + client + "}");
     if (old == null) {
       // Currently we're seeing this message when masters are disconnected and the hostport we got
@@ -1790,37 +1789,6 @@ public class AsyncKuduClient implements AutoCloseable {
       LOG.trace("When expiring " + client + " from the client cache (host:port="
           + hostport + "), it was found that there was no entry"
           + " corresponding to " + remote + ".  This shouldn't happen.");
-    } else {
-      if (tablets != null) {
-        // Make a copy so we don't need to synchronize on it while iterating.
-        RemoteTablet[] tablets_copy;
-        synchronized (tablets) {
-          tablets_copy = tablets.toArray(new RemoteTablet[tablets.size()]);
-        }
-        for (final RemoteTablet remoteTablet : tablets_copy) {
-          remoteTablet.removeTabletClient(client);
-        }
-      }
-    }
-  }
-
-  /**
-   * Call this method after encountering an error connecting to a tablet server so that we stop
-   * considering it a leader for the tablets it serves.
-   * @param client tablet server to use for demotion
-   */
-  void demoteAsLeaderForAllTablets(final TabletClient client) {
-    ArrayList<RemoteTablet> tablets = client2tablets.get(client);
-    if (tablets != null) {
-      // Make a copy so we don't need to synchronize on it while iterating.
-      RemoteTablet[] tablets_copy;
-      synchronized (tablets) {
-        tablets_copy = tablets.toArray(new RemoteTablet[tablets.size()]);
-      }
-      for (final RemoteTablet remoteTablet : tablets_copy) {
-        // It will be a no-op if it's not already a leader.
-        remoteTablet.demoteLeader(client);
-      }
     }
   }
 
@@ -2076,7 +2044,12 @@ public class AsyncKuduClient implements AutoCloseable {
       }
       TabletClient client = newClient(uuid, ip, port);
 
-      final ArrayList<RemoteTablet> tablets = client2tablets.get(client);
+      ArrayList<RemoteTablet> tablets = client2tablets.get(client);
+
+      if (tablets == null) {
+        // We lost a race, someone removed the client we received.
+        return;
+      }
 
       synchronized (tablets) {
         tabletServers.add(client);

http://git-wip-us.apache.org/repos/asf/kudu/blob/e14bb60c/java/kudu-client/src/main/java/org/apache/kudu/client/TabletClient.java
----------------------------------------------------------------------
diff --git a/java/kudu-client/src/main/java/org/apache/kudu/client/TabletClient.java b/java/kudu-client/src/main/java/org/apache/kudu/client/TabletClient.java
index 59ecf3b..d4d5d13 100644
--- a/java/kudu-client/src/main/java/org/apache/kudu/client/TabletClient.java
+++ b/java/kudu-client/src/main/java/org/apache/kudu/client/TabletClient.java
@@ -145,6 +145,10 @@ public class TabletClient extends ReplayingDecoder<VoidEnum> {
 
   private final RequestTracker requestTracker;
 
+  // If an uncaught exception forced us to shutdown this TabletClient, we'll handle the retry
+  // differently by also clearing the caches.
+  private volatile boolean gotUncaughtException = false;
+
   public TabletClient(AsyncKuduClient client, String uuid, String host, int port) {
     this.kuduClient = client;
     this.uuid = uuid;
@@ -752,7 +756,13 @@ public class TabletClient extends ReplayingDecoder<VoidEnum> {
     if (tablet == null) {  // Can't retry, dunno where this RPC should go.
       rpc.errback(exception);
     } else {
-      kuduClient.handleRetryableError(rpc, exception);
+      if (gotUncaughtException) {
+        // This will remove this TabletClient from this RPC's cache since there's something wrong
+        // about it.
+        kuduClient.handleTabletNotFound(rpc, exception, this);
+      } else {
+        kuduClient.handleRetryableError(rpc, exception);
+      }
     }
   }
 
@@ -770,9 +780,9 @@ public class TabletClient extends ReplayingDecoder<VoidEnum> {
       LOG.debug(getPeerUuidLoggingString() + "Encountered a read timeout, will close the channel");
     } else {
       LOG.error(getPeerUuidLoggingString() + "Unexpected exception from downstream on " + c, e);
-      // For any other exception, likely a connection error, we clear the leader state
-      // for those tablets that this TS is the cached leader of.
-      kuduClient.demoteAsLeaderForAllTablets(this);
+      // For any other exception, likely a connection error, we'll clear the tablet caches for the
+      // RPCs we're going to retry.
+      gotUncaughtException = true;
     }
     if (c.isOpen()) {
       Channels.close(c);  // Will trigger channelClosed(), which will cleanup()