You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2021/03/07 07:21:44 UTC

[kudu] 01/02: [java] KUDU-3213: try at different server on TABLET_NOT_RUNNING

This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit c167c1dc39d7089c4b1216bc62e423f3e2638479
Author: Andrew Wong <aw...@cloudera.com>
AuthorDate: Wed Feb 24 15:59:46 2021 -0800

    [java] KUDU-3213: try at different server on TABLET_NOT_RUNNING
    
    Prior to this patch, if a tablet server were quiescing for a prolonged
    period, scan requests could time out, complaining that the tablet server
    is quiescing, but without ever retrying the scan at another tablet
    server. This is because tablet servers will return TABLET_NOT_RUNNING to
    clients when attempting a scan while quiescing. The behavior in the C++
    client is that the location is then blacklisted and the request is
    retried elsewhere. The behavior in the Java client, though, is that the
    same location is retried until failure.
    
    This patch addresses this by treating TABLET_NOT_RUNNING errors in the
    Java client as we would for TABLET_NOT_FOUND, which is actually quite
    similar to the handling for TABLET_NOT_RUNNING in the C++ client: the
    location is invalidated for further attempts, and the request is retried
    elsewhere.
    
    Why not just have quiescing tablet servers return TABLET_NOT_FOUND,
    then? TABLET_NOT_FOUND errors in the C++ client actually have some
    behavior not present in the Java client: a tablet whose location is
    invalidated with TABLET_NOT_FOUND in the C++ client will be required to
    be looked up again, requiring a round trip to the master. This behavior
    doesn't exist in the Java client, so I thought it easiest to piggyback
    on TABLET_NOT_FOUND handling for now.
    
    Change-Id: I38ac84a52676ff361fa1ba996665b338d1bbfba1
    Reviewed-on: http://gerrit.cloudera.org:8080/17124
    Tested-by: Kudu Jenkins
    Reviewed-by: Alexey Serbin <as...@cloudera.com>
---
 .../main/java/org/apache/kudu/client/RpcProxy.java |  9 ++--
 .../org/apache/kudu/client/TestKuduScanner.java    | 52 ++++++++++++++++++++++
 .../java/org/apache/kudu/test/KuduTestHarness.java |  2 +-
 3 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/java/kudu-client/src/main/java/org/apache/kudu/client/RpcProxy.java b/java/kudu-client/src/main/java/org/apache/kudu/client/RpcProxy.java
index cf564b9..28784ab 100644
--- a/java/kudu-client/src/main/java/org/apache/kudu/client/RpcProxy.java
+++ b/java/kudu-client/src/main/java/org/apache/kudu/client/RpcProxy.java
@@ -357,12 +357,15 @@ class RpcProxy {
     Tserver.TabletServerErrorPB.Code errCode = error.getCode();
     WireProtocol.AppStatusPB.ErrorCode errStatusCode = error.getStatus().getCode();
     Status status = Status.fromTabletServerErrorPB(error);
-    if (errCode == Tserver.TabletServerErrorPB.Code.TABLET_NOT_FOUND) {
+    if (errCode == Tserver.TabletServerErrorPB.Code.TABLET_NOT_FOUND ||
+        errCode == Tserver.TabletServerErrorPB.Code.TABLET_NOT_RUNNING) {
+      // TODO(awong): for TABLET_NOT_FOUND, we may want to force a location
+      // lookup for the tablet. For now, this just invalidates the location
+      // and tries somewhere else.
       client.handleTabletNotFound(
           rpc, new RecoverableException(status), connection.getServerInfo());
       // we're not calling rpc.callback() so we rely on the client to retry that RPC
-    } else if (errCode == Tserver.TabletServerErrorPB.Code.TABLET_NOT_RUNNING ||
-        errStatusCode == WireProtocol.AppStatusPB.ErrorCode.SERVICE_UNAVAILABLE) {
+    } else if (errStatusCode == WireProtocol.AppStatusPB.ErrorCode.SERVICE_UNAVAILABLE) {
       client.handleRetryableError(rpc, new RecoverableException(status));
       // The following two error codes are an indication that the tablet isn't a leader.
     } else if (errStatusCode == WireProtocol.AppStatusPB.ErrorCode.ILLEGAL_STATE ||
diff --git a/java/kudu-client/src/test/java/org/apache/kudu/client/TestKuduScanner.java b/java/kudu-client/src/test/java/org/apache/kudu/client/TestKuduScanner.java
index c0fbac3..54e7d65 100644
--- a/java/kudu-client/src/test/java/org/apache/kudu/client/TestKuduScanner.java
+++ b/java/kudu-client/src/test/java/org/apache/kudu/client/TestKuduScanner.java
@@ -38,6 +38,7 @@ import java.util.Map;
 import java.util.Random;
 import java.util.Set;
 
+import com.google.common.collect.Lists;
 import org.junit.Before;
 import org.junit.Rule;
 import org.junit.Test;
@@ -52,6 +53,7 @@ import org.apache.kudu.client.Operation.ChangeType;
 import org.apache.kudu.test.CapturingLogAppender;
 import org.apache.kudu.test.KuduTestHarness;
 import org.apache.kudu.test.RandomUtils;
+import org.apache.kudu.test.cluster.KuduBinaryLocator;
 import org.apache.kudu.util.DataGenerator;
 import org.apache.kudu.util.Pair;
 
@@ -79,6 +81,56 @@ public class TestKuduScanner {
         .build();
   }
 
+  /**
+   * Test that scans get retried at other tablet servers when they're quiescing.
+   */
+  @Test(timeout = 100000)
+  public void testScanQuiescingTabletServer() throws Exception {
+    int rowCount = 500;
+    Schema tableSchema = new Schema(Collections.singletonList(
+        new ColumnSchema.ColumnSchemaBuilder("key", Type.INT32).key(true).build()
+    ));
+
+    // Create a table with some rows in it. For simplicity, use a
+    // single-partition table with replicas on each server (we're required
+    // to set some partitioning though).
+    CreateTableOptions tableOptions = new CreateTableOptions()
+        .setRangePartitionColumns(Collections.singletonList("key"))
+        .setNumReplicas(3);
+    KuduTable table = client.createTable(tableName, tableSchema, tableOptions);
+    KuduSession session = client.newSession();
+    for (int i = 0; i < rowCount; i++) {
+      Insert insert = table.newInsert();
+      PartialRow row = insert.getRow();
+      row.addInt(0, i);
+      session.apply(insert);
+    }
+
+    // Quiesce a single tablet server.
+    List<HostAndPort> tservers = harness.getTabletServers();
+    KuduBinaryLocator.ExecutableInfo exeInfo = KuduBinaryLocator.findBinary("kudu");
+    List<String> commandLine = Lists.newArrayList(exeInfo.exePath(),
+        "tserver",
+        "quiesce",
+        "start",
+        tservers.get(0).toString());
+    ProcessBuilder processBuilder = new ProcessBuilder(commandLine);
+    processBuilder.environment().putAll(exeInfo.environment());
+    Process quiesceTserver = processBuilder.start();
+    assertEquals(0, quiesceTserver.waitFor());
+
+    // Now start a scan. Even if the scan goes to the quiescing server, the
+    // scan request should eventually be routed to a non-quiescing server
+    // and complete. We aren't guaranteed to hit the quiescing server, but this
+    // test would frequently fail if we didn't handle quiescing servers properly.
+    KuduScanner scanner = client.newScannerBuilder(table).build();
+    KuduScannerIterator iterator = scanner.iterator();
+    assertTrue(iterator.hasNext());
+    while (iterator.hasNext()) {
+      iterator.next();
+    }
+  }
+
   @Test(timeout = 100000)
   public void testIterable() throws Exception {
     KuduTable table = client.createTable(tableName, getBasicSchema(), getBasicCreateTableOptions());
diff --git a/java/kudu-test-utils/src/main/java/org/apache/kudu/test/KuduTestHarness.java b/java/kudu-test-utils/src/main/java/org/apache/kudu/test/KuduTestHarness.java
index 2b65844..805704f 100644
--- a/java/kudu-test-utils/src/main/java/org/apache/kudu/test/KuduTestHarness.java
+++ b/java/kudu-test-utils/src/main/java/org/apache/kudu/test/KuduTestHarness.java
@@ -334,7 +334,7 @@ public class KuduTestHarness extends ExternalResource {
    * @return the list of tablet servers
    */
   public List<HostAndPort> getTabletServers() {
-    return miniCluster.getMasterServers();
+    return miniCluster.getTabletServers();
   }
 
   /**