You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2021/03/07 07:21:44 UTC
[kudu] 01/02: [java] KUDU-3213: try at different server on
TABLET_NOT_RUNNING
This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
commit c167c1dc39d7089c4b1216bc62e423f3e2638479
Author: Andrew Wong <aw...@cloudera.com>
AuthorDate: Wed Feb 24 15:59:46 2021 -0800
[java] KUDU-3213: try at different server on TABLET_NOT_RUNNING
Prior to this patch, if a tablet server were quiescing for a prolonged
period, scan requests could time out, complaining that the tablet server
is quiescing, but without ever retrying the scan at another tablet
server. This is because tablet servers will return TABLET_NOT_RUNNING to
clients when attempting a scan while quiescing. The behavior in the C++
client is that the location is then blacklisted and the request is
retried elsewhere. The behavior in the Java client, though, is that the
same location is retried until failure.
This patch addresses this by treating TABLET_NOT_RUNNING errors in the
Java client as we would for TABLET_NOT_FOUND, which is actually quite
similar to the handling for TABLET_NOT_RUNNING in the C++ client: the
location is invalidated for further attempts, and the request is retried
elsewhere.
Why not just have quiescing tablet servers return TABLET_NOT_FOUND,
then? TABLET_NOT_FOUND errors in the C++ client actually have some
behavior not present in the Java client: a tablet whose location is
invalidated with TABLET_NOT_FOUND in the C++ client will be required to
be looked up again, requiring a round trip to the master. This behavior
doesn't exist in the Java client, so I thought it easiest to piggyback
on TABLET_NOT_FOUND handling for now.
Change-Id: I38ac84a52676ff361fa1ba996665b338d1bbfba1
Reviewed-on: http://gerrit.cloudera.org:8080/17124
Tested-by: Kudu Jenkins
Reviewed-by: Alexey Serbin <as...@cloudera.com>
---
.../main/java/org/apache/kudu/client/RpcProxy.java | 9 ++--
.../org/apache/kudu/client/TestKuduScanner.java | 52 ++++++++++++++++++++++
.../java/org/apache/kudu/test/KuduTestHarness.java | 2 +-
3 files changed, 59 insertions(+), 4 deletions(-)
diff --git a/java/kudu-client/src/main/java/org/apache/kudu/client/RpcProxy.java b/java/kudu-client/src/main/java/org/apache/kudu/client/RpcProxy.java
index cf564b9..28784ab 100644
--- a/java/kudu-client/src/main/java/org/apache/kudu/client/RpcProxy.java
+++ b/java/kudu-client/src/main/java/org/apache/kudu/client/RpcProxy.java
@@ -357,12 +357,15 @@ class RpcProxy {
Tserver.TabletServerErrorPB.Code errCode = error.getCode();
WireProtocol.AppStatusPB.ErrorCode errStatusCode = error.getStatus().getCode();
Status status = Status.fromTabletServerErrorPB(error);
- if (errCode == Tserver.TabletServerErrorPB.Code.TABLET_NOT_FOUND) {
+ if (errCode == Tserver.TabletServerErrorPB.Code.TABLET_NOT_FOUND ||
+ errCode == Tserver.TabletServerErrorPB.Code.TABLET_NOT_RUNNING) {
+ // TODO(awong): for TABLET_NOT_FOUND, we may want to force a location
+ // lookup for the tablet. For now, this just invalidates the location
+ // and tries somewhere else.
client.handleTabletNotFound(
rpc, new RecoverableException(status), connection.getServerInfo());
// we're not calling rpc.callback() so we rely on the client to retry that RPC
- } else if (errCode == Tserver.TabletServerErrorPB.Code.TABLET_NOT_RUNNING ||
- errStatusCode == WireProtocol.AppStatusPB.ErrorCode.SERVICE_UNAVAILABLE) {
+ } else if (errStatusCode == WireProtocol.AppStatusPB.ErrorCode.SERVICE_UNAVAILABLE) {
client.handleRetryableError(rpc, new RecoverableException(status));
// The following two error codes are an indication that the tablet isn't a leader.
} else if (errStatusCode == WireProtocol.AppStatusPB.ErrorCode.ILLEGAL_STATE ||
diff --git a/java/kudu-client/src/test/java/org/apache/kudu/client/TestKuduScanner.java b/java/kudu-client/src/test/java/org/apache/kudu/client/TestKuduScanner.java
index c0fbac3..54e7d65 100644
--- a/java/kudu-client/src/test/java/org/apache/kudu/client/TestKuduScanner.java
+++ b/java/kudu-client/src/test/java/org/apache/kudu/client/TestKuduScanner.java
@@ -38,6 +38,7 @@ import java.util.Map;
import java.util.Random;
import java.util.Set;
+import com.google.common.collect.Lists;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
@@ -52,6 +53,7 @@ import org.apache.kudu.client.Operation.ChangeType;
import org.apache.kudu.test.CapturingLogAppender;
import org.apache.kudu.test.KuduTestHarness;
import org.apache.kudu.test.RandomUtils;
+import org.apache.kudu.test.cluster.KuduBinaryLocator;
import org.apache.kudu.util.DataGenerator;
import org.apache.kudu.util.Pair;
@@ -79,6 +81,56 @@ public class TestKuduScanner {
.build();
}
+ /**
+ * Test that scans get retried at other tablet servers when they're quiescing.
+ */
+ @Test(timeout = 100000)
+ public void testScanQuiescingTabletServer() throws Exception {
+ int rowCount = 500;
+ Schema tableSchema = new Schema(Collections.singletonList(
+ new ColumnSchema.ColumnSchemaBuilder("key", Type.INT32).key(true).build()
+ ));
+
+ // Create a table with some rows in it. For simplicity, use a
+ // single-partition table with replicas on each server (we're required
+ // to set some partitioning though).
+ CreateTableOptions tableOptions = new CreateTableOptions()
+ .setRangePartitionColumns(Collections.singletonList("key"))
+ .setNumReplicas(3);
+ KuduTable table = client.createTable(tableName, tableSchema, tableOptions);
+ KuduSession session = client.newSession();
+ for (int i = 0; i < rowCount; i++) {
+ Insert insert = table.newInsert();
+ PartialRow row = insert.getRow();
+ row.addInt(0, i);
+ session.apply(insert);
+ }
+
+ // Quiesce a single tablet server.
+ List<HostAndPort> tservers = harness.getTabletServers();
+ KuduBinaryLocator.ExecutableInfo exeInfo = KuduBinaryLocator.findBinary("kudu");
+ List<String> commandLine = Lists.newArrayList(exeInfo.exePath(),
+ "tserver",
+ "quiesce",
+ "start",
+ tservers.get(0).toString());
+ ProcessBuilder processBuilder = new ProcessBuilder(commandLine);
+ processBuilder.environment().putAll(exeInfo.environment());
+ Process quiesceTserver = processBuilder.start();
+ assertEquals(0, quiesceTserver.waitFor());
+
+ // Now start a scan. Even if the scan goes to the quiescing server, the
+ // scan request should eventually be routed to a non-quiescing server
+ // and complete. We aren't guaranteed to hit the quiescing server, but this
+ // test would frequently fail if we didn't handle quiescing servers properly.
+ KuduScanner scanner = client.newScannerBuilder(table).build();
+ KuduScannerIterator iterator = scanner.iterator();
+ assertTrue(iterator.hasNext());
+ while (iterator.hasNext()) {
+ iterator.next();
+ }
+ }
+
@Test(timeout = 100000)
public void testIterable() throws Exception {
KuduTable table = client.createTable(tableName, getBasicSchema(), getBasicCreateTableOptions());
diff --git a/java/kudu-test-utils/src/main/java/org/apache/kudu/test/KuduTestHarness.java b/java/kudu-test-utils/src/main/java/org/apache/kudu/test/KuduTestHarness.java
index 2b65844..805704f 100644
--- a/java/kudu-test-utils/src/main/java/org/apache/kudu/test/KuduTestHarness.java
+++ b/java/kudu-test-utils/src/main/java/org/apache/kudu/test/KuduTestHarness.java
@@ -334,7 +334,7 @@ public class KuduTestHarness extends ExternalResource {
* @return the list of tablet servers
*/
public List<HostAndPort> getTabletServers() {
- return miniCluster.getMasterServers();
+ return miniCluster.getTabletServers();
}
/**