You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by aw...@apache.org on 2021/10/08 23:55:52 UTC
[kudu] branch master updated: [master] KUDU-1885: re-resolve
TSDescriptor proxies on network error
This is an automated email from the ASF dual-hosted git repository.
awong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 1584e43 [master] KUDU-1885: re-resolve TSDescriptor proxies on network error
1584e43 is described below
commit 1584e4325cfce37f89fd1dbebff5f24ed980780f
Author: Andrew Wong <aw...@cloudera.com>
AuthorDate: Wed Sep 15 19:50:34 2021 -0700
[master] KUDU-1885: re-resolve TSDescriptor proxies on network error
This patch uses the capabilities added in KUDU-75 to add the ability for
the master to re-resolve the addresses of tablet servers if its requests
fail.
Change-Id: I6245f75a232fd4827de684cfc04d6b6e53b7ddef
Reviewed-on: http://gerrit.cloudera.org:8080/17909
Reviewed-by: Alexey Serbin <as...@cloudera.com>
Tested-by: Andrew Wong <aw...@cloudera.com>
---
src/kudu/integration-tests/dns_alias-itest.cc | 82 +++++++++++++++++++++++++--
src/kudu/integration-tests/test_workload.cc | 2 +
src/kudu/integration-tests/test_workload.h | 5 ++
src/kudu/master/ts_descriptor.cc | 12 ++--
4 files changed, 92 insertions(+), 9 deletions(-)
diff --git a/src/kudu/integration-tests/dns_alias-itest.cc b/src/kudu/integration-tests/dns_alias-itest.cc
index a801005..58735bd 100644
--- a/src/kudu/integration-tests/dns_alias-itest.cc
+++ b/src/kudu/integration-tests/dns_alias-itest.cc
@@ -35,11 +35,15 @@
#include "kudu/integration-tests/test_workload.h"
#include "kudu/mini-cluster/external_mini_cluster.h"
#include "kudu/mini-cluster/mini_cluster.h"
+#include "kudu/rpc/rpc_controller.h"
+#include "kudu/tserver/tserver.pb.h"
+#include "kudu/tserver/tserver_service.proxy.h"
#include "kudu/util/metrics.h"
#include "kudu/util/monotime.h"
#include "kudu/util/net/net_util.h"
#include "kudu/util/net/sockaddr.h"
#include "kudu/util/net/socket.h"
+#include "kudu/util/pb_util.h"
#include "kudu/util/status.h"
#include "kudu/util/test_macros.h"
#include "kudu/util/test_util.h"
@@ -56,6 +60,7 @@ using kudu::client::KuduTabletServer;
using kudu::cluster::ExternalDaemon;
using kudu::cluster::ExternalMiniCluster;
using kudu::cluster::ExternalMiniClusterOptions;
+using kudu::pb_util::SecureShortDebugString;
using std::string;
using std::unique_ptr;
using std::vector;
@@ -76,8 +81,10 @@ class DnsAliasITest : public KuduTest {
SetUpCluster();
}
- void SetUpCluster(vector<string> extra_master_flags = {},
- vector<string> extra_tserver_flags = {}) {
+ // TODO(awong): more plumbing is needed to allow the server to be restarted
+ // bound to a different address with the webserver, so just disable it.
+ void SetUpCluster(vector<string> extra_master_flags = { "--webserver_enabled=false" },
+ vector<string> extra_tserver_flags = { "--webserver_enabled=false" }) {
ExternalMiniClusterOptions opts;
opts.num_masters = 3;
opts.num_tablet_servers = 3;
@@ -133,9 +140,6 @@ class DnsAliasITest : public KuduTest {
HostPort new_ip_hp(new_addr.host(), new_addr.port());
daemon->SetRpcBindAddress(new_ip_hp);
daemon->mutable_flags()->emplace_back("--rpc_reuseport=true");
- // TODO(awong): more plumbing is needed to allow the server to startup with
- // the webserver, so just disable it.
- daemon->mutable_flags()->emplace_back("--webserver_enabled=false");
daemon->mutable_flags()->emplace_back(
Substitute("--dns_addr_resolution_override=$0", new_overrides_str));
}
@@ -356,6 +360,74 @@ TEST_F(DnsAliasITest, TestMasterReresolveOnStartup) {
});
}
+// Regression test for KUDU-1885, wherein tserver proxies on the masters don't
+// eventually succeed when the tserver's address changes.
+TEST_F(DnsAliasITest, Kudu1885) {
+ // First, wait for all tablet servers to report to the masters.
+ ASSERT_EVENTUALLY([&] {
+ vector<KuduTabletServer*> tservers;
+ ElementDeleter deleter(&tservers);
+ ASSERT_OK(client_->ListTabletServers(&tservers));
+ ASSERT_EQ(cluster_->num_tablet_servers(), tservers.size());
+ });
+ auto* tserver = cluster_->tablet_server(cluster_->num_tablet_servers() - 1);
+ // Shut down a tablet server so we can start it up with a different address.
+ tserver->Shutdown();
+ unique_ptr<Socket> reserved_socket;
+ ASSERT_OK(cluster_->ReserveDaemonSocket(cluster::ExternalMiniCluster::DaemonType::TSERVER, 3,
+ kDefaultBindMode, &reserved_socket,
+ tserver->bound_rpc_hostport().port()));
+ Sockaddr new_addr;
+ ASSERT_OK(reserved_socket->GetSocketAddress(&new_addr));
+ auto new_overrides_str = GetNewOverridesFlag(ExternalMiniCluster::DaemonType::TSERVER, new_addr);
+ SetUpDaemonForNewAddr(new_addr, new_overrides_str, tserver);
+
+ // Create several tables. Based on Kudu's tablet placement algorithm, some
+ // should be assigned to the tserver with a new address. This will start some
+ // tasks on the master to send requests to tablet servers (some of which will
+ // fail because of the down server).
+ // NOTE: master's will wait up to --tserver_unresponsive_timeout_ms before
+ // stopping replica placement on the down server. By default, this is 60
+ // seconds, so we can proceed expecting placement on the down tserver.
+ for (int i = 0; i < 10; i++) {
+ TestWorkload w(cluster_.get());
+ w.set_table_name(Substitute("default.table_$0", i));
+ w.set_num_replicas(1);
+ // Some tablet creations will initially fail until we restart the down
+ // server, so have our client not wait for creation to finish.
+ w.set_wait_for_create(false);
+ w.Setup();
+ }
+ ASSERT_OK(tserver->Restart());
+
+ // Allow the rest of the cluster to start seeing the re-addressed server.
+ SetFlagsOnRemainingCluster(ExternalMiniCluster::DaemonType::TSERVER, new_overrides_str);
+ FLAGS_dns_addr_resolution_override = new_overrides_str;
+ ClusterVerifier v(cluster_.get());
+ ASSERT_EVENTUALLY([&] {
+ ASSERT_OK(v.RunKsck());
+ });
+
+ // Ensure there's no funny business with the tserver coming up at a new
+ // address -- we should have the same number of tablet servers.
+ vector<KuduTabletServer*> tservers;
+ ElementDeleter deleter(&tservers);
+ ASSERT_OK(client_->ListTabletServers(&tservers));
+ ASSERT_EQ(cluster_->num_tablet_servers(), tservers.size());
+
+ // Some tablets should be assigned to the tablet we re-addressed, and the
+ // create tablet requests from the masters should have been routed as
+ // appropriate.
+ auto tserver_proxy = cluster_->tserver_proxy(cluster_->num_tablet_servers() - 1);
+ tserver::ListTabletsRequestPB req;
+ req.set_need_schema_info(false);
+ tserver::ListTabletsResponsePB resp;
+ rpc::RpcController controller;
+ ASSERT_OK(tserver_proxy->ListTablets(req, &resp, &controller));
+ ASSERT_FALSE(resp.has_error()) << SecureShortDebugString(resp.error());
+ ASSERT_GT(resp.status_and_schema_size(), 0);
+}
+
#endif
} // namespace itest
diff --git a/src/kudu/integration-tests/test_workload.cc b/src/kudu/integration-tests/test_workload.cc
index 2cbc05a..1044ec6 100644
--- a/src/kudu/integration-tests/test_workload.cc
+++ b/src/kudu/integration-tests/test_workload.cc
@@ -81,6 +81,7 @@ TestWorkload::TestWorkload(MiniCluster* cluster,
begin_txn_(false),
commit_txn_(false),
rollback_txn_(false),
+ wait_for_create_(true),
fault_tolerant_(true),
verify_num_rows_(true),
read_errors_allowed_(false),
@@ -340,6 +341,7 @@ void TestWorkload::Setup() {
table_creator
.table_name(table_name_)
.schema(&schema_)
+ .wait(wait_for_create_)
.num_replicas(num_replicas_);
switch (partitioning_) {
diff --git a/src/kudu/integration-tests/test_workload.h b/src/kudu/integration-tests/test_workload.h
index 57ef175..ee72304 100644
--- a/src/kudu/integration-tests/test_workload.h
+++ b/src/kudu/integration-tests/test_workload.h
@@ -167,6 +167,10 @@ class TestWorkload {
write_timeout_millis_ = t;
}
+ void set_wait_for_create(bool wait) {
+ wait_for_create_ = wait;
+ }
+
// Set whether to fail if we see a TimedOut() error inserting a row.
// By default, this triggers a CHECK failure.
void set_timeout_allowed(bool allowed) {
@@ -350,6 +354,7 @@ class TestWorkload {
bool begin_txn_;
bool commit_txn_;
bool rollback_txn_;
+ bool wait_for_create_;
bool fault_tolerant_;
bool verify_num_rows_;
bool read_errors_allowed_;
diff --git a/src/kudu/master/ts_descriptor.cc b/src/kudu/master/ts_descriptor.cc
index 678392e..6499e68 100644
--- a/src/kudu/master/ts_descriptor.cc
+++ b/src/kudu/master/ts_descriptor.cc
@@ -278,15 +278,17 @@ Status TSDescriptor::GetTSAdminProxy(const shared_ptr<rpc::Messenger>& messenger
return Status::OK();
}
}
-
Sockaddr addr;
string host;
RETURN_NOT_OK(ResolveSockaddr(&addr, &host));
std::lock_guard<rw_spinlock> l(lock_);
if (!ts_admin_proxy_) {
+ HostPort hp;
+ RETURN_NOT_OK(hp.ParseString(host, addr.port()));
ts_admin_proxy_.reset(new tserver::TabletServerAdminServiceProxy(
- messenger, addr, std::move(host)));
+ messenger, hp, dns_resolver_));
+ ts_admin_proxy_->Init(addr);
}
*proxy = ts_admin_proxy_;
return Status::OK();
@@ -301,15 +303,17 @@ Status TSDescriptor::GetConsensusProxy(const shared_ptr<rpc::Messenger>& messeng
return Status::OK();
}
}
-
Sockaddr addr;
string host;
RETURN_NOT_OK(ResolveSockaddr(&addr, &host));
std::lock_guard<rw_spinlock> l(lock_);
if (!consensus_proxy_) {
+ HostPort hp;
+ RETURN_NOT_OK(hp.ParseString(host, addr.port()));
consensus_proxy_.reset(new consensus::ConsensusServiceProxy(
- messenger, addr, std::move(host)));
+ messenger, hp, dns_resolver_));
+ consensus_proxy_->Init(addr);
}
*proxy = consensus_proxy_;
return Status::OK();