You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by aw...@apache.org on 2021/10/08 23:55:52 UTC

[kudu] branch master updated: [master] KUDU-1885: re-resolve TSDescriptor proxies on network error

This is an automated email from the ASF dual-hosted git repository.

awong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new 1584e43  [master] KUDU-1885: re-resolve TSDescriptor proxies on network error
1584e43 is described below

commit 1584e4325cfce37f89fd1dbebff5f24ed980780f
Author: Andrew Wong <aw...@cloudera.com>
AuthorDate: Wed Sep 15 19:50:34 2021 -0700

    [master] KUDU-1885: re-resolve TSDescriptor proxies on network error
    
    This patch uses the capabilities added in KUDU-75 to add the ability for
    the master to re-resolve the addresses of tablet servers if its requests
    fail.
    
    Change-Id: I6245f75a232fd4827de684cfc04d6b6e53b7ddef
    Reviewed-on: http://gerrit.cloudera.org:8080/17909
    Reviewed-by: Alexey Serbin <as...@cloudera.com>
    Tested-by: Andrew Wong <aw...@cloudera.com>
---
 src/kudu/integration-tests/dns_alias-itest.cc | 82 +++++++++++++++++++++++++--
 src/kudu/integration-tests/test_workload.cc   |  2 +
 src/kudu/integration-tests/test_workload.h    |  5 ++
 src/kudu/master/ts_descriptor.cc              | 12 ++--
 4 files changed, 92 insertions(+), 9 deletions(-)

diff --git a/src/kudu/integration-tests/dns_alias-itest.cc b/src/kudu/integration-tests/dns_alias-itest.cc
index a801005..58735bd 100644
--- a/src/kudu/integration-tests/dns_alias-itest.cc
+++ b/src/kudu/integration-tests/dns_alias-itest.cc
@@ -35,11 +35,15 @@
 #include "kudu/integration-tests/test_workload.h"
 #include "kudu/mini-cluster/external_mini_cluster.h"
 #include "kudu/mini-cluster/mini_cluster.h"
+#include "kudu/rpc/rpc_controller.h"
+#include "kudu/tserver/tserver.pb.h"
+#include "kudu/tserver/tserver_service.proxy.h"
 #include "kudu/util/metrics.h"
 #include "kudu/util/monotime.h"
 #include "kudu/util/net/net_util.h"
 #include "kudu/util/net/sockaddr.h"
 #include "kudu/util/net/socket.h"
+#include "kudu/util/pb_util.h"
 #include "kudu/util/status.h"
 #include "kudu/util/test_macros.h"
 #include "kudu/util/test_util.h"
@@ -56,6 +60,7 @@ using kudu::client::KuduTabletServer;
 using kudu::cluster::ExternalDaemon;
 using kudu::cluster::ExternalMiniCluster;
 using kudu::cluster::ExternalMiniClusterOptions;
+using kudu::pb_util::SecureShortDebugString;
 using std::string;
 using std::unique_ptr;
 using std::vector;
@@ -76,8 +81,10 @@ class DnsAliasITest : public KuduTest {
     SetUpCluster();
   }
 
-  void SetUpCluster(vector<string> extra_master_flags = {},
-                    vector<string> extra_tserver_flags = {}) {
+  // TODO(awong): more plumbing is needed to allow the server to be restarted
+  // bound to a different address with the webserver, so just disable it.
+  void SetUpCluster(vector<string> extra_master_flags = { "--webserver_enabled=false" },
+                    vector<string> extra_tserver_flags = { "--webserver_enabled=false" }) {
     ExternalMiniClusterOptions opts;
     opts.num_masters = 3;
     opts.num_tablet_servers = 3;
@@ -133,9 +140,6 @@ class DnsAliasITest : public KuduTest {
     HostPort new_ip_hp(new_addr.host(), new_addr.port());
     daemon->SetRpcBindAddress(new_ip_hp);
     daemon->mutable_flags()->emplace_back("--rpc_reuseport=true");
-    // TODO(awong): more plumbing is needed to allow the server to startup with
-    // the webserver, so just disable it.
-    daemon->mutable_flags()->emplace_back("--webserver_enabled=false");
     daemon->mutable_flags()->emplace_back(
         Substitute("--dns_addr_resolution_override=$0", new_overrides_str));
   }
@@ -356,6 +360,74 @@ TEST_F(DnsAliasITest, TestMasterReresolveOnStartup) {
   });
 }
 
+// Regression test for KUDU-1885, wherein tserver proxies on the masters don't
+// eventually succeed when the tserver's address changes.
+TEST_F(DnsAliasITest, Kudu1885) {
+  // First, wait for all tablet servers to report to the masters.
+  ASSERT_EVENTUALLY([&] {
+    vector<KuduTabletServer*> tservers;
+    ElementDeleter deleter(&tservers);
+    ASSERT_OK(client_->ListTabletServers(&tservers));
+    ASSERT_EQ(cluster_->num_tablet_servers(), tservers.size());
+  });
+  auto* tserver = cluster_->tablet_server(cluster_->num_tablet_servers() - 1);
+  // Shut down a tablet server so we can start it up with a different address.
+  tserver->Shutdown();
+  unique_ptr<Socket> reserved_socket;
+  ASSERT_OK(cluster_->ReserveDaemonSocket(cluster::ExternalMiniCluster::DaemonType::TSERVER, 3,
+                                          kDefaultBindMode, &reserved_socket,
+                                          tserver->bound_rpc_hostport().port()));
+  Sockaddr new_addr;
+  ASSERT_OK(reserved_socket->GetSocketAddress(&new_addr));
+  auto new_overrides_str = GetNewOverridesFlag(ExternalMiniCluster::DaemonType::TSERVER, new_addr);
+  SetUpDaemonForNewAddr(new_addr, new_overrides_str, tserver);
+
+  // Create several tables. Based on Kudu's tablet placement algorithm, some
+  // should be assigned to the tserver with a new address. This will start some
+  // tasks on the master to send requests to tablet servers (some of which will
+  // fail because of the down server).
+  // NOTE: master's will wait up to --tserver_unresponsive_timeout_ms before
+  // stopping replica placement on the down server. By default, this is 60
+  // seconds, so we can proceed expecting placement on the down tserver.
+  for (int i = 0; i < 10; i++) {
+    TestWorkload w(cluster_.get());
+    w.set_table_name(Substitute("default.table_$0", i));
+    w.set_num_replicas(1);
+    // Some tablet creations will initially fail until we restart the down
+    // server, so have our client not wait for creation to finish.
+    w.set_wait_for_create(false);
+    w.Setup();
+  }
+  ASSERT_OK(tserver->Restart());
+
+  // Allow the rest of the cluster to start seeing the re-addressed server.
+  SetFlagsOnRemainingCluster(ExternalMiniCluster::DaemonType::TSERVER, new_overrides_str);
+  FLAGS_dns_addr_resolution_override = new_overrides_str;
+  ClusterVerifier v(cluster_.get());
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_OK(v.RunKsck());
+  });
+
+  // Ensure there's no funny business with the tserver coming up at a new
+  // address -- we should have the same number of tablet servers.
+  vector<KuduTabletServer*> tservers;
+  ElementDeleter deleter(&tservers);
+  ASSERT_OK(client_->ListTabletServers(&tservers));
+  ASSERT_EQ(cluster_->num_tablet_servers(), tservers.size());
+
+  // Some tablets should be assigned to the tablet we re-addressed, and the
+  // create tablet requests from the masters should have been routed as
+  // appropriate.
+  auto tserver_proxy = cluster_->tserver_proxy(cluster_->num_tablet_servers() - 1);
+  tserver::ListTabletsRequestPB req;
+  req.set_need_schema_info(false);
+  tserver::ListTabletsResponsePB resp;
+  rpc::RpcController controller;
+  ASSERT_OK(tserver_proxy->ListTablets(req, &resp, &controller));
+  ASSERT_FALSE(resp.has_error()) << SecureShortDebugString(resp.error());
+  ASSERT_GT(resp.status_and_schema_size(), 0);
+}
+
 #endif
 
 } // namespace itest
diff --git a/src/kudu/integration-tests/test_workload.cc b/src/kudu/integration-tests/test_workload.cc
index 2cbc05a..1044ec6 100644
--- a/src/kudu/integration-tests/test_workload.cc
+++ b/src/kudu/integration-tests/test_workload.cc
@@ -81,6 +81,7 @@ TestWorkload::TestWorkload(MiniCluster* cluster,
     begin_txn_(false),
     commit_txn_(false),
     rollback_txn_(false),
+    wait_for_create_(true),
     fault_tolerant_(true),
     verify_num_rows_(true),
     read_errors_allowed_(false),
@@ -340,6 +341,7 @@ void TestWorkload::Setup() {
     table_creator
         .table_name(table_name_)
         .schema(&schema_)
+        .wait(wait_for_create_)
         .num_replicas(num_replicas_);
 
     switch (partitioning_) {
diff --git a/src/kudu/integration-tests/test_workload.h b/src/kudu/integration-tests/test_workload.h
index 57ef175..ee72304 100644
--- a/src/kudu/integration-tests/test_workload.h
+++ b/src/kudu/integration-tests/test_workload.h
@@ -167,6 +167,10 @@ class TestWorkload {
     write_timeout_millis_ = t;
   }
 
+  void set_wait_for_create(bool wait) {
+    wait_for_create_ = wait;
+  }
+
   // Set whether to fail if we see a TimedOut() error inserting a row.
   // By default, this triggers a CHECK failure.
   void set_timeout_allowed(bool allowed) {
@@ -350,6 +354,7 @@ class TestWorkload {
   bool begin_txn_;
   bool commit_txn_;
   bool rollback_txn_;
+  bool wait_for_create_;
   bool fault_tolerant_;
   bool verify_num_rows_;
   bool read_errors_allowed_;
diff --git a/src/kudu/master/ts_descriptor.cc b/src/kudu/master/ts_descriptor.cc
index 678392e..6499e68 100644
--- a/src/kudu/master/ts_descriptor.cc
+++ b/src/kudu/master/ts_descriptor.cc
@@ -278,15 +278,17 @@ Status TSDescriptor::GetTSAdminProxy(const shared_ptr<rpc::Messenger>& messenger
       return Status::OK();
     }
   }
-
   Sockaddr addr;
   string host;
   RETURN_NOT_OK(ResolveSockaddr(&addr, &host));
 
   std::lock_guard<rw_spinlock> l(lock_);
   if (!ts_admin_proxy_) {
+    HostPort hp;
+    RETURN_NOT_OK(hp.ParseString(host, addr.port()));
     ts_admin_proxy_.reset(new tserver::TabletServerAdminServiceProxy(
-        messenger, addr, std::move(host)));
+        messenger, hp, dns_resolver_));
+    ts_admin_proxy_->Init(addr);
   }
   *proxy = ts_admin_proxy_;
   return Status::OK();
@@ -301,15 +303,17 @@ Status TSDescriptor::GetConsensusProxy(const shared_ptr<rpc::Messenger>& messeng
       return Status::OK();
     }
   }
-
   Sockaddr addr;
   string host;
   RETURN_NOT_OK(ResolveSockaddr(&addr, &host));
 
   std::lock_guard<rw_spinlock> l(lock_);
   if (!consensus_proxy_) {
+    HostPort hp;
+    RETURN_NOT_OK(hp.ParseString(host, addr.port()));
     consensus_proxy_.reset(new consensus::ConsensusServiceProxy(
-        messenger, addr, std::move(host)));
+        messenger, hp, dns_resolver_));
+    consensus_proxy_->Init(addr);
   }
   *proxy = consensus_proxy_;
   return Status::OK();