You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by aw...@apache.org on 2021/11/23 21:33:08 UTC

[kudu] branch master updated: [master] KUDU-3311 Allow to start with diff num of masters

This is an automated email from the ASF dual-hosted git repository.

awong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new e3bcd74  [master] KUDU-3311 Allow to start with diff num of masters
e3bcd74 is described below

commit e3bcd743d6c2d8791591c69260e7d9e6bc59fef9
Author: Zoltan Chovan <zc...@cloudera.com>
AuthorDate: Thu Oct 28 14:32:40 2021 +0200

    [master] KUDU-3311 Allow to start with diff num of masters
    
    Modified SysCatalogTable:Load to allow cluster startup if +1
    master address is present in the flags.
    The startup is halted, if the number of masters on-disk is higher than
    what was provided in the flags, discouraging decommissioning masters
    this way.
    
    Additional manual test was done with the following steps in order to
    ensure, that the documented way of removing a master works:
    - ./start_kudu.sh -m 4 -t 3 -b $KUDU_BUILD_DIR -c /tmp/data --rpc-master 8800
    - looked up the "last" master which was listening on :8006 and removed
      it by running:
      ./kudu master remove 127.0.0.1:8800,127.0.0.1:8802,127.0.0.1:8804,127.0.0.1:8806 127.0.0.1:8806
    - stopped the cluster by running ./stop_kudu.sh
    - restarted with 3 masters by running
      ./start_kudu.sh -m 3 -t 3 -b $KUDU_BUILD_DIR -c /tmp/data --rpc-master 8800
    
    Change-Id: I39aeee2f52a55a8c29770f748895d38c9adff8a2
    Reviewed-on: http://gerrit.cloudera.org:8080/17995
    Reviewed-by: Andrew Wong <aw...@cloudera.com>
    Tested-by: Kudu Jenkins
---
 .../integration-tests/master_replication-itest.cc  |  4 +-
 src/kudu/master/dynamic_multi_master-test.cc       | 57 ++++++++++++++++++++++
 src/kudu/master/master_options-test.cc             | 55 ++++++++++++++++++++-
 src/kudu/master/sys_catalog.cc                     | 33 ++++++++++---
 src/kudu/mini-cluster/external_mini_cluster.h      |  7 ++-
 5 files changed, 141 insertions(+), 15 deletions(-)

diff --git a/src/kudu/integration-tests/master_replication-itest.cc b/src/kudu/integration-tests/master_replication-itest.cc
index b7732ea..5995fa8 100644
--- a/src/kudu/integration-tests/master_replication-itest.cc
+++ b/src/kudu/integration-tests/master_replication-itest.cc
@@ -307,17 +307,19 @@ TEST_F(MasterReplicationTest, TestHeartbeatAcceptedByAnyMaster) {
 }
 
 TEST_F(MasterReplicationTest, TestMasterPeerSetsDontMatch) {
-  // Restart one master with an additional entry in --master_addresses. The
+  // Restart one master with two additional entries  in --master_addresses. The
   // discrepancy with the on-disk list of masters should trigger a failure.
   vector<HostPort> master_rpc_addrs = cluster_->master_rpc_addrs();
   cluster_->mini_master(0)->Shutdown();
   master_rpc_addrs.emplace_back("127.0.0.1", 55555);
+  master_rpc_addrs.emplace_back("127.0.0.1", 55556);
   cluster_->mini_master(0)->SetMasterAddresses(master_rpc_addrs);
   ASSERT_OK(cluster_->mini_master(0)->Start());
   Status s = cluster_->mini_master(0)->WaitForCatalogManagerInit();
   SCOPED_TRACE(s.ToString());
   ASSERT_TRUE(s.IsInvalidArgument());
   ASSERT_STR_CONTAINS(s.ToString(), "55555");
+  ASSERT_STR_CONTAINS(s.ToString(), "55556");
 }
 
 TEST_F(MasterReplicationTest, TestConnectToClusterReturnsAddresses) {
diff --git a/src/kudu/master/dynamic_multi_master-test.cc b/src/kudu/master/dynamic_multi_master-test.cc
index 71b278f..147707c 100644
--- a/src/kudu/master/dynamic_multi_master-test.cc
+++ b/src/kudu/master/dynamic_multi_master-test.cc
@@ -35,6 +35,7 @@
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include <kudu/gutil/strings/util.h>
 
 #include "kudu/client/client.h"
 #include "kudu/client/schema.h"
@@ -1713,6 +1714,62 @@ TEST_F(AutoAddMasterTest, TestAddWithOnGoingDdl) {
   }
 }
 
+TEST_F(AutoAddMasterTest, TestAddNewMaster) {
+  SKIP_IF_SLOW_NOT_ALLOWED();
+
+  // Let's get the current master addresses and add a new one to them
+  vector<HostPort> master_addrs = cluster_->master_rpc_addrs();
+  unique_ptr<Socket> reserved_socket;
+  ASSERT_OK(MiniCluster::ReserveDaemonSocket(
+      MiniCluster::DaemonType::MASTER,
+      master_addrs.size(),
+      opts_.bind_mode,
+      &reserved_socket));
+
+  Sockaddr addr;
+  ASSERT_OK(reserved_socket->GetSocketAddress(&addr));
+  master_addrs.emplace_back(addr.host(), addr.port());
+
+  const auto& new_master_addrs_list =
+      HostPort::ToCommaSeparatedString(master_addrs);
+
+  cluster_->Shutdown();
+
+  // We have to remove the previous '--master_addresses' flag to ensure there
+  // are no duplicates, then we set the new addresses on the existing masters.
+  for (int i = 0; i < cluster_->num_masters(); i++) {
+    auto* flags = cluster_->master(i)->mutable_flags();
+    flags->erase(
+        remove_if(flags->begin(),
+                  flags->end(),
+                  [](const string &s) {
+                    return HasPrefixString(s, "--master_addresses=");
+                  }), flags->end());
+  }
+
+  for (int i = 0; i < cluster_->num_masters(); i++) {
+    cluster_->master(i)->mutable_flags()->emplace_back(
+      Substitute("--master_addresses=$0", new_master_addrs_list));
+  }
+
+  // Starting up the two existing masters with three addresses should cause no
+  // issues.
+  ASSERT_OK(cluster_->Restart());
+  ASSERT_OK(cluster_->master(0)->WaitForCatalogManager());
+
+  // Let's create the new master and start it to ensure it starts up okay.
+  scoped_refptr<ExternalMaster> peer;
+  auto idx = cluster_->master_rpc_addrs().size();
+  ASSERT_OK(cluster_->CreateMaster(master_addrs, idx, &peer));
+  ASSERT_OK(peer->Start());
+  ASSERT_OK(peer->WaitForCatalogManager());
+  auto expected_num_masters = ++idx;
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_OK(VerifyVotersOnAllMasters(expected_num_masters, cluster_.get()));
+  });
+  NO_FATALS(cluster_->AssertNoCrashes());
+}
+
 class ParameterizedAutoAddMasterTest : public AutoAddMasterTest,
                                        public ::testing::WithParamInterface<tuple<int, bool>> {
  public:
diff --git a/src/kudu/master/master_options-test.cc b/src/kudu/master/master_options-test.cc
index 58ca409..6d6969e 100644
--- a/src/kudu/master/master_options-test.cc
+++ b/src/kudu/master/master_options-test.cc
@@ -189,17 +189,68 @@ TEST_F(MasterOptionsTest, TestSingleMasterForRaftConfigUpgrade) {
   // result in master bring up error.
   cluster.mini_master()->Shutdown();
   cluster.mini_master()->SetMasterAddresses(
-      {master_addresses[0], HostPort("master-2", Master::kDefaultPort)});
+      {master_addresses[0],
+          HostPort("master-2", Master::kDefaultPort),
+          HostPort("master-3", Master::kDefaultPort),
+      });
   // For multi-master configuration, as derived from --master_addresses flag, Restart()
   // doesn't wait for catalog manager init.
   s = cluster.mini_master(0)->Restart();
   ASSERT_TRUE(s.ok() || s.IsInvalidArgument());
   s = cluster.mini_master(0)->master()->WaitForCatalogManagerInit();
-  ASSERT_TRUE(s.IsInvalidArgument());
   ASSERT_STR_MATCHES(s.ToString(),
                      "Unable to initialize catalog manager: Failed to initialize sys tables async.*"
                      "Their symmetric difference is: master-2.*");
 }
 
+TEST_F(MasterOptionsTest, TestAddSingleMaster) {
+  InternalMiniClusterOptions opts;
+  opts.num_masters = 2;
+  opts.supply_single_master_addr = false;
+  InternalMiniCluster cluster(env_, opts);
+  ASSERT_OK(cluster.Start());
+
+  const vector<HostPort> master_addresses =
+      { HostPort(cluster.mini_master(0)->bound_rpc_addr()),
+        HostPort(cluster.mini_master(1)->bound_rpc_addr()),
+      };
+
+  cluster.mini_master(0)->Shutdown();
+  cluster.mini_master(0)->SetMasterAddresses({
+      master_addresses[0],
+      master_addresses[1],
+      HostPort("master-2", Master::kDefaultPort)
+  });
+
+  Status s = cluster.mini_master(0)->Restart();
+  ASSERT_TRUE(s.ok());
+  ASSERT_OK(cluster.mini_master(0)->master()->WaitForCatalogManagerInit());
+}
+
+TEST_F(MasterOptionsTest, TestRemoveMaster) {
+  InternalMiniClusterOptions opts;
+  opts.num_masters = 3;
+  opts.supply_single_master_addr = false;
+  InternalMiniCluster cluster(env_, opts);
+  ASSERT_OK(cluster.Start());
+
+  const vector<HostPort> master_addresses = cluster.master_rpc_addrs();
+
+  cluster.mini_master(0)->Shutdown();
+  cluster.mini_master(0)->SetMasterAddresses({
+      master_addresses[0],
+      master_addresses[1]
+  });
+
+  Status s = cluster.mini_master(0)->Restart();
+  ASSERT_TRUE(s.ok() || s.IsInvalidArgument());
+  s = cluster.mini_master(0)->master()->WaitForCatalogManagerInit();
+  ASSERT_STR_CONTAINS(s.ToString(), "Unable to initialize catalog manager");
+  ASSERT_STR_CONTAINS(s.ToString(),
+                      "If trying to remove one or more masters from the cluster,"
+                      " please follow the documented steps to do so.");
+}
+
+
 } // namespace master
 } // namespace kudu
diff --git a/src/kudu/master/sys_catalog.cc b/src/kudu/master/sys_catalog.cc
index 72ae70f..74662ce 100644
--- a/src/kudu/master/sys_catalog.cc
+++ b/src/kudu/master/sys_catalog.cc
@@ -291,14 +291,31 @@ Status SysCatalogTable::Load(FsManager *fs_manager) {
                                   peer_addrs_from_disk.begin(),
                                   peer_addrs_from_disk.end(),
                                   back_inserter(symm_diff));
-    if (!symm_diff.empty()) {
-      string msg = Substitute(
-          "on-disk master list ($0) and provided master list ($1) differ. "
-          "Their symmetric difference is: $2",
-          JoinStrings(peer_addrs_from_disk, ", "),
-          JoinStrings(peer_addrs_from_opts, ", "),
-          JoinStrings(symm_diff, ", "));
-      return Status::InvalidArgument(msg);
+
+
+    // We should prevent starting with fewer masters than previously. This way if the user wants to
+    // remove a master they would have to do it manually.
+    if (peer_addrs_from_opts.size() < peer_addrs_from_disk.size()) {
+      return Status::InvalidArgument(
+          Substitute("on-disk master list ($0) has more entries than provided master "
+                     "list ($1). Their symmetric difference is: $2. "
+                     "If trying to remove one or more masters from the cluster, please follow the "
+                     "documented steps to do so.",
+                     JoinStrings(peer_addrs_from_disk, ", "),
+                     JoinStrings(peer_addrs_from_opts, ", "),
+                     JoinStrings(symm_diff, ", ")));
+    }
+    if (symm_diff.size() > 1) {
+      return Status::InvalidArgument(
+          Substitute(
+              "on-disk master list ($0) and provided master list ($1) differ by more "
+              "than one address. Their symmetric difference is: $2",
+              JoinStrings(peer_addrs_from_disk, ", "),
+              JoinStrings(peer_addrs_from_opts, ", "),
+              JoinStrings(symm_diff, ", ")));
+    }
+    if (symm_diff.size() == 1) {
+      LOG(INFO) << Substitute("Detected one additional master provided: $0", symm_diff[0]);
     }
   }
 
diff --git a/src/kudu/mini-cluster/external_mini_cluster.h b/src/kudu/mini-cluster/external_mini_cluster.h
index 971727b..bcd7c26 100644
--- a/src/kudu/mini-cluster/external_mini_cluster.h
+++ b/src/kudu/mini-cluster/external_mini_cluster.h
@@ -504,10 +504,6 @@ class ExternalMiniCluster : public MiniCluster {
   const std::string& dns_overrides() const {
     return dns_overrides_;
   }
-
- private:
-  Status StartMasters();
-
   // Constructs an ExternalMaster based on 'opts_' but with the given set of
   // master addresses, giving the new master the address in the list
   // corresponding to 'idx'. Callers are expected to call Start() with the
@@ -518,6 +514,9 @@ class ExternalMiniCluster : public MiniCluster {
   Status CreateMaster(const std::vector<HostPort>& master_rpc_addrs, int idx,
                       scoped_refptr<ExternalMaster>* master);
 
+ private:
+  Status StartMasters();
+
   Status DeduceBinRoot(std::string* ret);
   Status HandleOptions();