Posted to commits@kudu.apache.org by ad...@apache.org on 2019/10/01 06:05:24 UTC

[kudu] branch master updated (1efa619 -> 2e1c679)

This is an automated email from the ASF dual-hosted git repository.

adar pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git.


    from 1efa619  [mini_cluster] introduce 'builtin' clock source
     new 5316a89  KUDU-2069 p4: stop replication from failed servers in maintenance mode
     new 3cd6bd0  KUDU-2069 p5: recheck tablet health when exiting maintenance mode
     new 8ef7e18  trace-test: fix kernel stack watchdog data race
     new 2e1c679  table_locations-itest: restrict benchmark to slow mode

The 4 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.


Summary of changes:
 src/kudu/consensus/quorum_util-test.cc             | 128 ++++++
 src/kudu/consensus/quorum_util.cc                  |  42 +-
 src/kudu/consensus/quorum_util.h                   |  20 +-
 src/kudu/integration-tests/CMakeLists.txt          |   2 +
 .../integration-tests/maintenance_mode-itest.cc    | 468 +++++++++++++++++++++
 .../integration-tests/table_locations-itest.cc     |   1 +
 src/kudu/master/catalog_manager.cc                 |   5 +-
 src/kudu/master/master_service.cc                  |  14 +
 src/kudu/master/ts_descriptor.cc                   |  11 +
 src/kudu/master/ts_descriptor.h                    |   9 +
 src/kudu/master/ts_manager.cc                      |  29 +-
 src/kudu/master/ts_manager.h                       |  20 +-
 src/kudu/master/ts_state-test.cc                   |   9 +-
 src/kudu/util/debug/trace_event_impl.cc            |  11 +-
 src/kudu/util/trace-test.cc                        |  25 +-
 15 files changed, 755 insertions(+), 39 deletions(-)
 create mode 100644 src/kudu/integration-tests/maintenance_mode-itest.cc


[kudu] 01/04: KUDU-2069 p4: stop replication from failed servers in maintenance mode

Posted by ad...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

adar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 5316a89dfd13c36eef078b32043f161e6d0bbf01
Author: Andrew Wong <aw...@apache.org>
AuthorDate: Mon Sep 16 14:40:21 2019 -0700

    KUDU-2069 p4: stop replication from failed servers in maintenance mode
    
    When determining whether a replica needs to be added, we may now
    consider a set of UUIDs that are allowed to be in a "bad" state
    without counting towards under-replication.
    
    Since the goal of maintenance mode is to cope with unexpected failures,
    "healthy" movement, e.g. through tooling that uses REPLACE and PROMOTE
    tagging, is still allowed to and from tservers in maintenance mode.
    
    Testing:
    - a unit test is added to exercise the new quorum logic to ignore
      certain failed UUIDs, taking into account REPLACE and PROMOTE replicas
    - integration tests are added to test:
      - behavior with RF=3 through restarts of the master
      - behavior when running move_replica tooling
      - behavior with RF=5 with background failures
    
    Change-Id: I9a63b55011d16900c0d27eac0eb75880074204db
    Reviewed-on: http://gerrit.cloudera.org:8080/14222
    Reviewed-by: Adar Dembo <ad...@cloudera.com>
    Reviewed-by: Alexey Serbin <as...@cloudera.com>
    Tested-by: Andrew Wong <aw...@cloudera.com>
---
 src/kudu/consensus/quorum_util-test.cc             | 128 ++++++
 src/kudu/consensus/quorum_util.cc                  |  42 +-
 src/kudu/consensus/quorum_util.h                   |  20 +-
 src/kudu/integration-tests/CMakeLists.txt          |   2 +
 .../integration-tests/maintenance_mode-itest.cc    | 457 +++++++++++++++++++++
 src/kudu/master/catalog_manager.cc                 |   5 +-
 src/kudu/master/ts_manager.cc                      |  18 +-
 src/kudu/master/ts_manager.h                       |   9 +
 8 files changed, 657 insertions(+), 24 deletions(-)
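
As context for the hunks below: the third parameter added to
ShouldAddReplica() is optional and defaults to an empty set, so existing
callers are unchanged. A minimal caller sketch, assuming only the
declarations visible in this diff (the wrapper function and variable
names are illustrative, not part of the commit):

  #include <string>
  #include <unordered_set>

  #include "kudu/consensus/metadata.pb.h"  // RaftConfigPB
  #include "kudu/consensus/quorum_util.h"  // ShouldAddReplica()

  // Hypothetical caller: decide whether a tablet needs another replica,
  // ignoring failures of tservers currently in maintenance mode.
  bool NeedsNewReplica(const kudu::consensus::RaftConfigPB& config,
                       int replication_factor,
                       const std::unordered_set<std::string>& in_maintenance) {
    // Failed voters whose UUIDs appear in 'in_maintenance' don't count
    // towards under-replication; omitting the argument preserves the old
    // behavior.
    return kudu::consensus::ShouldAddReplica(config, replication_factor,
                                             in_maintenance);
  }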

diff --git a/src/kudu/consensus/quorum_util-test.cc b/src/kudu/consensus/quorum_util-test.cc
index c478be8..a0b3851 100644
--- a/src/kudu/consensus/quorum_util-test.cc
+++ b/src/kudu/consensus/quorum_util-test.cc
@@ -561,6 +561,134 @@ TEST(QuorumUtilTest, ShouldAddReplica) {
   }
 }
 
+// Test that when tablet replicas are ignored for underreplication (e.g. due to
+// maintenance mode of a tablet server), the decision to add a replica will
+// actually ignore failures as appropriate.
+TEST(QuorumUtilTest, ShouldAddReplicaIgnoreFailures) {
+  {
+    RaftConfigPB config;
+    AddPeer(&config, "A", V, '?');
+    AddPeer(&config, "B", V, '-');
+    AddPeer(&config, "C", V, '+');
+    // The failed server is ignored, and doesn't count towards being
+    // under-replicated. Note: The server with unknown health also doesn't
+    // count towards being under-replicated.
+    EXPECT_FALSE(ShouldAddReplica(config, 3, { "B" }));
+    // While the server with unknown health doesn't count towards being
+    // under-replicated, the failed server does. But since we require a
+    // majority to add replicas, we can't add a replica.
+    EXPECT_FALSE(ShouldAddReplica(config, 3, { "A" }));
+  }
+  {
+    RaftConfigPB config;
+    AddPeer(&config, "A", V, '?');
+    AddPeer(&config, "B", V, '+');
+    AddPeer(&config, "C", V, '+');
+    // This is healthy, with or without ignoring failures.
+    EXPECT_FALSE(ShouldAddReplica(config, 3));
+    EXPECT_FALSE(ShouldAddReplica(config, 3, { "A" }));
+  }
+  {
+    RaftConfigPB config;
+    AddPeer(&config, "A", V, '-');
+    AddPeer(&config, "B", V, '+');
+    AddPeer(&config, "C", V, '+');
+    // But when a healthy server is in maintenance mode, we should consider the
+    // unhealthy server as failed and add a replica.
+    EXPECT_TRUE(ShouldAddReplica(config, 3, { "B" }));
+    // When the unhealthy server is in maintenance mode, we shouldn't add a
+    // replica, since none of the three servers is then considered failed.
+    EXPECT_FALSE(ShouldAddReplica(config, 3, { "A" }));
+    // And when everything is in maintenance mode, we shouldn't add a replica
+    // even though a majority exists.
+    EXPECT_FALSE(ShouldAddReplica(config, 3, { "A", "B", "C" }));
+  }
+  {
+    RaftConfigPB config;
+    AddPeer(&config, "A", V, '-');
+    AddPeer(&config, "B", V, '-');
+    AddPeer(&config, "C", V, '+');
+    // A majority doesn't exist, so no matter what failures are being ignored,
+    // we will not add a replica.
+    EXPECT_FALSE(ShouldAddReplica(config, 3, { "A" }));
+    EXPECT_FALSE(ShouldAddReplica(config, 3, { "A", "B" }));
+    EXPECT_FALSE(ShouldAddReplica(config, 3, { "A", "B", "C" }));
+  }
+  {
+    // Ignored servers shouldn't change the decision when we really are
+    // under-replicated.
+    RaftConfigPB config;
+    AddPeer(&config, "A", V, '-');
+    AddPeer(&config, "B", V, '+');
+
+    // No majority present.
+    EXPECT_FALSE(ShouldAddReplica(config, 3, { "A" }));
+    EXPECT_FALSE(ShouldAddReplica(config, 3, { "B" }));
+  }
+  {
+    RaftConfigPB config;
+    AddPeer(&config, "A", V, '-');
+    AddPeer(&config, "B", V, '-');
+    AddPeer(&config, "C", V, '+');
+    AddPeer(&config, "D", V, '+');
+    AddPeer(&config, "E", V, '+');
+    // When both failed replicas are being ignored, we shouldn't add a replica.
+    EXPECT_FALSE(ShouldAddReplica(config, 5, { "A", "B" }));
+    // When only one of them is ignored, we should.
+    EXPECT_TRUE(ShouldAddReplica(config, 5, { "A" }));
+  }
+}
+
+// Test that when tablet replicas are ignored for underreplication, replace is
+// still honored as appropriate.
+TEST(QuorumUtilTest, ShouldAddReplicaHonorReplaceWhenIgnoringFailures) {
+  // Even if the replica to replace is meant to be ignored on failure, we
+  // should honor the replacement and try to add a replica.
+  for (char health : { '+', '-', '?' }) {
+    RaftConfigPB config;
+    AddPeer(&config, "A", V, health, {{"REPLACE", true}});
+    AddPeer(&config, "B", V, '+');
+    AddPeer(&config, "C", V, '+');
+    EXPECT_TRUE(ShouldAddReplica(config, 3, { "A" }));
+  }
+  {
+    RaftConfigPB config;
+    AddPeer(&config, "A", V, '+', {{"REPLACE", true}});
+    AddPeer(&config, "B", V, '-');
+    AddPeer(&config, "C", V, '+');
+    // Ignoring failures shouldn't impede our ability to add a replica when the
+    // "ignored" server is actually healthy.
+    EXPECT_TRUE(ShouldAddReplica(config, 3, { "A" }));
+  }
+}
+
+TEST(QuorumUtilTest, ShouldAddReplicaHonorPromoteWhenIgnoringFailures) {
+  // If one of our replicas to promote has failed, and we are supposed to
+  // ignore its failure, we should not add a replica because of it.
+  // And if they're healthy or unknown, we also shouldn't add a replica.
+  for (char health : { '+', '-', '?' }) {
+    {
+      RaftConfigPB config;
+      AddPeer(&config, "A", N, health, {{"PROMOTE", true}});
+      AddPeer(&config, "B", V, '+');
+      AddPeer(&config, "C", V, '+');
+      EXPECT_FALSE(ShouldAddReplica(config, 3, { "A" }));
+    }
+    {
+      RaftConfigPB config;
+      AddPeer(&config, "A", N, health, {{"PROMOTE", true}});
+      AddPeer(&config, "B", N, '-', {{"PROMOTE", true}});
+      AddPeer(&config, "C", V, '+');
+      AddPeer(&config, "D", V, '+');
+      AddPeer(&config, "E", V, '+');
+      EXPECT_FALSE(ShouldAddReplica(config, 5, { "A", "B" }));
+      // But when there is a failure that isn't supposed to be ignored (B), we
+      // should add a replica.
+      EXPECT_TRUE(ShouldAddReplica(config, 5, { "A" }));
+    }
+  }
+}
+
 // Verify logic of the kudu::consensus::ShouldEvictReplica(), anticipating
 // removal of a voter replica.
 TEST(QuorumUtilTest, ShouldEvictReplicaVoters) {
diff --git a/src/kudu/consensus/quorum_util.cc b/src/kudu/consensus/quorum_util.cc
index 40f09cb..dccf0fa 100644
--- a/src/kudu/consensus/quorum_util.cc
+++ b/src/kudu/consensus/quorum_util.cc
@@ -22,6 +22,7 @@
 #include <queue>
 #include <set>
 #include <string>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 
@@ -44,6 +45,7 @@ using std::pair;
 using std::priority_queue;
 using std::set;
 using std::string;
+using std::unordered_set;
 using std::vector;
 using strings::Substitute;
 
@@ -418,8 +420,9 @@ string DiffConsensusStates(const ConsensusStatePB& old_state,
 
 // The decision is based on:
 //
-//   * the number of voter replicas in definitively bad shape and replicas
-//     marked with the REPLACE attribute
+//   * the number of non-ignored voter replicas (i.e. those that aren't
+//     allowed to be in bad shape) that are definitively in bad shape or are
+//     marked with the REPLACE attribute.
 //
 //   * the number of non-voter replicas marked with the PROMOTE=true attribute
 //     in good or possibly good state.
@@ -433,7 +436,8 @@ string DiffConsensusStates(const ConsensusStatePB& old_state,
 // TODO(aserbin): add a test scenario for the leader replica's logic to cover
 //                the latter case.
 bool ShouldAddReplica(const RaftConfigPB& config,
-                      int replication_factor) {
+                      int replication_factor,
+                      const unordered_set<string>& uuids_ignored_for_underreplication) {
   int num_voters_total = 0;
   int num_voters_healthy = 0;
   int num_voters_need_replacement = 0;
@@ -445,12 +449,24 @@ bool ShouldAddReplica(const RaftConfigPB& config,
   VLOG(2) << "config to evaluate: " << SecureDebugString(config);
   for (const RaftPeerPB& peer : config.peers()) {
     const auto overall_health = peer.health_report().overall_health();
+    const auto& peer_uuid = peer.permanent_uuid();
+    bool ignore_failure_for_underreplication = ContainsKey(uuids_ignored_for_underreplication,
+                                                           peer_uuid);
+    if (VLOG_IS_ON(1) && ignore_failure_for_underreplication) {
+      VLOG(1) << Substitute("ignoring $0 if failed", peer_uuid);
+    }
     switch (peer.member_type()) {
       case RaftPeerPB::VOTER:
         ++num_voters_total;
-        if (peer.attrs().replace() ||
-            overall_health == HealthReportPB::FAILED ||
-            overall_health == HealthReportPB::FAILED_UNRECOVERABLE) {
+        if (peer.attrs().replace()) {
+          ++num_voters_need_replacement;
+        } else if (overall_health == HealthReportPB::FAILED ||
+                   overall_health == HealthReportPB::FAILED_UNRECOVERABLE) {
+          // If the failed peer should be ignored, e.g. the server is in
+          // maintenance mode, don't count it towards under-replication.
+          if (ignore_failure_for_underreplication) {
+            continue;
+          }
           ++num_voters_need_replacement;
         }
         if (overall_health == HealthReportPB::HEALTHY) {
@@ -458,17 +474,19 @@ bool ShouldAddReplica(const RaftConfigPB& config,
         }
         break;
       case RaftPeerPB::NON_VOTER:
-        if (peer.attrs().promote() &&
-            overall_health != HealthReportPB::FAILED &&
-            overall_health != HealthReportPB::FAILED_UNRECOVERABLE) {
-          // A replica with HEALTHY or UNKNOWN overall health status
-          // is considered as a replica to promote: a new non-voter replica is
+        if (peer.attrs().promote()) {
+          // A replica with HEALTHY or UNKNOWN overall health status is
+          // considered as a replica to promote: a new non-voter replica is
           // added with UNKNOWN health status. If such a replica is not
           // responsive for a long time, then its state will change to
           // HealthReportPB::FAILED after some time and it will be evicted. But
           // before that, it's considered as a candidate for promotion in the
           // code below.
-          ++num_non_voters_to_promote;
+          if ((overall_health != HealthReportPB::FAILED &&
+               overall_health != HealthReportPB::FAILED_UNRECOVERABLE) ||
+              ignore_failure_for_underreplication) {
+            ++num_non_voters_to_promote;
+          }
         }
         break;
       default:
diff --git a/src/kudu/consensus/quorum_util.h b/src/kudu/consensus/quorum_util.h
index cc11319..e6afbaa 100644
--- a/src/kudu/consensus/quorum_util.h
+++ b/src/kudu/consensus/quorum_util.h
@@ -14,11 +14,10 @@
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
-
-#ifndef KUDU_CONSENSUS_QUORUM_UTIL_H_
-#define KUDU_CONSENSUS_QUORUM_UTIL_H_
+#pragma once
 
 #include <string>
+#include <unordered_set>
 
 #include "kudu/consensus/metadata.pb.h"
 #include "kudu/util/status.h"
@@ -106,12 +105,16 @@ std::string DiffConsensusStates(const ConsensusStatePB& old_state,
 std::string DiffRaftConfigs(const RaftConfigPB& old_config,
                             const RaftConfigPB& new_config);
 
-// Return 'true' iff the specified tablet configuration is under-replicated
-// given the 'replication_factor' and a healthy majority exists. The decision
-// is based on the health information provided by the Raft configuration in the
-// 'config' parameter.
+// Return 'true' iff there is a quorum and the specified tablet configuration
+// is under-replicated given the 'replication_factor', ignoring failures of
+// the UUIDs in 'uuids_ignored_for_underreplication'.
+//
+// The decision is based on the health information provided by the Raft
+// configuration in the 'config' parameter.
 bool ShouldAddReplica(const RaftConfigPB& config,
-                      int replication_factor);
+                      int replication_factor,
+                      const std::unordered_set<std::string>& uuids_ignored_for_underreplication =
+                          std::unordered_set<std::string>());
 
 // Check if the given Raft configuration contains at least one extra replica
 // which should (and can) be removed in accordance with the specified
@@ -126,4 +129,3 @@ bool ShouldEvictReplica(const RaftConfigPB& config,
 }  // namespace consensus
 }  // namespace kudu
 
-#endif /* KUDU_CONSENSUS_QUORUM_UTIL_H_ */
diff --git a/src/kudu/integration-tests/CMakeLists.txt b/src/kudu/integration-tests/CMakeLists.txt
index dd8b54f..cd103ab 100644
--- a/src/kudu/integration-tests/CMakeLists.txt
+++ b/src/kudu/integration-tests/CMakeLists.txt
@@ -45,6 +45,7 @@ target_link_libraries(itest_util
   kudu_curl_util
   kudu_fs
   kudu_test_util
+  kudu_tools_test_util
   kudu_tools_util
   security_test_util)
 add_dependencies(itest_util
@@ -81,6 +82,7 @@ ADD_KUDU_TEST(fuzz-itest RUN_SERIAL true)
 ADD_KUDU_TEST(heavy-update-compaction-itest RUN_SERIAL true)
 ADD_KUDU_TEST(linked_list-test RUN_SERIAL true)
 ADD_KUDU_TEST(log-rolling-itest)
+ADD_KUDU_TEST(maintenance_mode-itest)
 ADD_KUDU_TEST(master_cert_authority-itest PROCESSORS 2)
 ADD_KUDU_TEST(master_failover-itest NUM_SHARDS 4 PROCESSORS 3)
 ADD_KUDU_TEST_DEPENDENCIES(master_failover-itest
diff --git a/src/kudu/integration-tests/maintenance_mode-itest.cc b/src/kudu/integration-tests/maintenance_mode-itest.cc
new file mode 100644
index 0000000..6d620f4
--- /dev/null
+++ b/src/kudu/integration-tests/maintenance_mode-itest.cc
@@ -0,0 +1,457 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "kudu/consensus/consensus.pb.h"
+#include "kudu/consensus/metadata.pb.h"
+#include "kudu/gutil/map-util.h"
+#include "kudu/gutil/stl_util.h"
+#include "kudu/integration-tests/cluster_itest_util.h"
+#include "kudu/integration-tests/cluster_verifier.h"
+#include "kudu/integration-tests/external_mini_cluster-itest-base.h"
+#include "kudu/integration-tests/test_workload.h"
+#include "kudu/master/master.pb.h"
+#include "kudu/master/master.proxy.h"
+#include "kudu/mini-cluster/external_mini_cluster.h"
+#include "kudu/mini-cluster/mini_cluster.h"
+#include "kudu/rpc/rpc_controller.h"
+#include "kudu/tools/tool_test_util.h"
+#include "kudu/util/metrics.h"
+#include "kudu/util/monotime.h"
+#include "kudu/util/path_util.h"
+#include "kudu/util/net/sockaddr.h"
+#include "kudu/util/scoped_cleanup.h"
+#include "kudu/util/status.h"
+#include "kudu/util/test_macros.h"
+#include "kudu/util/test_util.h"
+
+METRIC_DECLARE_entity(server);
+METRIC_DECLARE_histogram(
+    handler_latency_kudu_tserver_TabletCopyService_BeginTabletCopySession);
+
+using kudu::master::ChangeTServerStateRequestPB;
+using kudu::master::ChangeTServerStateResponsePB;
+using kudu::cluster::ExternalDaemon;
+using kudu::cluster::ExternalMiniClusterOptions;
+using kudu::cluster::ExternalTabletServer;
+using kudu::consensus::ConsensusStatePB;
+using kudu::consensus::HealthReportPB;
+using kudu::consensus::IncludeHealthReport;
+using kudu::itest::GetInt64Metric;
+using kudu::master::MasterServiceProxy;
+using kudu::master::TServerStateChangePB;
+using kudu::tools::RunKuduTool;
+using std::pair;
+using std::shared_ptr;
+using std::string;
+using std::unique_ptr;
+using std::unordered_map;
+using std::vector;
+
+namespace kudu {
+namespace itest {
+
+typedef pair<unordered_map<string, TServerDetails*>, unique_ptr<ValueDeleter>> MapAndDeleter;
+static const vector<string> kTServerFlags = {
+  // Set a low unavailability timeout so replicas are considered failed and can
+  // be re-replicated more quickly.
+  "--raft_heartbeat_interval_ms=100",
+  "--follower_unavailable_considered_failed_sec=2",
+  // Disable log GC in case our write workloads lead to eviction, because
+  // consensus considers replicas that have fallen too far behind to be
+  // unrecoverable and will evict them regardless of maintenance mode.
+  "--enable_log_gc=false",
+};
+static const MonoDelta kDurationForSomeHeartbeats = MonoDelta::FromSeconds(3);
+
+class MaintenanceModeITest : public ExternalMiniClusterITestBase {
+ public:
+  void SetUpCluster(int num_tservers) {
+    ExternalMiniClusterOptions opts;
+    opts.num_tablet_servers = num_tservers;
+    opts.extra_master_flags = { "--master_support_maintenance_mode=true" };
+    opts.extra_tserver_flags = kTServerFlags;
+    NO_FATALS(StartClusterWithOpts(std::move(opts)));
+    const auto& addr = cluster_->master(0)->bound_rpc_addr();
+    m_proxy_.reset(new MasterServiceProxy(cluster_->messenger(), addr, addr.host()));
+  }
+
+  // Perform the given state change on the given tablet server.
+  Status ChangeTServerState(const string& uuid, TServerStateChangePB::StateChange change) {
+    ChangeTServerStateRequestPB req;
+    ChangeTServerStateResponsePB resp;
+    TServerStateChangePB* state_change = req.mutable_change();
+    state_change->set_uuid(uuid);
+    state_change->set_change(change);
+    rpc::RpcController rpc;
+    return cluster_->master_proxy()->ChangeTServerState(req, &resp, &rpc);
+  }
+
+  // Checks whether tablet copies have started, per 'should_have_started'.
+  void ExpectStartedTabletCopies(bool should_have_started) {
+    bool has_started = false;
+    for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
+      ExternalTabletServer* tserver = cluster_->tablet_server(i);
+      if (tserver->IsShutdown()) {
+        continue;
+      }
+      int64_t copies_started = 0;
+      ASSERT_OK(GetInt64Metric(tserver->bound_http_hostport(), &METRIC_ENTITY_server,
+          /*entity_id=*/nullptr,
+          &METRIC_handler_latency_kudu_tserver_TabletCopyService_BeginTabletCopySession,
+          "total_count", &copies_started));
+      if (copies_started > 0) {
+        has_started = true;
+        break;
+      }
+    }
+    ASSERT_EQ(should_have_started, has_started);
+  }
+
+  // Sets 'num_replicas_failed' to the number of failed replicas in the
+  // cluster, according to the tablet leaders.
+  Status GetNumFailedReplicas(const unordered_map<string, TServerDetails*>& ts_map,
+                              int* num_replicas_failed) {
+    int num_failed = 0;
+    for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
+      ExternalTabletServer* tserver = cluster_->tablet_server(i);
+      if (tserver->IsShutdown()) {
+        continue;
+      }
+      const string& uuid = tserver->uuid();
+      const TServerDetails* ts_details = FindOrDie(ts_map, uuid);
+      vector<string> tablet_ids;
+      RETURN_NOT_OK(ListRunningTabletIds(ts_details, MonoDelta::FromSeconds(30), &tablet_ids));
+      for (const auto& tablet_id : tablet_ids) {
+        ConsensusStatePB consensus_state;
+        RETURN_NOT_OK(GetConsensusState(ts_details, tablet_id, MonoDelta::FromSeconds(30),
+            IncludeHealthReport::INCLUDE_HEALTH_REPORT, &consensus_state));
+        // Only consider the health states reported by the leaders.
+        if (consensus_state.leader_uuid() != uuid) {
+          continue;
+        }
+        // Go through all the peers and tally up any that are failed.
+        const auto& committed_config = consensus_state.committed_config();
+        for (int p = 0; p < committed_config.peers_size(); p++) {
+          const auto& peer = committed_config.peers(p);
+          if (peer.has_health_report() &&
+              peer.health_report().overall_health() == HealthReportPB::FAILED) {
+            num_failed++;
+          }
+        }
+      }
+    }
+    *num_replicas_failed = num_failed;
+    return Status::OK();
+  }
+
+  void AssertEventuallyNumFailedReplicas(const unordered_map<string, TServerDetails*>& ts_map,
+                                         int expected_failed) {
+    ASSERT_EVENTUALLY([&] {
+      int num_failed;
+      ASSERT_OK(GetNumFailedReplicas(ts_map, &num_failed));
+      ASSERT_EQ(expected_failed, num_failed);
+    });
+  }
+
+  void GenerateTServerMap(MapAndDeleter* map_and_deleter) {
+    unordered_map<string, TServerDetails*> ts_map;
+    ASSERT_EVENTUALLY([&] {
+      ASSERT_OK(CreateTabletServerMap(m_proxy_, cluster_->messenger(), &ts_map));
+      auto cleanup = MakeScopedCleanup([&] {
+        STLDeleteValues(&ts_map);
+      });
+      ASSERT_EQ(cluster_->num_tablet_servers(), ts_map.size());
+      cleanup.cancel();
+    });
+    map_and_deleter->first = std::move(ts_map);
+    map_and_deleter->second.reset(new ValueDeleter(&map_and_deleter->first));
+  }
+
+ protected:
+  shared_ptr<MasterServiceProxy> m_proxy_;
+};
+
+class MaintenanceModeRF3ITest : public MaintenanceModeITest {
+ public:
+  void SetUp() override {
+    SKIP_IF_SLOW_NOT_ALLOWED();
+    NO_FATALS(MaintenanceModeITest::SetUp());
+    NO_FATALS(SetUpCluster(3));
+  }
+  void TearDown() override {
+    SKIP_IF_SLOW_NOT_ALLOWED();
+    NO_FATALS(MaintenanceModeITest::TearDown());
+  }
+};
+
+// Test that placing a tablet server in maintenance mode leads to the failed
+// replicas on that server not being re-replicated.
+TEST_F(MaintenanceModeRF3ITest, TestFailedTServerInMaintenanceModeDoesntRereplicate) {
+  // This test will sleep a bit to ensure the master has had some time to
+  // receive heartbeats.
+  SKIP_IF_SLOW_NOT_ALLOWED();
+  const int kNumTablets = 6;
+
+  // Create the table with three tablet servers and then add one so we're
+  // guaranteed that the replicas are all on the first three servers.
+  TestWorkload create_table(cluster_.get());
+  create_table.set_num_tablets(kNumTablets);
+  create_table.Setup();
+  create_table.Start();
+  // Add a server so there's one we could move to after bringing down a
+  // tserver.
+  ASSERT_OK(cluster_->AddTabletServer());
+  MapAndDeleter ts_map_and_deleter;
+  NO_FATALS(GenerateTServerMap(&ts_map_and_deleter));
+  const auto& ts_map = ts_map_and_deleter.first;
+
+  // Do a sanity check that all our replicas are healthy.
+  NO_FATALS(AssertEventuallyNumFailedReplicas(ts_map, 0));
+
+  // Put one of the servers in maintenance mode.
+  ExternalTabletServer* maintenance_ts = cluster_->tablet_server(0);
+  const string maintenance_uuid = maintenance_ts->uuid();
+  ASSERT_OK(ChangeTServerState(maintenance_uuid, TServerStateChangePB::ENTER_MAINTENANCE_MODE));
+
+  // Bringing the tablet server down shouldn't lead to re-replication.
+  //
+  // Note: it's possible to set up re-replication scenarios in other ways (e.g.
+  // by hitting an IO error, or falling too far behind when replicating); these
+  // should all be treated the same way by virtue of them all using the same
+  // health reporting.
+  NO_FATALS(maintenance_ts->Shutdown());
+
+  // Now wait a bit for this failure to make its way to the master. The failure
+  // shouldn't have led to any re-replication.
+  SleepFor(kDurationForSomeHeartbeats);
+  NO_FATALS(AssertEventuallyNumFailedReplicas(ts_map, kNumTablets));
+  NO_FATALS(ExpectStartedTabletCopies(/*should_have_started*/false));
+
+  // Restarting the masters shouldn't lead to re-replication either, even
+  // though the tablet server is still down.
+  NO_FATALS(cluster_->ShutdownNodes(cluster::ClusterNodes::MASTERS_ONLY));
+  ASSERT_OK(cluster_->master()->Restart());
+  SleepFor(kDurationForSomeHeartbeats);
+  NO_FATALS(ExpectStartedTabletCopies(/*should_have_started*/false));
+
+  // Now bring the server back up and wait for it to become healthy. It should
+  // be able to do this without tablet copies.
+  ASSERT_OK(maintenance_ts->Restart());
+  NO_FATALS(AssertEventuallyNumFailedReplicas(ts_map, 0));
+
+  // Since our server is healthy, leaving maintenance mode shouldn't trigger
+  // any re-replication either.
+  ASSERT_OK(ChangeTServerState(maintenance_uuid, TServerStateChangePB::EXIT_MAINTENANCE_MODE));
+  SleepFor(kDurationForSomeHeartbeats);
+  NO_FATALS(ExpectStartedTabletCopies(/*should_have_started*/false));
+
+  // All the while, our workload should not have been interrupted. Assert
+  // eventually to wait for the rows to converge.
+  NO_FATALS(create_table.StopAndJoin());
+  ClusterVerifier v(cluster_.get());
+  ASSERT_EVENTUALLY([&] {
+    NO_FATALS(v.CheckRowCount(create_table.table_name(),
+                              ClusterVerifier::EXACTLY, create_table.rows_inserted()));
+  });
+}
+
+TEST_F(MaintenanceModeRF3ITest, TestMaintenanceModeDoesntObstructMove) {
+  SKIP_IF_SLOW_NOT_ALLOWED();
+  TestWorkload create_table(cluster_.get());
+  create_table.set_num_tablets(1);
+  create_table.Setup();
+
+  // Add a tablet server.
+  ASSERT_OK(cluster_->AddTabletServer());
+  const string& added_uuid = cluster_->tablet_server(3)->uuid();
+  MapAndDeleter ts_map_and_deleter;
+  NO_FATALS(GenerateTServerMap(&ts_map_and_deleter));
+  const auto& ts_map = ts_map_and_deleter.first;
+
+  // Put a tablet server into maintenance mode.
+  const string maintenance_uuid = cluster_->tablet_server(0)->uuid();
+  const TServerDetails* maintenance_details = FindOrDie(ts_map, maintenance_uuid);
+  vector<string> mnt_tablet_ids;
+  ASSERT_OK(ListRunningTabletIds(maintenance_details, MonoDelta::FromSeconds(30), &mnt_tablet_ids));
+  ASSERT_EQ(1, mnt_tablet_ids.size());
+
+  ASSERT_OK(ChangeTServerState(maintenance_uuid, TServerStateChangePB::ENTER_MAINTENANCE_MODE));
+
+  // While the maintenance mode tserver is still online, move a tablet from it.
+  // This should succeed, because maintenance mode will not obstruct manual
+  // movement of replicas.
+  {
+    vector<string> move_cmd = {
+      "tablet",
+      "change_config",
+      "move_replica",
+      cluster_->master()->bound_rpc_addr().ToString(),
+      mnt_tablet_ids[0],
+      maintenance_uuid,
+      added_uuid,
+    };
+    string stdout, stderr;
+    ASSERT_OK(RunKuduTool(move_cmd, &stdout, &stderr));
+  }
+  const TServerDetails* added_details = FindOrDie(ts_map, added_uuid);
+  ASSERT_EVENTUALLY([&] {
+    vector<string> added_tablet_ids;
+    ASSERT_OK(ListRunningTabletIds(added_details, MonoDelta::FromSeconds(30), &added_tablet_ids));
+    ASSERT_EQ(1, added_tablet_ids.size());
+  });
+}
+
+// Test that replicas in health state FAILED_UNRECOVERABLE (e.g. due to a
+// disk error, or to lagging too far behind) are still re-replicated during
+// maintenance mode.
+TEST_F(MaintenanceModeRF3ITest, TestMaintenanceModeDoesntObstructFailedUnrecoverable) {
+  SKIP_IF_SLOW_NOT_ALLOWED();
+  TestWorkload create_table(cluster_.get());
+  create_table.set_num_tablets(1);
+  create_table.Setup();
+  create_table.Start();
+
+  // Add a tablet server.
+  ASSERT_OK(cluster_->AddTabletServer());
+  const string& added_uuid = cluster_->tablet_server(3)->uuid();
+  MapAndDeleter ts_map_and_deleter;
+  NO_FATALS(GenerateTServerMap(&ts_map_and_deleter));
+  const auto& ts_map = ts_map_and_deleter.first;
+
+  // Put a tablet server into maintenance mode.
+  ExternalDaemon* maintenance_ts = cluster_->tablet_server(0);
+  const string maintenance_uuid = maintenance_ts->uuid();
+  const TServerDetails* maintenance_details = FindOrDie(ts_map, maintenance_uuid);
+  vector<string> mnt_tablet_ids;
+  ASSERT_OK(ListRunningTabletIds(maintenance_details, MonoDelta::FromSeconds(30), &mnt_tablet_ids));
+  ASSERT_OK(ChangeTServerState(maintenance_uuid, TServerStateChangePB::ENTER_MAINTENANCE_MODE));
+
+  // Now fail the tablet on the server in maintenance mode by injecting a disk
+  // error. Also speed up flushes so we actually hit an IO error.
+  ASSERT_OK(cluster_->SetFlag(maintenance_ts, "flush_threshold_secs", "1"));
+  ASSERT_OK(cluster_->SetFlag(maintenance_ts, "env_inject_eio_globs",
+      JoinPathSegments(maintenance_ts->data_dirs()[0], "**")));
+  ASSERT_OK(cluster_->SetFlag(maintenance_ts, "env_inject_eio", "1"));
+
+  // Eventually the disk failure will be noted and a copy will be made at the
+  // added server.
+  const TServerDetails* added_details = FindOrDie(ts_map, added_uuid);
+  ASSERT_EVENTUALLY([&] {
+    vector<string> added_tablet_ids;
+    ASSERT_OK(ListRunningTabletIds(added_details, MonoDelta::FromSeconds(30), &added_tablet_ids));
+    ASSERT_EQ(1, added_tablet_ids.size());
+  });
+  NO_FATALS(create_table.StopAndJoin());
+  ClusterVerifier v(cluster_.get());
+  ASSERT_EVENTUALLY([&] {
+    NO_FATALS(v.CheckRowCount(create_table.table_name(),
+                              ClusterVerifier::EXACTLY, create_table.rows_inserted()));
+  });
+}
+
+class MaintenanceModeRF5ITest : public MaintenanceModeITest {
+ public:
+  void SetUp() override {
+    SKIP_IF_SLOW_NOT_ALLOWED();
+    NO_FATALS(MaintenanceModeITest::SetUp());
+    NO_FATALS(SetUpCluster(5));
+  }
+  void TearDown() override {
+    SKIP_IF_SLOW_NOT_ALLOWED();
+    NO_FATALS(MaintenanceModeITest::TearDown());
+  }
+};
+
+// Test that a table with RF=5 will still be available through the failure of
+// two nodes if one is put in maintenance mode.
+TEST_F(MaintenanceModeRF5ITest, TestBackgroundFailureDuringMaintenanceMode) {
+  SKIP_IF_SLOW_NOT_ALLOWED();
+  // Create some tables with RF=5.
+  const int kNumTablets = 3;
+  TestWorkload create_table(cluster_.get());
+  create_table.set_num_tablets(kNumTablets);
+  create_table.set_num_replicas(5);
+  create_table.Setup();
+  create_table.Start();
+
+  // Add a server so we have one empty server to replicate to.
+  ASSERT_OK(cluster_->AddTabletServer());
+  MapAndDeleter ts_map_and_deleter;
+  NO_FATALS(GenerateTServerMap(&ts_map_and_deleter));
+  const auto& ts_map = ts_map_and_deleter.first;
+
+  // Do a sanity check that all our replicas are healthy.
+  NO_FATALS(AssertEventuallyNumFailedReplicas(ts_map, 0));
+
+  // Enter maintenance mode on a tserver and shut it down.
+  ExternalTabletServer* maintenance_ts = cluster_->tablet_server(0);
+  const string maintenance_uuid = maintenance_ts->uuid();
+  ASSERT_OK(ChangeTServerState(maintenance_uuid, TServerStateChangePB::ENTER_MAINTENANCE_MODE));
+  NO_FATALS(maintenance_ts->Shutdown());
+  SleepFor(kDurationForSomeHeartbeats);
+
+  // Wait for the failure to be recognized by the other replicas.
+  NO_FATALS(AssertEventuallyNumFailedReplicas(ts_map, kNumTablets));
+  NO_FATALS(ExpectStartedTabletCopies(/*should_have_started*/false));
+
+  // Now kill another server. We should be able to see some copies.
+  NO_FATALS(cluster_->tablet_server(1)->Shutdown());
+  ASSERT_EVENTUALLY([&] {
+    NO_FATALS(ExpectStartedTabletCopies(/*should_have_started*/true));
+  });
+  NO_FATALS(AssertEventuallyNumFailedReplicas(ts_map, 0));
+  // The previously empty tablet server should hold all the replicas that were
+  // re-replicated.
+  const TServerDetails* added_details = FindOrDie(ts_map, cluster_->tablet_server(5)->uuid());
+  ASSERT_EVENTUALLY([&] {
+    vector<string> added_tablet_ids;
+    ASSERT_OK(ListRunningTabletIds(added_details, MonoDelta::FromSeconds(30), &added_tablet_ids));
+    ASSERT_EQ(kNumTablets, added_tablet_ids.size());
+  });
+  // Now exit maintenance mode and restart the maintenance tserver. The
+  // original tablets on the maintenance mode tserver should still exist.
+  ASSERT_OK(ChangeTServerState(maintenance_uuid, TServerStateChangePB::EXIT_MAINTENANCE_MODE));
+  ASSERT_OK(maintenance_ts->Restart());
+  SleepFor(kDurationForSomeHeartbeats);
+  const TServerDetails* maintenance_details = FindOrDie(ts_map, maintenance_uuid);
+  vector<string> mnt_tablet_ids;
+  ASSERT_OK(ListRunningTabletIds(maintenance_details, MonoDelta::FromSeconds(30), &mnt_tablet_ids));
+  ASSERT_EQ(kNumTablets, mnt_tablet_ids.size());
+
+  // All the while, our workload should not have been interrupted. Assert
+  // eventually to wait for the rows to converge.
+  NO_FATALS(create_table.StopAndJoin());
+  ClusterVerifier v(cluster_.get());
+  ASSERT_EVENTUALLY([&] {
+    NO_FATALS(v.CheckRowCount(create_table.table_name(),
+                              ClusterVerifier::EXACTLY, create_table.rows_inserted()));
+  });
+}
+
+} // namespace itest
+} // namespace kudu
diff --git a/src/kudu/master/catalog_manager.cc b/src/kudu/master/catalog_manager.cc
index 83bea9e..ab1c7ad 100644
--- a/src/kudu/master/catalog_manager.cc
+++ b/src/kudu/master/catalog_manager.cc
@@ -3989,6 +3989,8 @@ Status CatalogManager::ProcessTabletReport(
   // 3. Process each tablet. This may not be in the order that the tablets
   // appear in 'full_report', but that has no bearing on correctness.
   vector<scoped_refptr<TabletInfo>> mutated_tablets;
+  unordered_set<string> uuids_ignored_for_underreplication =
+      master_->ts_manager()->GetUuidsToIgnoreForUnderreplication();
   for (const auto& e : tablet_infos) {
     const string& tablet_id = e.first;
     const scoped_refptr<TabletInfo>& tablet = e.second;
@@ -4199,7 +4201,8 @@ Status CatalogManager::ProcessTabletReport(
           rpcs.emplace_back(new AsyncEvictReplicaTask(
               master_, tablet, cstate, std::move(to_evict)));
         } else if (FLAGS_master_add_server_when_underreplicated &&
-                   ShouldAddReplica(config, replication_factor)) {
+                   ShouldAddReplica(config, replication_factor,
+                                    uuids_ignored_for_underreplication)) {
           rpcs.emplace_back(new AsyncAddReplicaTask(
               master_, tablet, cstate, RaftPeerPB::NON_VOTER, &rng_));
         }
diff --git a/src/kudu/master/ts_manager.cc b/src/kudu/master/ts_manager.cc
index 7106ed3..79d3088 100644
--- a/src/kudu/master/ts_manager.cc
+++ b/src/kudu/master/ts_manager.cc
@@ -20,6 +20,7 @@
 #include <algorithm>
 #include <limits>
 #include <mutex>
+#include <utility>
 
 #include <boost/optional/optional.hpp>
 #include <gflags/gflags.h>
@@ -57,6 +58,7 @@ METRIC_DEFINE_gauge_int32(server, cluster_replica_skew,
 
 using kudu::pb_util::SecureShortDebugString;
 using std::lock_guard;
+using std::unordered_set;
 using std::shared_ptr;
 using std::string;
 using strings::Substitute;
@@ -202,10 +204,22 @@ int TSManager::GetCount() const {
   return servers_by_id_.size();
 }
 
+unordered_set<string> TSManager::GetUuidsToIgnoreForUnderreplication() const {
+  unordered_set<string> uuids;
+  shared_lock<RWMutex> tsl(ts_state_lock_);
+  uuids.reserve(ts_state_by_uuid_.size());
+  for (const auto& ts_and_state : ts_state_by_uuid_) {
+    if (ts_and_state.second == TServerStatePB::MAINTENANCE_MODE) {
+      uuids.emplace(ts_and_state.first);
+    }
+  }
+  return uuids;
+}
+
 void TSManager::GetDescriptorsAvailableForPlacement(TSDescriptorVector* descs) const {
   descs->clear();
-  shared_lock<rw_spinlock> l(lock_);
   shared_lock<RWMutex> tsl(ts_state_lock_);
+  shared_lock<rw_spinlock> l(lock_);
   descs->reserve(servers_by_id_.size());
   for (const TSDescriptorMap::value_type& entry : servers_by_id_) {
     const shared_ptr<TSDescriptor>& ts = entry.second;
@@ -272,7 +286,7 @@ int TSManager::ClusterSkew() const {
 }
 
 bool TSManager::AvailableForPlacementUnlocked(const TSDescriptor& ts) const {
-  DCHECK(lock_.is_locked());
+  ts_state_lock_.AssertAcquired();
   // TODO(KUDU-1827): this should also be used when decommissioning a server.
   if (GetTServerStateUnlocked(ts.permanent_uuid()) == TServerStatePB::MAINTENANCE_MODE) {
     return false;
diff --git a/src/kudu/master/ts_manager.h b/src/kudu/master/ts_manager.h
index 7d3603f..d1cc925 100644
--- a/src/kudu/master/ts_manager.h
+++ b/src/kudu/master/ts_manager.h
@@ -19,6 +19,7 @@
 #include <memory>
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 
 #include "kudu/gutil/macros.h"
 #include "kudu/gutil/ref_counted.h"
@@ -90,6 +91,11 @@ class TSManager {
   // replication to them (e.g. maintenance mode).
   void GetDescriptorsAvailableForPlacement(TSDescriptorVector* descs) const;
 
+  // Return the UUIDs of any tablet servers that can be in a failed state
+  // without counting towards under-replication (e.g. because they're in
+  // maintenance mode).
+  std::unordered_set<std::string> GetUuidsToIgnoreForUnderreplication() const;
+
   // Get the TS count.
   int GetCount() const;
 
@@ -129,7 +135,10 @@ class TSManager {
       std::string, std::shared_ptr<TSDescriptor>> TSDescriptorMap;
   TSDescriptorMap servers_by_id_;
 
+  // Protects 'ts_state_by_uuid_'. If both 'ts_state_lock_' and 'lock_' are to
+  // be taken, 'ts_state_lock_' must be taken first.
   mutable RWMutex ts_state_lock_;
+
   // Maps from the UUIDs of tablet servers to their tserver state, if any.
   // Note: the states don't necessarily belong to registered tablet servers.
   std::unordered_map<std::string, TServerStatePB> ts_state_by_uuid_;
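
A note on the locking changes above: GetDescriptorsAvailableForPlacement()
now acquires 'ts_state_lock_' before 'lock_', matching the ordering rule
documented in the new header comment. A minimal sketch of the pattern,
assuming the lock types as they appear in this diff (the enclosing method
is hypothetical):

  // In any TSManager method that needs both maps, take 'ts_state_lock_'
  // (guarding ts_state_by_uuid_) strictly before 'lock_' (guarding
  // servers_by_id_) to avoid lock-order-inversion deadlocks.
  shared_lock<RWMutex> tsl(ts_state_lock_);
  shared_lock<rw_spinlock> l(lock_);
  // ... read ts_state_by_uuid_ and servers_by_id_ under both locks ...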


[kudu] 04/04: table_locations-itest: restrict benchmark to slow mode

Posted by ad...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

adar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 2e1c6794662ad8f7cc718ebff5361c570e96102a
Author: Adar Dembo <ad...@cloudera.com>
AuthorDate: Mon Sep 30 20:59:32 2019 -0700

    table_locations-itest: restrict benchmark to slow mode
    
    The CreateTable RPC can time out in TSAN mode, and perhaps in ASAN as well.
    So let's just restrict it to slow mode, which is what we usually do in tests
    that create a replicated table with many tablets.
    
    Change-Id: I867b629be2f51838c453b078b5617042fc8468aa
    Reviewed-on: http://gerrit.cloudera.org:8080/14328
    Reviewed-by: Andrew Wong <aw...@cloudera.com>
    Reviewed-by: Alexey Serbin <as...@cloudera.com>
    Tested-by: Adar Dembo <ad...@cloudera.com>
---
 src/kudu/integration-tests/table_locations-itest.cc | 1 +
 1 file changed, 1 insertion(+)
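
For reference, the SKIP_IF_SLOW_NOT_ALLOWED() macro added below makes the
benchmark skip itself unless slow tests are enabled, which Kudu's test
utilities conventionally key off the KUDU_ALLOW_SLOW_TESTS=1 environment
variable (mentioned here as background; that mechanism isn't part of this
diff).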

diff --git a/src/kudu/integration-tests/table_locations-itest.cc b/src/kudu/integration-tests/table_locations-itest.cc
index e3d2cef..63adf1b 100644
--- a/src/kudu/integration-tests/table_locations-itest.cc
+++ b/src/kudu/integration-tests/table_locations-itest.cc
@@ -389,6 +389,7 @@ TEST_F(TableLocationsWithTSLocationTest, TestGetTSLocation) {
 }
 
 TEST_F(TableLocationsTest, GetTableLocationsBenchmark) {
+  SKIP_IF_SLOW_NOT_ALLOWED();
   const int kNumSplits = FLAGS_benchmark_num_tablets - 1;
   const int kNumThreads = FLAGS_benchmark_num_threads;
   const auto kRuntime = MonoDelta::FromSeconds(FLAGS_benchmark_runtime_secs);


[kudu] 03/04: trace-test: fix kernel stack watchdog data race

Posted by ad...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

adar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 8ef7e18f3528000f656cb9a467a6787e6ed080c9
Author: Adar Dembo <ad...@cloudera.com>
AuthorDate: Wed Sep 18 11:46:04 2019 -0700

    trace-test: fix kernel stack watchdog data race
    
    The KernelStackWatchdog thread runs independently of the test thread, and
    by calling IsBeingDebugged, it winds up creating a trace event of its own.
    This is problematic given that trace-test sets up event callbacks to write
    to test fixture members, which go out of scope in between tests.
    
    The only solution I could find was to avoid starting the KernelStackWatchdog
    in trace-test by passing Thread::NO_STACK_WATCHDOG into thread creation. I
    also had to do this when creating the trace sampling thread, but given
    that's not on by default, I don't think it's so bad that we lose watchdog
    monitoring for it.
    
    To test, I ran trace-test under gdb and set a breakpoint in
    KernelStackWatchdog::RunThread. With the fix, gdb no longer hit that
    breakpoint.
    
    WARNING: ThreadSanitizer: data race (pid=4111)
      Read of size 8 at 0x0000015ba5c8 by thread T2:
        #0 kudu::TraceEventCallbackTest::Callback(long, char, unsigned char const*, char const*, unsigned long, int, char const* const*, unsigned char const*, unsigned long const*, unsigned char) /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/trace-test.cc:463:5 (trace-test+0x4f107f)
        #1 kudu::debug::TraceLog::AddTraceEventWithThreadIdAndTimestamp(char, unsigned char const*, char const*, unsigned long, int, long const&, int, char const**, unsigned char const*, unsigned long const*, scoped_refptr<kudu::debug::ConvertableToTraceFormat> const*, unsigned char) /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/debug/trace_event_impl.cc:1911:7 (libkudu_util.so+0x1208b3)
        #2 kudu::debug::TraceEventHandle trace_event_internal::AddTraceEventWithThreadIdAndTimestamp<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(char, unsigned char const*, char const*, unsigned long, int, long const&, unsigned char, char const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/debug/trace_event.h:1314:10 (libkudu_util.so+0x146f58)
        #3 kudu::debug::TraceEventHandle trace_event_internal::AddTraceEvent<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(char, unsigned char const*, char const*, unsigned long, unsigned char, char const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/debug/trace_event.h:1330:10 (libkudu_util.so+0x146bef)
        #4 kudu::(anonymous namespace)::PosixEnv::NewSequentialFile(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::unique_ptr<kudu::SequentialFile, std::__1::default_delete<kudu::SequentialFile> >*) /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/env_posix.cc:1077:5 (libkudu_util.so+0x140905)
        #5 kudu::ReadFileToString(kudu::Env*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, kudu::faststring*) /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/env.cc:73:19 (libkudu_util.so+0x140054)
        #6 kudu::IsBeingDebugged() /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/os-util.cc:154:14 (libkudu_util.so+0x1c9687)
        #7 kudu::KernelStackWatchdog::RunThread() /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/kernel_stack_watchdog.cc:141:9 (libkudu_util.so+0x17de59)
        #8 boost::_mfi::mf0<void, kudu::KernelStackWatchdog>::operator()(kudu::KernelStackWatchdog*) const /home/jenkins-slave/workspace/kudu-master/2/thirdparty/installed/tsan/include/boost/bind/mem_fn_template.hpp:49:29 (libkudu_util.so+0x17fd89)
        #9 void boost::_bi::list1<boost::_bi::value<kudu::KernelStackWatchdog*> >::operator()<boost::_mfi::mf0<void, kudu::KernelStackWatchdog>, boost::_bi::list0>(boost::_bi::type<void>, boost::_mfi::mf0<void, kudu::KernelStackWatchdog>&, boost::_bi::list0&, int) /home/jenkins-slave/workspace/kudu-master/2/thirdparty/installed/tsan/include/boost/bind/bind.hpp:259:9 (libkudu_util.so+0x17fcda)
        #10 boost::_bi::bind_t<void, boost::_mfi::mf0<void, kudu::KernelStackWatchdog>, boost::_bi::list1<boost::_bi::value<kudu::KernelStackWatchdog*> > >::operator()() /home/jenkins-slave/workspace/kudu-master/2/thirdparty/installed/tsan/include/boost/bind/bind.hpp:1222:16 (libkudu_util.so+0x17fc63)
        #11 boost::detail::function::void_function_obj_invoker0<boost::_bi::bind_t<void, boost::_mfi::mf0<void, kudu::KernelStackWatchdog>, boost::_bi::list1<boost::_bi::value<kudu::KernelStackWatchdog*> > >, void>::invoke(boost::detail::function::function_buffer&) /home/jenkins-slave/workspace/kudu-master/2/thirdparty/installed/tsan/include/boost/function/function_template.hpp:159:11 (libkudu_util.so+0x17fa59)
        #12 boost::function0<void>::operator()() const /home/jenkins-slave/workspace/kudu-master/2/thirdparty/installed/tsan/include/boost/function/function_template.hpp:770:14 (libkudu_util.so+0x1f1dd1)
        #13 kudu::Thread::SuperviseThread(void*) /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/thread.cc:657:3 (libkudu_util.so+0x1ef3f4)
    
      Previous write of size 8 at 0x0000015ba5c8 by main thread:
        #0 kudu::TraceEventCallbackTest::SetUp() /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/trace-test.cc:340:16 (trace-test+0x4f3a17)
        #1 void testing::internal::HandleSehExceptionsInMethodIfSupported<testing::Test, void>(testing::Test*, void (testing::Test::*)(), char const*) /home/jenkins-slave/workspace/kudu-master/2/thirdparty/src/googletest-release-1.8.0/googletest/src/gtest.cc:2402:10 (libgmock.so+0x552ef)
        #2 void testing::internal::HandleExceptionsInMethodIfSupported<testing::Test, void>(testing::Test*, void (testing::Test::*)(), char const*) /home/jenkins-slave/workspace/kudu-master/2/thirdparty/src/googletest-release-1.8.0/googletest/src/gtest.cc:2438 (libgmock.so+0x552ef)
        #3 testing::Test::Run() /home/jenkins-slave/workspace/kudu-master/2/thirdparty/src/googletest-release-1.8.0/googletest/src/gtest.cc:2470:3 (libgmock.so+0x343c1)
        #4 testing::TestInfo::Run() /home/jenkins-slave/workspace/kudu-master/2/thirdparty/src/googletest-release-1.8.0/googletest/src/gtest.cc:2656:11 (libgmock.so+0x3574c)
        #5 testing::TestCase::Run() /home/jenkins-slave/workspace/kudu-master/2/thirdparty/src/googletest-release-1.8.0/googletest/src/gtest.cc:2774:28 (libgmock.so+0x36226)
        #6 testing::internal::UnitTestImpl::RunAllTests() /home/jenkins-slave/workspace/kudu-master/2/thirdparty/src/googletest-release-1.8.0/googletest/src/gtest.cc:4649:43 (libgmock.so+0x425fa)
        #7 bool testing::internal::HandleSehExceptionsInMethodIfSupported<testing::internal::UnitTestImpl, bool>(testing::internal::UnitTestImpl*, bool (testing::internal::UnitTestImpl::*)(), char const*) /home/jenkins-slave/workspace/kudu-master/2/thirdparty/src/googletest-release-1.8.0/googletest/src/gtest.cc:2402:10 (libgmock.so+0x5625f)
        #8 bool testing::internal::HandleExceptionsInMethodIfSupported<testing::internal::UnitTestImpl, bool>(testing::internal::UnitTestImpl*, bool (testing::internal::UnitTestImpl::*)(), char const*) /home/jenkins-slave/workspace/kudu-master/2/thirdparty/src/googletest-release-1.8.0/googletest/src/gtest.cc:2438 (libgmock.so+0x5625f)
        #9 testing::UnitTest::Run() /home/jenkins-slave/workspace/kudu-master/2/thirdparty/src/googletest-release-1.8.0/googletest/src/gtest.cc:4257:10 (libgmock.so+0x41ee2)
        #10 RUN_ALL_TESTS() /home/jenkins-slave/workspace/kudu-master/2/thirdparty/installed/tsan/include/gtest/gtest.h:2233:46 (libkudu_test_main.so+0x351b)
        #11 main /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/test_main.cc:106:13 (libkudu_test_main.so+0x2cc6)
    
      Location is global 'kudu::TraceEventCallbackTest::s_instance' of size 8 at 0x0000015ba5c8 (trace-test+0x0000015ba5c8)
    
      Thread T2 'kernel-watcher-' (tid=4116, running) created by main thread at:
        #0 pthread_create /home/jenkins-slave/workspace/kudu-master/2/thirdparty/src/llvm-6.0.0.src/projects/compiler-rt/lib/tsan/rtl/tsan_interceptors.cc:992 (trace-test+0x453c86)
        #1 kudu::Thread::StartThread(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, boost::function<void ()> const&, unsigned long, scoped_refptr<kudu::Thread>*) /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/thread.cc:601:15 (libkudu_util.so+0x1eebdb)
        #2 kudu::Status kudu::Thread::CreateWithFlags<boost::_bi::bind_t<void, boost::_mfi::mf0<void, kudu::KernelStackWatchdog>, boost::_bi::list1<boost::_bi::value<kudu::KernelStackWatchdog*> > > >(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, boost::_bi::bind_t<void, boost::_mfi::mf0<void, kudu::KernelStackWatchdog>, boost::_bi::list1<boost::_bi: [...]
        #3 kudu::KernelStackWatchdog::KernelStackWatchdog() /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/kernel_stack_watchdog.cc:71:3 (libkudu_util.so+0x17dc36)
        #4 Singleton<kudu::KernelStackWatchdog>::CreateInstance() /home/jenkins-slave/workspace/kudu-master/2/src/kudu/gutil/singleton.h:124:18 (libkudu_util.so+0x17f664)
        #5 Singleton<kudu::KernelStackWatchdog>::Init() /home/jenkins-slave/workspace/kudu-master/2/src/kudu/gutil/singleton.h:117:17 (libkudu_util.so+0x17f604)
        #6 GoogleOnceInternalInit(int*, void (*)(), void (*)(void*), void*) /home/jenkins-slave/workspace/kudu-master/2/src/kudu/gutil/once.cc:43:7 (libgutil.so+0x2d7b3)
        #7 GoogleOnceInit(GoogleOnceType*, void (*)()) /home/jenkins-slave/workspace/kudu-master/2/src/kudu/gutil/once.h:53:5 (libkudu_util.so+0x113e4d)
        #8 Singleton<kudu::KernelStackWatchdog>::get() /home/jenkins-slave/workspace/kudu-master/2/src/kudu/gutil/singleton.h:79:5 (libkudu_util.so+0x17f5b1)
        #9 kudu::KernelStackWatchdog::GetInstance() /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/kernel_stack_watchdog.h:87:12 (libkudu_util.so+0x17f423)
        #10 kudu::KernelStackWatchdog::CreateAndRegisterTLS() /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/kernel_stack_watchdog.cc:219:3 (libkudu_util.so+0x17ed17)
        #11 kudu::KernelStackWatchdog::GetTLS() /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/kernel_stack_watchdog.h:170:7 (libkudu_util.so+0x1f2901)
        #12 kudu::ScopedWatchKernelStack::ScopedWatchKernelStack(char const*, int) /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/kernel_stack_watchdog.h:248:13 (libkudu_util.so+0x1f1b70)
        #13 kudu::Thread::StartThread(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, boost::function<void ()> const&, unsigned long, scoped_refptr<kudu::Thread>*) /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/thread.cc:600:5 (libkudu_util.so+0x1eebaf)
        #14 kudu::Status kudu::Thread::Create<void (*)(int, int), int, int>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, void (* const&)(int, int), int const&, int const&, scoped_refptr<kudu::Thread>*) /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/thread.h:170:12 (trace-test+0x4f03ef)
        #15 kudu::TraceTest_TestChromeTracing_Test::TestBody() /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/trace-test.cc:172:5 (trace-test+0x4e750b)
        #16 void testing::internal::HandleSehExceptionsInMethodIfSupported<testing::Test, void>(testing::Test*, void (testing::Test::*)(), char const*) /home/jenkins-slave/workspace/kudu-master/2/thirdparty/src/googletest-release-1.8.0/googletest/src/gtest.cc:2402:10 (libgmock.so+0x552ef)
        #17 void testing::internal::HandleExceptionsInMethodIfSupported<testing::Test, void>(testing::Test*, void (testing::Test::*)(), char const*) /home/jenkins-slave/workspace/kudu-master/2/thirdparty/src/googletest-release-1.8.0/googletest/src/gtest.cc:2438 (libgmock.so+0x552ef)
        #18 testing::Test::Run() /home/jenkins-slave/workspace/kudu-master/2/thirdparty/src/googletest-release-1.8.0/googletest/src/gtest.cc:2474:5 (libgmock.so+0x344b8)
        #19 testing::TestInfo::Run() /home/jenkins-slave/workspace/kudu-master/2/thirdparty/src/googletest-release-1.8.0/googletest/src/gtest.cc:2656:11 (libgmock.so+0x3574c)
        #20 testing::TestCase::Run() /home/jenkins-slave/workspace/kudu-master/2/thirdparty/src/googletest-release-1.8.0/googletest/src/gtest.cc:2774:28 (libgmock.so+0x36226)
        #21 testing::internal::UnitTestImpl::RunAllTests() /home/jenkins-slave/workspace/kudu-master/2/thirdparty/src/googletest-release-1.8.0/googletest/src/gtest.cc:4649:43 (libgmock.so+0x425fa)
        #22 bool testing::internal::HandleSehExceptionsInMethodIfSupported<testing::internal::UnitTestImpl, bool>(testing::internal::UnitTestImpl*, bool (testing::internal::UnitTestImpl::*)(), char const*) /home/jenkins-slave/workspace/kudu-master/2/thirdparty/src/googletest-release-1.8.0/googletest/src/gtest.cc:2402:10 (libgmock.so+0x5625f)
        #23 bool testing::internal::HandleExceptionsInMethodIfSupported<testing::internal::UnitTestImpl, bool>(testing::internal::UnitTestImpl*, bool (testing::internal::UnitTestImpl::*)(), char const*) /home/jenkins-slave/workspace/kudu-master/2/thirdparty/src/googletest-release-1.8.0/googletest/src/gtest.cc:2438 (libgmock.so+0x5625f)
        #24 testing::UnitTest::Run() /home/jenkins-slave/workspace/kudu-master/2/thirdparty/src/googletest-release-1.8.0/googletest/src/gtest.cc:4257:10 (libgmock.so+0x41ee2)
        #25 RUN_ALL_TESTS() /home/jenkins-slave/workspace/kudu-master/2/thirdparty/installed/tsan/include/gtest/gtest.h:2233:46 (libkudu_test_main.so+0x351b)
        #26 main /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/test_main.cc:106:13 (libkudu_test_main.so+0x2cc6)
    
    SUMMARY: ThreadSanitizer: data race /home/jenkins-slave/workspace/kudu-master/2/src/kudu/util/trace-test.cc:463:5 in kudu::TraceEventCallbackTest::Callback(long, char, unsigned char const*, char const*, unsigned long, int, char const* const*, unsigned char const*, unsigned long const*, unsigned char)
    
    Change-Id: I5dc974be22ff101dcb8091be1fe692ab61376bc2
    Reviewed-on: http://gerrit.cloudera.org:8080/14256
    Reviewed-by: Alexey Serbin <as...@cloudera.com>
    Tested-by: Adar Dembo <ad...@cloudera.com>
---
 src/kudu/util/debug/trace_event_impl.cc | 11 ++++++-----
 src/kudu/util/trace-test.cc             | 25 +++++++++++++++++--------
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/src/kudu/util/debug/trace_event_impl.cc b/src/kudu/util/debug/trace_event_impl.cc
index 9b1d919..0f1da6e 100644
--- a/src/kudu/util/debug/trace_event_impl.cc
+++ b/src/kudu/util/debug/trace_event_impl.cc
@@ -12,6 +12,7 @@
 #include <cinttypes>
 #include <cstdlib>
 #include <cstring>
+#include <functional>
 #include <list>
 #include <sstream>
 #include <type_traits>
@@ -36,7 +37,6 @@
 #include "kudu/gutil/strings/util.h"
 #include "kudu/gutil/sysinfo.h"
 #include "kudu/gutil/walltime.h"
-
 #include "kudu/util/atomic.h"
 #include "kudu/util/debug/trace_event.h"
 #include "kudu/util/debug/trace_event_synthetic_delay.h"
@@ -906,6 +906,7 @@ void TraceResultBuffer::Collect(
 //
 ////////////////////////////////////////////////////////////////////////////////
 class TraceBucketData;
+
 typedef Callback<void(TraceBucketData*)> TraceSampleCallback;
 
 class TraceBucketData {
@@ -1357,10 +1358,10 @@ void TraceLog::SetEnabled(const CategoryFilter& category_filter,
           "bucket2",
           Bind(&TraceSamplingThread::DefaultSamplingCallback));
 
-      Status s = Thread::Create("tracing", "sampler",
-                                &TraceSamplingThread::ThreadMain,
-                                sampling_thread_.get(),
-                                &sampling_thread_handle_);
+      Status s = Thread::CreateWithFlags(
+          "tracing", "sampler",
+          std::bind(&TraceSamplingThread::ThreadMain, sampling_thread_.get()),
+          Thread::NO_STACK_WATCHDOG, &sampling_thread_handle_);
       if (!s.ok()) {
         LOG(DFATAL) << "failed to create trace sampling thread: " << s.ToString();
       }
diff --git a/src/kudu/util/trace-test.cc b/src/kudu/util/trace-test.cc
index 9473a54..a1388ee 100644
--- a/src/kudu/util/trace-test.cc
+++ b/src/kudu/util/trace-test.cc
@@ -15,9 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include "kudu/util/trace.h"
+
 #include <cctype>
 #include <cstdint>
 #include <cstring>
+#include <functional>
 #include <map>
 #include <ostream>
 #include <string>
@@ -47,7 +50,6 @@
 #include "kudu/util/test_util.h"
 #include "kudu/util/thread.h"
 #include "kudu/util/trace_metrics.h"
-#include "kudu/util/trace.h"
 
 using kudu::debug::TraceLog;
 using kudu::debug::TraceResultBuffer;
@@ -169,8 +171,10 @@ TEST_F(TraceTest, TestChromeTracing) {
   Stopwatch s;
   s.start();
   for (int i = 0; i < kNumThreads; i++) {
-    CHECK_OK(Thread::Create("test", "gen-traces", &GenerateTraceEvents, i, kEventsPerThread,
-                            &threads[i]));
+    CHECK_OK(Thread::CreateWithFlags(
+        "test", "gen-traces",
+        std::bind(GenerateTraceEvents, i, kEventsPerThread),
+        Thread::NO_STACK_WATCHDOG, &threads[i]));
   }
 
   for (int i = 0; i < kNumThreads; i++) {
@@ -203,8 +207,9 @@ TEST_F(TraceTest, TestTraceFromExitedThread) {
   // Generate 10 trace events in a separate thread.
   int kNumEvents = 10;
   scoped_refptr<Thread> t;
-  CHECK_OK(Thread::Create("test", "gen-traces", &GenerateTraceEvents, 1, kNumEvents,
-                          &t));
+  CHECK_OK(Thread::CreateWithFlags(
+      "test", "gen-traces", std::bind(GenerateTraceEvents, 1, kNumEvents),
+      Thread::NO_STACK_WATCHDOG, &t));
   t->Join();
   tl->SetDisabled();
   string trace_json = TraceResultBuffer::FlushTraceLogToString();
@@ -231,7 +236,9 @@ TEST_F(TraceTest, TestWideSpan) {
                  TraceLog::RECORD_CONTINUOUSLY);
 
   scoped_refptr<Thread> t;
-  CHECK_OK(Thread::Create("test", "gen-traces", &GenerateWideSpan, &t));
+  CHECK_OK(Thread::CreateWithFlags(
+      "test", "gen-traces", &GenerateWideSpan,
+      Thread::NO_STACK_WATCHDOG, &t));
   t->Join();
   tl->SetDisabled();
 
@@ -277,8 +284,10 @@ TEST_F(TraceTest, TestStartAndStopCollection) {
   CountDownLatch latch(1);
   AtomicInt<int64_t> num_events_generated(0);
   scoped_refptr<Thread> t;
-  CHECK_OK(Thread::Create("test", "gen-traces", &GenerateTracesUntilLatch,
-                          &num_events_generated, &latch, &t));
+  CHECK_OK(Thread::CreateWithFlags(
+      "test", "gen-traces",
+      std::bind(GenerateTracesUntilLatch, &num_events_generated, &latch),
+      Thread::NO_STACK_WATCHDOG, &t));
 
   const int num_flushes = AllowSlowTests() ? 50 : 3;
   for (int i = 0; i < num_flushes; i++) {
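
In each hunk above the fix is the same: threads that generate trace
events are now created through Thread::CreateWithFlags() with
Thread::NO_STACK_WATCHDOG, so they skip the kernel-stack-watchdog TLS
registration that the TSAN stacks above implicate in the race. As a
sketch of the general call shape (the worker function and the
category/name strings below are illustrative; the API is used exactly as
it appears in the diff):

    // Hypothetical worker; any void() callable works via std::bind.
    void DoTraceWork(int id, int num_events);

    scoped_refptr<kudu::Thread> t;
    CHECK_OK(kudu::Thread::CreateWithFlags(
        "test", "trace-worker",              // category and thread name
        std::bind(DoTraceWork, 1, 100),      // bound void() callable
        kudu::Thread::NO_STACK_WATCHDOG,     // skip watchdog registration
        &t));
    t->Join();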


[kudu] 02/04: KUDU-2069 p5: recheck tablet health when exiting maintenance mode

Posted by ad...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

adar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 3cd6bd0b6b2eaba1027974eabc1490d90691ce82
Author: Andrew Wong <aw...@apache.org>
AuthorDate: Sat Sep 28 23:15:30 2019 -0700

    KUDU-2069 p5: recheck tablet health when exiting maintenance mode
    
    Previously, when a tserver exited maintenance mode while its replicas
    were unhealthy, there was no mechanism to guarantee that the proper
    re-replication would happen.
    
    Specifically, the following sequence of events was possible:
    1. tablet T has replicas on tservers A, B*, C
    2. A enters maintenance mode
    3. A is shut down
    4. enough time passes for B* to consider A as failed
    5. B* notices the failure of A and reports to the master that replica A
       has failed
    6. the master does nothing to schedule re-replication because A is in
       maintenance mode
    7. A exits maintenance mode, but is not brought back online
    8. B* never hears back from A and never sees a health-state change to
       report to the master, so the master never "rechecks" the health
       of T
    9. T is left under-replicated with only B* and C
    
    Note: strictly, the set of tservers that needs to be rechecked is the
    set that hosted a leader replica of any tablet with a replica on A.
    
    This patch addresses this by requesting a full tablet report from every
    tserver in the cluster upon exiting maintenance mode on any tserver.
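    
    A condensed sketch of the resulting handshake, as a toy model (all
    names below are illustrative, not the Kudu API; the real classes,
    TSManager, TSDescriptor, and MasterServiceImpl, appear in the diffs
    below):
    
        #include <map>
        #include <memory>
        #include <string>
        
        struct TsDescModel {
          bool needs_full_report = false;
        };
        
        struct TsManagerModel {
          std::map<std::string, std::shared_ptr<TsDescModel>> servers_by_id;
        
          // Exit-maintenance path: every tserver must resend a full
          // report so that failures ignored during maintenance mode get
          // reprocessed.
          void SetAllNeedFullReports() {
            for (auto& id_and_desc : servers_by_id) {
              id_and_desc.second->needs_full_report = true;
            }
          }
        };
        
        // Heartbeat handling: keep requesting a full report while the
        // flag is set; clear it once a non-incremental report arrives.
        bool HandleHeartbeat(TsDescModel& desc, bool report_is_incremental) {
          if (!report_is_incremental) {
            desc.needs_full_report = false;
          }
          return desc.needs_full_report;  // -> resp.needs_full_tablet_report
        }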
    
    Testing:
    - this adds to the existing integration test for maintenance mode to
      exercise the new behavior
    - amends an existing concurrency test to verify the correct locking
      behavior is used
    
    Change-Id: Ic0ab3d78cbc5b1228c01592a00118f11f76e43dd
    Reviewed-on: http://gerrit.cloudera.org:8080/14223
    Tested-by: Kudu Jenkins
    Reviewed-by: Adar Dembo <ad...@cloudera.com>
    Reviewed-by: Alexey Serbin <as...@cloudera.com>
---
 src/kudu/integration-tests/maintenance_mode-itest.cc | 11 +++++++++++
 src/kudu/master/master_service.cc                    | 14 ++++++++++++++
 src/kudu/master/ts_descriptor.cc                     | 11 +++++++++++
 src/kudu/master/ts_descriptor.h                      |  9 +++++++++
 src/kudu/master/ts_manager.cc                        | 11 +++++++++++
 src/kudu/master/ts_manager.h                         | 11 ++++++++++-
 src/kudu/master/ts_state-test.cc                     |  9 ++++++++-
 7 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/src/kudu/integration-tests/maintenance_mode-itest.cc b/src/kudu/integration-tests/maintenance_mode-itest.cc
index 6d620f4..f982586 100644
--- a/src/kudu/integration-tests/maintenance_mode-itest.cc
+++ b/src/kudu/integration-tests/maintenance_mode-itest.cc
@@ -270,6 +270,17 @@ TEST_F(MaintenanceModeRF3ITest, TestFailedTServerInMaintenanceModeDoesntRereplic
   SleepFor(kDurationForSomeHeartbeats);
   NO_FATALS(ExpectStartedTabletCopies(/*should_have_started*/false));
 
+  // Now set maintenance mode, bring the tablet server down, and then exit
+  // maintenance mode without bringing the tablet server back up. This should
+  // result in tablet copies.
+  ASSERT_OK(ChangeTServerState(maintenance_uuid, TServerStateChangePB::ENTER_MAINTENANCE_MODE));
+  NO_FATALS(maintenance_ts->Shutdown());
+  NO_FATALS(AssertEventuallyNumFailedReplicas(ts_map, kNumTablets));
+  ASSERT_OK(ChangeTServerState(maintenance_uuid, TServerStateChangePB::EXIT_MAINTENANCE_MODE));
+  ASSERT_EVENTUALLY([&] {
+    NO_FATALS(ExpectStartedTabletCopies(/*should_have_started*/true));
+  });
+
   // All the while, our workload should not have been interrupted. Assert
   // eventually to wait for the rows to converge.
   NO_FATALS(create_table.StopAndJoin());
diff --git a/src/kudu/master/master_service.cc b/src/kudu/master/master_service.cc
index 6941ed7..f9e74e1 100644
--- a/src/kudu/master/master_service.cc
+++ b/src/kudu/master/master_service.cc
@@ -338,6 +338,13 @@ void MasterServiceImpl::TSHeartbeat(const TSHeartbeatRequestPB* req,
       rpc->RespondFailure(s.CloneAndPrepend("Failed to process tablet report"));
       return;
     }
+    // If we previously needed a full tablet report from the tserver (e.g.
+    // because we needed to recheck replica states after it exited maintenance
+    // mode) and have just received a full report, mark that we no longer
+    // need one.
+    if (!req->tablet_report().is_incremental()) {
+      ts_desc->UpdateNeedsFullTabletReport(false);
+    }
   }
 
   // 6. Only leaders sign CSR from tablet servers (if present).
@@ -367,6 +374,13 @@ void MasterServiceImpl::TSHeartbeat(const TSHeartbeatRequestPB* req,
     }
   }
 
+  // 8. Check if we need a full tablet report (e.g. the tablet server just
+  //    exited maintenance mode and needs to check whether any replicas need to
+  //    be moved).
+  if (is_leader_master && ts_desc->needs_full_report()) {
+    resp->set_needs_full_tablet_report(true);
+  }
+
   rpc->RespondSuccess();
 }
 
diff --git a/src/kudu/master/ts_descriptor.cc b/src/kudu/master/ts_descriptor.cc
index e89b27f..90f035d 100644
--- a/src/kudu/master/ts_descriptor.cc
+++ b/src/kudu/master/ts_descriptor.cc
@@ -76,6 +76,7 @@ TSDescriptor::TSDescriptor(std::string perm_id)
     : permanent_uuid_(std::move(perm_id)),
       latest_seqno_(-1),
       last_heartbeat_(MonoTime::Now()),
+      needs_full_report_(false),
       recent_replica_creations_(0),
       last_replica_creations_decay_(MonoTime::Now()),
       num_live_replicas_(0) {
@@ -162,6 +163,16 @@ MonoDelta TSDescriptor::TimeSinceHeartbeat() const {
   return now - last_heartbeat_;
 }
 
+void TSDescriptor::UpdateNeedsFullTabletReport(bool needs_report) {
+  std::lock_guard<rw_spinlock> l(lock_);
+  needs_full_report_ = needs_report;
+}
+
+bool TSDescriptor::needs_full_report() const {
+  shared_lock<rw_spinlock> l(lock_);
+  return needs_full_report_;
+}
+
 bool TSDescriptor::PresumedDead() const {
   return TimeSinceHeartbeat().ToMilliseconds() >= FLAGS_tserver_unresponsive_timeout_ms;
 }
diff --git a/src/kudu/master/ts_descriptor.h b/src/kudu/master/ts_descriptor.h
index 800710f..9c4199a 100644
--- a/src/kudu/master/ts_descriptor.h
+++ b/src/kudu/master/ts_descriptor.h
@@ -78,6 +78,12 @@ class TSDescriptor : public enable_make_shared<TSDescriptor> {
   // Set the last-heartbeat time to now.
   void UpdateHeartbeatTime();
 
+  // Set whether a full tablet report is needed.
+  void UpdateNeedsFullTabletReport(bool needs_report);
+
+  // Whether a full tablet report is needed from this tablet server.
+  bool needs_full_report() const;
+
   // Return the amount of time since the last heartbeat received
   // from this TS.
   MonoDelta TimeSinceHeartbeat() const;
@@ -181,6 +187,9 @@ class TSDescriptor : public enable_make_shared<TSDescriptor> {
   // The last time a heartbeat was received for this node.
   MonoTime last_heartbeat_;
 
+  // Whether the tablet server needs to send a full report.
+  bool needs_full_report_;
+
   // The number of times this tablet server has recently been selected to create a
   // tablet replica. This value decays back to 0 over time.
   double recent_replica_creations_;
diff --git a/src/kudu/master/ts_manager.cc b/src/kudu/master/ts_manager.cc
index 79d3088..a44526b 100644
--- a/src/kudu/master/ts_manager.cc
+++ b/src/kudu/master/ts_manager.cc
@@ -241,6 +241,10 @@ Status TSManager::SetTServerState(const string& ts_uuid,
     RETURN_NOT_OK_PREPEND(sys_catalog->RemoveTServerState(ts_uuid),
         Substitute("Failed to remove tserver state for $0", ts_uuid));
     ts_state_by_uuid_.erase(ts_uuid);
+    // If exiting maintenance mode, make sure that any replica failures that
+    // may have been ignored while in maintenance mode are reprocessed. To do
+    // so, request full tablet reports across all tablet servers.
+    SetAllTServersNeedFullTabletReports();
     return Status::OK();
   }
   SysTServerStateEntryPB pb;
@@ -269,6 +273,13 @@ Status TSManager::ReloadTServerStates(SysCatalogTable* sys_catalog) {
   return sys_catalog->VisitTServerStates(&loader);
 }
 
+void TSManager::SetAllTServersNeedFullTabletReports() {
+  lock_guard<rw_spinlock> l(lock_);
+  for (auto& id_and_desc : servers_by_id_) {
+    id_and_desc.second->UpdateNeedsFullTabletReport(true);
+  }
+}
+
 int TSManager::ClusterSkew() const {
   int min_count = std::numeric_limits<int>::max();
   int max_count = 0;
diff --git a/src/kudu/master/ts_manager.h b/src/kudu/master/ts_manager.h
index d1cc925..cb0c6dc 100644
--- a/src/kudu/master/ts_manager.h
+++ b/src/kudu/master/ts_manager.h
@@ -100,6 +100,9 @@ class TSManager {
   int GetCount() const;
 
   // Sets the tserver state for the given tserver, persisting it to disk.
+  //
+  // If removing a tserver from maintenance mode, this also sets that all
+  // tablet servers must report back full tablet reports.
   Status SetTServerState(const std::string& ts_uuid,
                          TServerStatePB ts_state,
                          SysCatalogTable* sys_catalog);
@@ -124,10 +127,16 @@ class TSManager {
   // is not dead, not in maintenance mode).
   bool AvailableForPlacementUnlocked(const TSDescriptor& ts) const;
 
-  mutable rw_spinlock lock_;
+  // Marks all registered tablet servers as needing to report back with a full
+  // tablet report. This may be necessary, e.g., after exiting maintenance mode
+  // to recheck any ignored failures.
+  void SetAllTServersNeedFullTabletReports();
 
   FunctionGaugeDetacher metric_detacher_;
 
+  // Protects 'servers_by_id_'.
+  mutable rw_spinlock lock_;
+
   // TODO(awong): add a map from HostPort to descriptor so we aren't forced to
   // know UUIDs up front, e.g. if specifying a given tablet server for
   // maintenance mode, it'd be easier for users to specify the HostPort.
diff --git a/src/kudu/master/ts_state-test.cc b/src/kudu/master/ts_state-test.cc
index 387dac5..9ba575b 100644
--- a/src/kudu/master/ts_state-test.cc
+++ b/src/kudu/master/ts_state-test.cc
@@ -238,7 +238,7 @@ TEST_F(TServerStateTest, TestConcurrentSetTServerState) {
   }
   // Spin up a bunch of threads that contend for setting the state for a
   // limited number of tablet servers.
-  Barrier b(kNumThreadsPerTServer * kNumTServers);
+  Barrier b(kNumThreadsPerTServer * kNumTServers + kNumTServers);
   for (int i = 0; i < kNumThreadsPerTServer; i++) {
     for (const auto& ts : tservers) {
       threads.emplace_back([&, ts] {
@@ -247,6 +247,13 @@ TEST_F(TServerStateTest, TestConcurrentSetTServerState) {
       });
     }
   }
+  // Concurrently, register the servers.
+  for (const auto& ts : tservers) {
+    threads.emplace_back([&, ts] {
+      b.Wait();
+      CHECK_OK(SendHeartbeat(ts));
+    });
+  }
   for (auto& t : threads) {
     t.join();
   }