You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by ad...@apache.org on 2020/01/14 00:33:31 UTC

[kudu] 02/02: maintenance_manager-test: deflake TestPriorityOpLaunch

This is an automated email from the ASF dual-hosted git repository.

adar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit c20aecce285e05500a49925502f4a9a259fd20c7
Author: Adar Dembo <ad...@cloudera.com>
AuthorDate: Mon Jan 13 12:37:00 2020 -0800

    maintenance_manager-test: deflake TestPriorityOpLaunch
    
    Sometimes, the test didn't get its desired op execution order because the MM
    would execute an op before all of the ops were registered. That is, the ops
    weren't registered atomically, and an "unlucky" overlap of FindBestOp and
    one of the registration calls would lead to an errant op execution.
    
    We can fix this by using FLAGS_enable_maintenance_manager to disable the MM
    while we register ops.
    
    Before the fix, I could reliably reproduce a failure within 1000 executions
    of the test.
    
    After, I couldn't reproduce it in 30000 executions.
    
    Change-Id: I06f3fd8fa952ed74bfaf2ac3dd55cd289f4b878b
    Reviewed-on: http://gerrit.cloudera.org:8080/15025
    Reviewed-by: Alexey Serbin <as...@cloudera.com>
    Reviewed-by: Andrew Wong <aw...@cloudera.com>
    Tested-by: Andrew Wong <aw...@cloudera.com>
---
 src/kudu/util/maintenance_manager-test.cc | 57 +++++++++++++++++++++++++------
 1 file changed, 46 insertions(+), 11 deletions(-)

diff --git a/src/kudu/util/maintenance_manager-test.cc b/src/kudu/util/maintenance_manager-test.cc
index c345d5c..60d019d 100644
--- a/src/kudu/util/maintenance_manager-test.cc
+++ b/src/kudu/util/maintenance_manager-test.cc
@@ -63,14 +63,14 @@ METRIC_DEFINE_histogram(test, maintenance_op_duration,
                         kudu::MetricLevel::kInfo,
                         60000000LU, 2);
 
+DECLARE_bool(enable_maintenance_manager);
 DECLARE_int64(log_target_replay_size_mb);
 DECLARE_double(maintenance_op_multiplier);
 DECLARE_int32(max_priority_range);
 
-
 namespace kudu {
 
-static const int kHistorySize = 6;
+static const int kHistorySize = 7;
 static const char kFakeUuid[] = "12345";
 
 class MaintenanceManagerTest : public KuduTest {
@@ -200,8 +200,13 @@ class TestMaintenanceOp : public MaintenanceOp {
     return priority_;
   }
 
+  int remaining_runs() const {
+    std::lock_guard<Mutex> guard(lock_);
+    return remaining_runs_;
+  }
+
  private:
-  Mutex lock_;
+  mutable Mutex lock_;
 
   uint64_t ram_anchored_;
   uint64_t logs_retained_bytes_;
@@ -488,6 +493,32 @@ TEST_F(MaintenanceManagerTest, TestPriorityOpLaunch) {
   StopManager();
   StartManager(1);
 
+  // Register an op whose sole purpose is to allow us to delay the rest of this
+  // test until we know that the MM scheduler thread is running.
+  TestMaintenanceOp early_op("early", MaintenanceOp::HIGH_IO_USAGE, 0);
+  early_op.set_perf_improvement(1);
+  manager_->RegisterOp(&early_op);
+  // From this point forward if an ASSERT fires, we'll hit a CHECK failure if
+  // we don't unregister an op before it goes out of scope.
+  SCOPED_CLEANUP({
+    manager_->UnregisterOp(&early_op);
+  });
+  ASSERT_EVENTUALLY([&]() {
+    ASSERT_EQ(0, early_op.remaining_runs());
+  });
+
+  // The MM scheduler thread is now running. It is now safe to use
+  // FLAGS_enable_maintenance_manager to temporarily disable the MM, thus
+  // allowing us to register a group of ops "atomically" and ensuring the op
+  // execution order that this test wants to see.
+  //
+  // Without the "early op" hack above, there's a small chance that the MM
+  // scheduler thread will not have run at all at the time of
+  // FLAGS_enable_maintenance_manager = false, which would cause the thread
+  // to exit entirely instead of sleeping.
+
+  // Ops are listed here in perf improvement order, which is a function of the
+  // op's raw perf improvement as well as its priority.
   TestMaintenanceOp op1("op1", MaintenanceOp::HIGH_IO_USAGE, -FLAGS_max_priority_range - 1);
   op1.set_perf_improvement(10);
   op1.set_remaining_runs(1);
@@ -508,22 +539,24 @@ TEST_F(MaintenanceManagerTest, TestPriorityOpLaunch) {
   op4.set_remaining_runs(1);
   op4.set_sleep_time(MonoDelta::FromMilliseconds(1));
 
-  TestMaintenanceOp op5("op5", MaintenanceOp::HIGH_IO_USAGE, FLAGS_max_priority_range + 1);
-  op5.set_perf_improvement(10);
+  TestMaintenanceOp op5("op5", MaintenanceOp::HIGH_IO_USAGE, 0);
+  op5.set_perf_improvement(12);
   op5.set_remaining_runs(1);
   op5.set_sleep_time(MonoDelta::FromMilliseconds(1));
 
-  TestMaintenanceOp op6("op6", MaintenanceOp::HIGH_IO_USAGE, 0);
-  op6.set_perf_improvement(12);
+  TestMaintenanceOp op6("op6", MaintenanceOp::HIGH_IO_USAGE, FLAGS_max_priority_range + 1);
+  op6.set_perf_improvement(10);
   op6.set_remaining_runs(1);
   op6.set_sleep_time(MonoDelta::FromMilliseconds(1));
 
+  FLAGS_enable_maintenance_manager = false;
   manager_->RegisterOp(&op1);
   manager_->RegisterOp(&op2);
   manager_->RegisterOp(&op3);
   manager_->RegisterOp(&op4);
   manager_->RegisterOp(&op5);
   manager_->RegisterOp(&op6);
+  FLAGS_enable_maintenance_manager = true;
 
   // From this point forward if an ASSERT fires, we'll hit a CHECK failure if
   // we don't unregister an op before it goes out of scope.
@@ -539,7 +572,7 @@ TEST_F(MaintenanceManagerTest, TestPriorityOpLaunch) {
   ASSERT_EVENTUALLY([&]() {
     MaintenanceManagerStatusPB status_pb;
     manager_->GetMaintenanceManagerStatusDump(&status_pb);
-    ASSERT_EQ(status_pb.completed_operations_size(), 6);
+    ASSERT_EQ(status_pb.completed_operations_size(), 7);
   });
 
   // Wait for instances to complete by shutting down the maintenance manager.
@@ -550,14 +583,16 @@ TEST_F(MaintenanceManagerTest, TestPriorityOpLaunch) {
   MaintenanceManagerStatusPB status_pb;
   manager_->GetMaintenanceManagerStatusDump(&status_pb);
   ASSERT_EQ(status_pb.running_operations_size(), 0);
-  ASSERT_EQ(status_pb.completed_operations_size(), 6);
-  // In perf_improvement score ascending order, the latter completed OP will list former.
+
+  // Check that ops were executed in perf improvement order (from greatest to
+  // least improvement). Note that completed ops are listed in _reverse_ execution order.
   list<string> ordered_ops({"op1",
                             "op2",
                             "op3",
                             "op4",
+                            "op5",
                             "op6",
-                            "op5"});
+                            "early"});
   ASSERT_EQ(ordered_ops.size(), status_pb.completed_operations().size());
   for (const auto& instance : status_pb.completed_operations()) {
     ASSERT_EQ(ordered_ops.front(), instance.name());