You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@kudu.apache.org by ad...@apache.org on 2020/01/14 00:33:29 UTC

[kudu] branch master updated (69172e8 -> c20aecc)

This is an automated email from the ASF dual-hosted git repository.

adar pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git.


    from 69172e8  fs: move file cache to server
     new d5fab3e  maintenance_manager-test: fix cascading CHECK failure
     new c20aecc  maintenance_manager-test: deflake TestPriorityOpLaunch

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/kudu/util/maintenance_manager-test.cc | 79 ++++++++++++++++++++++++-------
 src/kudu/util/test_util.cc                | 12 +++--
 2 files changed, 68 insertions(+), 23 deletions(-)

[kudu] 01/02: maintenance_manager-test: fix cascading CHECK failure

Posted by ad...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

adar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit d5fab3e808357ff7f7af77934cc5ff0b641f35ab
Author: Adar Dembo <ad...@cloudera.com>
AuthorDate: Mon Jan 13 12:05:14 2020 -0800

    maintenance_manager-test: fix cascading CHECK failure
    
    While trying to repro test flakiness, I saw that ASSERT_EVENTUALLY doesn't
    handle --gtest_throw_on_failure correctly. And that also highlighted an
    issue with the test where an ASSERT firing early could lead to a second
    CHECK failure, obscuring the actual test failure.
    
    Change-Id: I7a8ed5bedbcdb742bebe56327ffdbb885eef28b0
    Reviewed-on: http://gerrit.cloudera.org:8080/15024
    Reviewed-by: Alexey Serbin <as...@cloudera.com>
    Tested-by: Kudu Jenkins
    Reviewed-by: Andrew Wong <aw...@cloudera.com>
---
 src/kudu/util/maintenance_manager-test.cc | 22 +++++++++++++++-------
 src/kudu/util/test_util.cc                | 12 +++++++-----
 2 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/src/kudu/util/maintenance_manager-test.cc b/src/kudu/util/maintenance_manager-test.cc
index 4341351..c345d5c 100644
--- a/src/kudu/util/maintenance_manager-test.cc
+++ b/src/kudu/util/maintenance_manager-test.cc
@@ -40,6 +40,7 @@
 #include "kudu/util/metrics.h"
 #include "kudu/util/monotime.h"
 #include "kudu/util/mutex.h"
+#include "kudu/util/scoped_cleanup.h"
 #include "kudu/util/status.h"
 #include "kudu/util/test_macros.h"
 #include "kudu/util/test_util.h"
@@ -524,19 +525,26 @@ TEST_F(MaintenanceManagerTest, TestPriorityOpLaunch) {
   manager_->RegisterOp(&op5);
   manager_->RegisterOp(&op6);
 
+  // From this point forward if an ASSERT fires, we'll hit a CHECK failure if
+  // we don't unregister an op before it goes out of scope.
+  SCOPED_CLEANUP({
+    manager_->UnregisterOp(&op1);
+    manager_->UnregisterOp(&op2);
+    manager_->UnregisterOp(&op3);
+    manager_->UnregisterOp(&op4);
+    manager_->UnregisterOp(&op5);
+    manager_->UnregisterOp(&op6);
+  });
+
   ASSERT_EVENTUALLY([&]() {
     MaintenanceManagerStatusPB status_pb;
     manager_->GetMaintenanceManagerStatusDump(&status_pb);
     ASSERT_EQ(status_pb.completed_operations_size(), 6);
   });
 
-  // Wait for instances to complete.
-  manager_->UnregisterOp(&op1);
-  manager_->UnregisterOp(&op2);
-  manager_->UnregisterOp(&op3);
-  manager_->UnregisterOp(&op4);
-  manager_->UnregisterOp(&op5);
-  manager_->UnregisterOp(&op6);
+  // Wait for instances to complete by shutting down the maintenance manager.
+  // We can still call GetMaintenanceManagerStatusDump though.
+  StopManager();
 
   // Check that running instances are removed from collection after completion.
   MaintenanceManagerStatusPB status_pb;
diff --git a/src/kudu/util/test_util.cc b/src/kudu/util/test_util.cc
index ec9e23b..93eac65 100644
--- a/src/kudu/util/test_util.cc
+++ b/src/kudu/util/test_util.cc
@@ -38,7 +38,6 @@
 
 #include <boost/optional/optional.hpp>
 #include <gflags/gflags.h>
-#include <gflags/gflags_declare.h>
 #include <glog/logging.h>
 #include <gtest/gtest-spi.h>
 
@@ -290,14 +289,17 @@ void AssertEventually(const std::function<void(void)>& f,
                       AssertBackoff backoff) {
   const MonoTime deadline = MonoTime::Now() + timeout;
   {
-    // Disable --gtest_break_on_failure, or else the assertion failures
-    // inside our attempts will cause the test to SEGV even though we
-    // would like to retry.
+    // Disable gtest's "on failure" behavior, or else the assertion failures
+    // inside our attempts will cause the test to end even though we would
+    // like to retry.
     bool old_break_on_failure = testing::FLAGS_gtest_break_on_failure;
-    auto c = MakeScopedCleanup([old_break_on_failure]() {
+    bool old_throw_on_failure = testing::FLAGS_gtest_throw_on_failure;
+    auto c = MakeScopedCleanup([old_break_on_failure, old_throw_on_failure]() {
       testing::FLAGS_gtest_break_on_failure = old_break_on_failure;
+      testing::FLAGS_gtest_throw_on_failure = old_throw_on_failure;
     });
     testing::FLAGS_gtest_break_on_failure = false;
+    testing::FLAGS_gtest_throw_on_failure = false;
 
     for (int attempts = 0; MonoTime::Now() < deadline; attempts++) {
       // Capture any assertion failures within this scope (i.e. from their function)

[kudu] 02/02: maintenance_manager-test: deflake TestPriorityOpLaunch

Posted by ad...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

adar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit c20aecce285e05500a49925502f4a9a259fd20c7
Author: Adar Dembo <ad...@cloudera.com>
AuthorDate: Mon Jan 13 12:37:00 2020 -0800

    maintenance_manager-test: deflake TestPriorityOpLaunch
    
    Sometimes, the test didn't get its desired op execution order because the MM
    would execute an op before all of the ops were registered. That is, the ops
    weren't registered atomically, and an "unlucky" overlap of FindBestOp and
    one of the registration calls would lead to an errant op execution.
    
    We can fix this by using FLAGS_enable_maintenance_manager to disable the MM
    while we register ops.
    
    Before the fix, I could reliably reproduce a failure within 1000 executions
    of the test.
    
    After, I couldn't reproduce it in 30000 executions.
    
    Change-Id: I06f3fd8fa952ed74bfaf2ac3dd55cd289f4b878b
    Reviewed-on: http://gerrit.cloudera.org:8080/15025
    Reviewed-by: Alexey Serbin <as...@cloudera.com>
    Reviewed-by: Andrew Wong <aw...@cloudera.com>
    Tested-by: Andrew Wong <aw...@cloudera.com>
---
 src/kudu/util/maintenance_manager-test.cc | 57 +++++++++++++++++++++++++------
 1 file changed, 46 insertions(+), 11 deletions(-)

diff --git a/src/kudu/util/maintenance_manager-test.cc b/src/kudu/util/maintenance_manager-test.cc
index c345d5c..60d019d 100644
--- a/src/kudu/util/maintenance_manager-test.cc
+++ b/src/kudu/util/maintenance_manager-test.cc
@@ -63,14 +63,14 @@ METRIC_DEFINE_histogram(test, maintenance_op_duration,
                         kudu::MetricLevel::kInfo,
                         60000000LU, 2);
 
+DECLARE_bool(enable_maintenance_manager);
 DECLARE_int64(log_target_replay_size_mb);
 DECLARE_double(maintenance_op_multiplier);
 DECLARE_int32(max_priority_range);
 
-
 namespace kudu {
 
-static const int kHistorySize = 6;
+static const int kHistorySize = 7;
 static const char kFakeUuid[] = "12345";
 
 class MaintenanceManagerTest : public KuduTest {
@@ -200,8 +200,13 @@ class TestMaintenanceOp : public MaintenanceOp {
     return priority_;
   }
 
+  int remaining_runs() const {
+    std::lock_guard<Mutex> guard(lock_);
+    return remaining_runs_;
+  }
+
  private:
-  Mutex lock_;
+  mutable Mutex lock_;
 
   uint64_t ram_anchored_;
   uint64_t logs_retained_bytes_;
@@ -488,6 +493,32 @@ TEST_F(MaintenanceManagerTest, TestPriorityOpLaunch) {
   StopManager();
   StartManager(1);
 
+  // Register an op whose sole purpose is to allow us to delay the rest of this
+  // test until we know that the MM scheduler thread is running.
+  TestMaintenanceOp early_op("early", MaintenanceOp::HIGH_IO_USAGE, 0);
+  early_op.set_perf_improvement(1);
+  manager_->RegisterOp(&early_op);
+  // From this point forward if an ASSERT fires, we'll hit a CHECK failure if
+  // we don't unregister an op before it goes out of scope.
+  SCOPED_CLEANUP({
+    manager_->UnregisterOp(&early_op);
+  });
+  ASSERT_EVENTUALLY([&]() {
+    ASSERT_EQ(0, early_op.remaining_runs());
+  });
+
+  // The MM scheduler thread is now running. It is now safe to use
+  // FLAGS_enable_maintenance_manager to temporarily disable the MM, thus
+  // allowing us to register a group of ops "atomically" and ensuring the op
+  // execution order that this test wants to see.
+  //
+  // Without the "early op" hack above, there's a small chance that the MM
+  // scheduler thread will not have run at all at the time of
+  // FLAGS_enable_maintenance_manager = false, which would cause the thread
+  // to exit entirely instead of sleeping.
+
+  // Ops are listed here in perf improvement order, which is a function of the
+  // op's raw perf improvement as well as its priority.
   TestMaintenanceOp op1("op1", MaintenanceOp::HIGH_IO_USAGE, -FLAGS_max_priority_range - 1);
   op1.set_perf_improvement(10);
   op1.set_remaining_runs(1);
@@ -508,22 +539,24 @@ TEST_F(MaintenanceManagerTest, TestPriorityOpLaunch) {
   op4.set_remaining_runs(1);
   op4.set_sleep_time(MonoDelta::FromMilliseconds(1));
 
-  TestMaintenanceOp op5("op5", MaintenanceOp::HIGH_IO_USAGE, FLAGS_max_priority_range + 1);
-  op5.set_perf_improvement(10);
+  TestMaintenanceOp op5("op5", MaintenanceOp::HIGH_IO_USAGE, 0);
+  op5.set_perf_improvement(12);
   op5.set_remaining_runs(1);
   op5.set_sleep_time(MonoDelta::FromMilliseconds(1));
 
-  TestMaintenanceOp op6("op6", MaintenanceOp::HIGH_IO_USAGE, 0);
-  op6.set_perf_improvement(12);
+  TestMaintenanceOp op6("op6", MaintenanceOp::HIGH_IO_USAGE, FLAGS_max_priority_range + 1);
+  op6.set_perf_improvement(10);
   op6.set_remaining_runs(1);
   op6.set_sleep_time(MonoDelta::FromMilliseconds(1));
 
+  FLAGS_enable_maintenance_manager = false;
   manager_->RegisterOp(&op1);
   manager_->RegisterOp(&op2);
   manager_->RegisterOp(&op3);
   manager_->RegisterOp(&op4);
   manager_->RegisterOp(&op5);
   manager_->RegisterOp(&op6);
+  FLAGS_enable_maintenance_manager = true;
 
   // From this point forward if an ASSERT fires, we'll hit a CHECK failure if
   // we don't unregister an op before it goes out of scope.
@@ -539,7 +572,7 @@ TEST_F(MaintenanceManagerTest, TestPriorityOpLaunch) {
   ASSERT_EVENTUALLY([&]() {
     MaintenanceManagerStatusPB status_pb;
     manager_->GetMaintenanceManagerStatusDump(&status_pb);
-    ASSERT_EQ(status_pb.completed_operations_size(), 6);
+    ASSERT_EQ(status_pb.completed_operations_size(), 7);
   });
 
   // Wait for instances to complete by shutting down the maintenance manager.
@@ -550,14 +583,16 @@ TEST_F(MaintenanceManagerTest, TestPriorityOpLaunch) {
   MaintenanceManagerStatusPB status_pb;
   manager_->GetMaintenanceManagerStatusDump(&status_pb);
   ASSERT_EQ(status_pb.running_operations_size(), 0);
-  ASSERT_EQ(status_pb.completed_operations_size(), 6);
-  // In perf_improvement score ascending order, the latter completed OP will list former.
+
+  // Check that ops were executed in perf improvement order (from greatest to
+  // least improvement). Note that completed ops are listed in _reverse_ execution order.
   list<string> ordered_ops({"op1",
                             "op2",
                             "op3",
                             "op4",
+                            "op5",
                             "op6",
-                            "op5"});
+                            "early"});
   ASSERT_EQ(ordered_ops.size(), status_pb.completed_operations().size());
   for (const auto& instance : status_pb.completed_operations()) {
     ASSERT_EQ(ordered_ops.front(), instance.name());