You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by aw...@apache.org on 2019/12/17 20:10:29 UTC
[kudu] branch master updated: KUDU-3002: prioritize WAL unanchoring when under memory pressure

This is an automated email from the ASF dual-hosted git repository.

awong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new 111e54c  KUDU-3002: prioritize WAL unanchoring when under memory pressure
111e54c is described below

commit 111e54c0c8f254fb110919150c1f5d60dca37682
Author: Andrew Wong <aw...@apache.org>
AuthorDate: Sat Dec 14 17:31:49 2019 -0800

    KUDU-3002: prioritize WAL unanchoring when under memory pressure
    
    When under memory pressure, we currently prioritize performing the op
    that will free the most memory. In theory this seems reasonable -- if
    Kudu's using way too much memory, we should try to use less memory as
    quickly as possible. In practice, this meant that, while under memory
    pressure and under a constant, memory-consuming workload (e.g. inserts
    to the MRS), Kudu would starve operations that anchor relatively little
    memory (e.g. DMS flushes).
    
    This patch updates the behavior so that we prioritize operations that
    unanchor the most WAL bytes, breaking ties by prioritizing the ops that
    use more memory. This seems reasonable because:
    - Ops that anchor WALs also anchor memory. In performing an op that
      unanchor WALs, we are performing an op that frees memory, so we'll
      still prefer MRS and DMS flushing over compactions when under memory
      pressure.
    - We already use this heuristic when _not_ under memory pressure, but it
      is only used when the ops under consideration anchor too many WAL
      bytes (per --log_target_replay_size_mb), lending some credibility to
      it when used conditionally.
    - It becomes much more difficult to think of a scenario in which we're
      "stuck" using too much space for WALs or too much memory.
    
    Change-Id: Ibd85e8f2904a36b74cd6a3038c9ec49bb1ff9844
    Reviewed-on: http://gerrit.cloudera.org:8080/14910
    Tested-by: Kudu Jenkins
    Reviewed-by: Adar Dembo <ad...@cloudera.com>
---
 src/kudu/tserver/tablet_server-test.cc    | 58 +++++++++++++++++++++++++++++--
 src/kudu/util/maintenance_manager-test.cc | 42 ++++++++++++++++++++++
 src/kudu/util/maintenance_manager.cc      | 34 +++++++++---------
 src/kudu/util/maintenance_manager.h       |  1 +
 4 files changed, 117 insertions(+), 18 deletions(-)

diff --git a/src/kudu/tserver/tablet_server-test.cc b/src/kudu/tserver/tablet_server-test.cc
index e09366e..a725390 100644
--- a/src/kudu/tserver/tablet_server-test.cc
+++ b/src/kudu/tserver/tablet_server-test.cc
@@ -21,6 +21,7 @@
 #include <unistd.h>
 
 #include <algorithm>
+#include <atomic>
 #include <cstdint>
 #include <initializer_list>
 #include <map>
@@ -100,8 +101,10 @@
 #include "kudu/util/countdown_latch.h"
 #include "kudu/util/crc.h"
 #include "kudu/util/curl_util.h"
+#include "kudu/util/debug/sanitizer_scopes.h"
 #include "kudu/util/env.h"
 #include "kudu/util/faststring.h"
+#include "kudu/util/hdr_histogram.h"
 #include "kudu/util/jsonwriter.h"
 #include "kudu/util/logging_test_util.h"
 #include "kudu/util/metrics.h"
@@ -159,6 +162,7 @@ DEFINE_int32(delete_tablet_bench_num_flushes, 200,
              "Number of disk row sets to flush in the delete tablet benchmark");
 
 DECLARE_bool(crash_on_eio);
+DECLARE_bool(enable_flush_deltamemstores);
 DECLARE_bool(enable_flush_memrowset);
 DECLARE_bool(enable_maintenance_manager);
 DECLARE_bool(enable_rowset_compaction);
@@ -172,6 +176,7 @@ DECLARE_int32(flush_threshold_secs);
 DECLARE_int32(fs_data_dirs_available_space_cache_seconds);
 DECLARE_int32(fs_target_data_dirs_per_tablet);
 DECLARE_int32(maintenance_manager_num_threads);
+DECLARE_int32(maintenance_manager_polling_interval_ms);
 DECLARE_int32(memory_pressure_percentage);
 DECLARE_int32(metrics_retirement_age_ms);
 DECLARE_int32(scanner_batch_size_rows);
@@ -183,16 +188,17 @@ DECLARE_string(env_inject_full_globs);
 
 // Declare these metrics prototypes for simpler unit testing of their behavior.
 METRIC_DECLARE_counter(block_manager_total_bytes_read);
+METRIC_DECLARE_counter(log_block_manager_holes_punched);
 METRIC_DECLARE_counter(rows_inserted);
 METRIC_DECLARE_counter(rows_updated);
 METRIC_DECLARE_counter(rows_deleted);
 METRIC_DECLARE_counter(scanners_expired);
 METRIC_DECLARE_gauge_uint64(log_block_manager_blocks_under_management);
 METRIC_DECLARE_gauge_uint64(log_block_manager_containers);
-METRIC_DECLARE_counter(log_block_manager_holes_punched);
 METRIC_DECLARE_gauge_size(active_scanners);
 METRIC_DECLARE_gauge_size(tablet_active_scanners);
 METRIC_DECLARE_gauge_size(num_rowsets_on_disk);
+METRIC_DECLARE_histogram(flush_dms_duration);
 
 namespace kudu {
 
@@ -903,18 +909,66 @@ class TabletServerMaintenanceMemoryPressureTest : public TabletServerTestBase {
     FLAGS_enable_maintenance_manager = true;
     FLAGS_flush_threshold_secs = 1;
     FLAGS_memory_pressure_percentage = 0;
+    // For the sake of easier setup, slow down our maintenance polling interval.
+    FLAGS_maintenance_manager_polling_interval_ms = 1000;
 
     // While setting up rowsets, disable compactions and flushing. Do this
     // before doing anything so we can have tighter control over the flushing
     // of our rowsets.
     FLAGS_enable_rowset_compaction = false;
+    FLAGS_enable_flush_deltamemstores = false;
     FLAGS_enable_flush_memrowset = false;
     NO_FATALS(StartTabletServer(/*num_data_dirs=*/1));
   }
 };
 
+// Regression test for KUDU-3002. Previously, when under memory pressure, we
+// might starve older (usually small) DMS flushes in favor of (usually larger)
+// MRS flushes.
+TEST_F(TabletServerMaintenanceMemoryPressureTest, TestDontStarveDMSWhileUnderMemoryPressure) {
+  // First, set up a rowset with a delta.
+  NO_FATALS(InsertTestRowsDirect(1, 1));
+  ASSERT_OK(tablet_replica_->tablet()->Flush());
+  NO_FATALS(UpdateTestRowRemote(1, 2));
+
+  // Roll onto a new log segment so our DMS anchors some WAL bytes.
+  ASSERT_OK(tablet_replica_->log()->WaitUntilAllFlushed());
+  ASSERT_OK(tablet_replica_->log()->AllocateSegmentAndRollOverForTests());
+
+  // Now start inserting to the tablet so every time we pick a maintenance op,
+  // we'll have a sizeable MRS.
+  std::atomic<bool> keep_inserting(true);
+  thread insert_thread([&] {
+    int cur_row = 2;
+    while (keep_inserting) {
+      // Ignore TSAN warnings that complain about a race in gtest between this
+      // check for fatal failures and the check for fatal failures in the below
+      // AssertEventually.
+      debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan;
+      NO_FATALS(InsertTestRowsDirect(cur_row++, 1));
+    }
+  });
+  SCOPED_CLEANUP({
+    keep_inserting = false;
+    insert_thread.join();
+  });
+
+  // Wait a bit for the MRS to build up and then enable flushing.
+  SleepFor(MonoDelta::FromSeconds(1));
+  FLAGS_enable_flush_memrowset = true;
+  FLAGS_enable_flush_deltamemstores = true;
+
+  // Despite always having a large MRS, we should eventually flush the DMS,
+  // since it anchors WALs.
+  scoped_refptr<Histogram> dms_flushes =
+      METRIC_flush_dms_duration.Instantiate(tablet_replica_->tablet()->GetMetricEntity());
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_EQ(1, dms_flushes->histogram()->TotalCount());
+  });
+}
+
 // Regression test for KUDU-2929. Previously, when under memory pressure, we
-// would never compact, even if there were nothing else to do. We'll simulate
+// would never compact, even if there were something else to do. We'll simulate
 // this by flushing some overlapping rowsets and then making sure we compact.
 TEST_F(TabletServerMaintenanceMemoryPressureTest, TestCompactWhileUnderMemoryPressure) {
   // Insert sets of overlapping rows.
diff --git a/src/kudu/util/maintenance_manager-test.cc b/src/kudu/util/maintenance_manager-test.cc
index 0c5bf4d..4341351 100644
--- a/src/kudu/util/maintenance_manager-test.cc
+++ b/src/kudu/util/maintenance_manager-test.cc
@@ -363,6 +363,48 @@ TEST_F(MaintenanceManagerTest, TestLogRetentionPrioritization) {
   manager_->UnregisterOp(&op2);
 }
 
+// Test that ops are prioritized correctly when under memory pressure.
+TEST_F(MaintenanceManagerTest, TestPrioritizeLogRetentionUnderMemoryPressure) {
+  StopManager();
+
+  // We should perform these in the order of WAL bytes retained, followed by
+  // amount of memory anchored.
+  TestMaintenanceOp op1("op1", MaintenanceOp::HIGH_IO_USAGE);
+  op1.set_logs_retained_bytes(100);
+  op1.set_ram_anchored(100);
+
+  TestMaintenanceOp op2("op2", MaintenanceOp::HIGH_IO_USAGE);
+  op2.set_logs_retained_bytes(100);
+  op2.set_ram_anchored(99);
+
+  TestMaintenanceOp op3("op3", MaintenanceOp::HIGH_IO_USAGE);
+  op3.set_logs_retained_bytes(99);
+  op3.set_ram_anchored(101);
+
+  indicate_memory_pressure_ = true;
+  manager_->RegisterOp(&op1);
+  manager_->RegisterOp(&op2);
+  manager_->RegisterOp(&op3);
+
+  auto op_and_why = manager_->FindBestOp();
+  ASSERT_EQ(&op1, op_and_why.first);
+  EXPECT_EQ(op_and_why.second, "under memory pressure (0.00% used), 100 bytes log retention, and "
+                               "flush 100 bytes memory");
+  manager_->UnregisterOp(&op1);
+
+  op_and_why = manager_->FindBestOp();
+  ASSERT_EQ(&op2, op_and_why.first);
+  EXPECT_EQ(op_and_why.second, "under memory pressure (0.00% used), 100 bytes log retention, and "
+                               "flush 99 bytes memory");
+  manager_->UnregisterOp(&op2);
+
+  op_and_why = manager_->FindBestOp();
+  ASSERT_EQ(&op3, op_and_why.first);
+  EXPECT_EQ(op_and_why.second, "under memory pressure (0.00% used), 99 bytes log retention, and "
+                               "flush 101 bytes memory");
+  manager_->UnregisterOp(&op3);
+}
+
 // Test retrieving a list of an op's running instances
 TEST_F(MaintenanceManagerTest, TestRunningInstances) {
   TestMaintenanceOp op("op", MaintenanceOp::HIGH_IO_USAGE);
diff --git a/src/kudu/util/maintenance_manager.cc b/src/kudu/util/maintenance_manager.cc
index 6e0b964..61a6e52 100644
--- a/src/kudu/util/maintenance_manager.cc
+++ b/src/kudu/util/maintenance_manager.cc
@@ -19,7 +19,6 @@
 
 #include <algorithm>
 #include <cinttypes>
-#include <cmath>
 #include <cstdint>
 #include <memory>
 #include <mutex>
@@ -362,9 +361,6 @@ pair<MaintenanceOp*, string> MaintenanceManager::FindBestOp() {
   int64_t low_io_most_logs_retained_bytes = 0;
   MaintenanceOp* low_io_most_logs_retained_bytes_op = nullptr;
 
-  int64_t most_ram_anchored = 0;
-  MaintenanceOp* most_ram_anchored_op = nullptr;
-
   int64_t most_logs_retained_bytes = 0;
   int64_t most_logs_retained_bytes_ram_anchored = 0;
   MaintenanceOp* most_logs_retained_bytes_ram_anchored_op = nullptr;
@@ -395,14 +391,9 @@ pair<MaintenanceOp*, string> MaintenanceManager::FindBestOp() {
                         op->name(), logs_retained_bytes);
     }
 
-    const auto ram_anchored = stats.ram_anchored();
-    if (ram_anchored > most_ram_anchored) {
-      most_ram_anchored_op = op;
-      most_ram_anchored = ram_anchored;
-    }
-
     // We prioritize ops that can free more logs, but when it's the same we pick
     // the one that also frees up the most memory.
+    const auto ram_anchored = stats.ram_anchored();
     if (std::make_pair(logs_retained_bytes, ram_anchored) >
         std::make_pair(most_logs_retained_bytes,
                        most_logs_retained_bytes_ram_anchored)) {
@@ -435,13 +426,24 @@ pair<MaintenanceOp*, string> MaintenanceManager::FindBestOp() {
   }
 
   // Look at free memory. If it is dangerously low, we must select something
-  // that frees memory -- the op with the most anchored memory.
+  // that frees memory -- ignore the target replay size and flush whichever op
+  // anchors the most WALs (the op should also free memory).
+  //
+  // Why not select the op that frees the most memory? Such a heuristic could
+  // lead to starvation of ops that consume less memory, e.g. we might always
+  // choose to do MRS flushes even when there are small, long-lived DMSes that
+  // are anchoring WALs. Choosing the op that frees the most WALs ensures that
+  // all ops that anchor memory (and also anchor WALs) will eventually be
+  // performed.
   double capacity_pct;
-  if (memory_pressure_func_(&capacity_pct) && most_ram_anchored_op) {
-    string note = StringPrintf("under memory pressure (%.2f%% used, "
-                               "can flush %" PRIu64 " bytes)",
-                               capacity_pct, most_ram_anchored);
-    return {most_ram_anchored_op, std::move(note)};
+  if (memory_pressure_func_(&capacity_pct) && most_logs_retained_bytes_ram_anchored_op) {
+    DCHECK_GT(most_logs_retained_bytes_ram_anchored, 0);
+    string note = StringPrintf("under memory pressure (%.2f%% used), "
+                               "%" PRIu64 " bytes log retention, and flush "
+                               "%" PRIu64 " bytes memory", capacity_pct,
+                               most_logs_retained_bytes,
+                               most_logs_retained_bytes_ram_anchored);
+    return {most_logs_retained_bytes_ram_anchored_op, std::move(note)};
   }
 
   // Look at ops that free up more log retention, and also free up more memory.
diff --git a/src/kudu/util/maintenance_manager.h b/src/kudu/util/maintenance_manager.h
index 631d0d6..6dad1b6 100644
--- a/src/kudu/util/maintenance_manager.h
+++ b/src/kudu/util/maintenance_manager.h
@@ -305,6 +305,7 @@ class MaintenanceManager : public std::enable_shared_from_this<MaintenanceManage
 
  private:
   FRIEND_TEST(MaintenanceManagerTest, TestLogRetentionPrioritization);
+  FRIEND_TEST(MaintenanceManagerTest, TestPrioritizeLogRetentionUnderMemoryPressure);
   FRIEND_TEST(MaintenanceManagerTest, TestOpFactors);
 
   typedef std::map<MaintenanceOp*, MaintenanceOpStats,