You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by la...@apache.org on 2023/03/16 16:09:11 UTC

[kudu] branch master updated: [server] add 'slow_scans' metric for tablet server

This is an automated email from the ASF dual-hosted git repository.

laiyingchun pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new 7133b894e [server] add 'slow_scans' metric for tablet server
7133b894e is described below

commit 7133b894e87c705353753c25e83c56b683e6d805
Author: kedeng <kd...@gmail.com>
AuthorDate: Mon Feb 27 16:07:16 2023 +0800

    [server] add 'slow_scans' metric for tablet server
    
    As a supplement to the slow scan display, this patch adds
    a new metric 'slow_scans' to facilitate monitoring.
    
    It also adds a test to verify that the new metric
    takes effect.
    
    Change-Id: I51e64bc4602f0e1dd99207f7d4cf8883085eca9a
    Reviewed-on: http://gerrit.cloudera.org:8080/19545
    Tested-by: Kudu Jenkins
    Reviewed-by: Yingchun Lai <la...@apache.org>
---
 src/kudu/tserver/scanners.cc           | 29 ++++++++++++++++++
 src/kudu/tserver/scanners.h            |  4 +++
 src/kudu/tserver/tablet_server-test.cc | 54 ++++++++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+)

diff --git a/src/kudu/tserver/scanners.cc b/src/kudu/tserver/scanners.cc
index d28f1a4aa..c53b331a1 100644
--- a/src/kudu/tserver/scanners.cc
+++ b/src/kudu/tserver/scanners.cc
@@ -91,6 +91,13 @@ METRIC_DEFINE_gauge_size(server, active_scanners,
                          "Number of scanners that are currently active",
                          kudu::MetricLevel::kInfo);
 
+METRIC_DEFINE_gauge_size(server, slow_scans,
+                         "Slow Scans",
+                         kudu::MetricUnit::kScanners,
+                         "Number of slow scanners that are defined by --slow_scanner_threshold_ms "
+                         "if --show_slow_scans set to 'true'.",
+                         kudu::MetricLevel::kWarn);  // Backed by CountSlowScans().
+
 using kudu::rpc::RemoteUser;
 using kudu::tablet::TabletReplica;
 using std::string;
@@ -133,6 +140,9 @@ ScannerManager::ScannerManager(const scoped_refptr<MetricEntity>& metric_entity)
     METRIC_active_scanners.InstantiateFunctionGauge(
         metric_entity, [this]() { return this->CountActiveScanners(); })
         ->AutoDetach(&metric_detacher_);
+    METRIC_slow_scans.InstantiateFunctionGauge(
+        metric_entity, [this]() { return this->CountSlowScans(); })
+        ->AutoDetach(&metric_detacher_);
   }
   for (size_t i = 0; i < kNumScannerMapStripes; i++) {
     scanner_maps_.push_back(new ScannerMapStripe());
@@ -279,6 +289,25 @@ size_t ScannerManager::CountActiveScanners() const {
   return total;
 }
 
+size_t ScannerManager::CountSlowScans() const {  // Counts scanners older than the threshold.
+  size_t total = 0;
+  const MonoTime now = MonoTime::Now();  // Sampled once; all scanners compare against it.
+  const MonoDelta slow_threshold = MonoDelta::FromMilliseconds(FLAGS_slow_scanner_threshold_ms);
+  for (const auto* stripe : scanner_maps_) {
+    shared_lock<RWMutex> l(stripe->lock_);  // Shared lock, taken one stripe at a time.
+    for (const auto& it : stripe->scanners_by_id_) {
+      const SharedScanner& scanner = it.second;
+      const MonoTime start_time = scanner->start_time();
+      if (start_time + slow_threshold >= now) {  // Threshold not exceeded yet: not slow.
+        continue;
+      }
+      total++;
+    }
+  }
+  // NOTE(review): --show_slow_scans is not consulted here; confirm that is intended.
+  return total;
+}
+
 void ScannerManager::ListScanners(std::vector<SharedScanner>* scanners) const {
   for (const ScannerMapStripe* stripe : scanner_maps_) {
     shared_lock<RWMutex> l(stripe->lock_);
diff --git a/src/kudu/tserver/scanners.h b/src/kudu/tserver/scanners.h
index 531a33f68..0fbaa8ec5 100644
--- a/src/kudu/tserver/scanners.h
+++ b/src/kudu/tserver/scanners.h
@@ -107,6 +107,10 @@ class ScannerManager {
   // if under concurrent modifications.
   size_t CountActiveScanners() const;
 
+  // Return the number of scanners tracked by this manager whose start time
+  // is more than --slow_scanner_threshold_ms in the past.
+  size_t CountSlowScans() const;
+
   // List all active scanners.
   // Note this method will not return a consistent view
   // of all active scanners if under concurrent modifications.
diff --git a/src/kudu/tserver/tablet_server-test.cc b/src/kudu/tserver/tablet_server-test.cc
index f023f9fac..4916d796a 100644
--- a/src/kudu/tserver/tablet_server-test.cc
+++ b/src/kudu/tserver/tablet_server-test.cc
@@ -181,6 +181,7 @@ DECLARE_bool(enable_workload_score_for_perf_improvement_ops);
 DECLARE_bool(fail_dns_resolution);
 DECLARE_bool(rowset_metadata_store_keys);
 DECLARE_bool(scanner_unregister_on_invalid_seq_id);
+DECLARE_bool(show_slow_scans);
 DECLARE_double(cfile_inject_corruption);
 DECLARE_double(env_inject_eio);
 DECLARE_double(env_inject_full);
@@ -199,7 +200,9 @@ DECLARE_int32(metrics_retirement_age_ms);
 DECLARE_int32(rpc_service_queue_length);
 DECLARE_int32(scanner_batch_size_rows);
 DECLARE_int32(scanner_gc_check_interval_us);
+DECLARE_int32(scanner_inject_latency_on_each_batch_ms);
 DECLARE_int32(scanner_ttl_ms);
+DECLARE_int32(slow_scanner_threshold_ms);
 DECLARE_int32(tablet_bootstrap_inject_latency_ms);
 DECLARE_int32(tablet_inject_latency_on_apply_write_op_ms);
 DECLARE_int32(workload_stats_rate_collection_min_interval_ms);
@@ -226,6 +229,7 @@ METRIC_DECLARE_gauge_uint64(log_block_manager_containers);
 METRIC_DECLARE_gauge_size(active_scanners);
 METRIC_DECLARE_gauge_size(tablet_active_scanners);
 METRIC_DECLARE_gauge_size(num_rowsets_on_disk);
+METRIC_DECLARE_gauge_size(slow_scans);
 METRIC_DECLARE_histogram(flush_dms_duration);
 METRIC_DECLARE_histogram(op_apply_queue_length);
 METRIC_DECLARE_histogram(op_apply_queue_time);
@@ -559,6 +563,7 @@ TEST_F(TabletServerTest, TestWebPages) {
     // bugs in the past.
     ASSERT_STR_CONTAINS(buf.ToString(), "hybrid_clock_timestamp");
     ASSERT_STR_CONTAINS(buf.ToString(), "active_scanners");
+    ASSERT_STR_CONTAINS(buf.ToString(), "slow_scans");
     ASSERT_STR_CONTAINS(buf.ToString(), "threads_started");
     ASSERT_STR_CONTAINS(buf.ToString(), "code_cache_queries");
 #ifdef TCMALLOC_ENABLED
@@ -2283,6 +2288,55 @@ static const ReadMode kReadModes[] = {
 INSTANTIATE_TEST_SUITE_P(Params, ExpiredScannerParamTest,
                          testing::ValuesIn(kReadModes));
 
+class SlowScansParamTest :  // TabletServerTest fixture parameterized by ReadMode.
+    public TabletServerTest,
+    public ::testing::WithParamInterface<ReadMode> {
+};
+
+TEST_P(SlowScansParamTest, Test) {
+  const ReadMode mode = GetParam();
+  // --show_slow_scans is expected to be off by default; turn it on for this test.
+  ASSERT_FALSE(FLAGS_show_slow_scans);
+  FLAGS_show_slow_scans = true;
+  FLAGS_scanner_ttl_ms = 500;
+  // Inject 50 ms of latency per batch with a 1 ms slow threshold so the scan counts as slow.
+  FLAGS_scanner_inject_latency_on_each_batch_ms = 50;
+  FLAGS_slow_scanner_threshold_ms = 1;
+
+  int num_rows = 10000;
+  InsertTestRowsDirect(0, num_rows);
+
+  // Instantiate the slow_scans gauge on top of ScannerManager::CountSlowScans().
+  ASSERT_TRUE(mini_server_->server()->metric_entity());
+  auto slow_scans = METRIC_slow_scans.InstantiateFunctionGauge(
+      mini_server_->server()->metric_entity(), [this]() {
+          return this->mini_server_->server()->scanner_manager()->CountSlowScans(); });
+
+  // Initially, there've been no scanners, so none is slow.
+  ASSERT_EQ(0, slow_scans->value());
+
+  ScanResponsePB resp;
+  NO_FATALS(OpenScannerWithAllColumns(&resp, mode));
+  ScanRequestPB req;
+  RpcController rpc;
+  req.set_scanner_id(resp.scanner_id());
+  req.set_call_seq_id(1);
+  resp.Clear();
+  ASSERT_OK(proxy_->Scan(req, &resp, &rpc));
+  // The scanner's age should now exceed --slow_scanner_threshold_ms, so it counts as slow.
+  ASSERT_EQ(1, slow_scans->value());
+
+  // Make scanners expire quickly.
+  FLAGS_scanner_ttl_ms = 1;
+  // Once the scanner has expired, the gauge should eventually drop back to zero.
+  ASSERT_EVENTUALLY([&]() {
+    ASSERT_EQ(0, slow_scans->value());
+  });
+}
+
+INSTANTIATE_TEST_SUITE_P(Params, SlowScansParamTest,
+                         testing::ValuesIn(kReadModes));
+
 class ScanCorruptedDeltasParamTest :
     public TabletServerTest,
     public ::testing::WithParamInterface<ReadMode> {