You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by la...@apache.org on 2023/03/16 16:09:11 UTC
[kudu] branch master updated: [server] add 'slow_scans' metric for tablet server
This is an automated email from the ASF dual-hosted git repository.
laiyingchun pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 7133b894e [server] add 'slow_scans' metric for tablet server
7133b894e is described below
commit 7133b894e87c705353753c25e83c56b683e6d805
Author: kedeng <kd...@gmail.com>
AuthorDate: Mon Feb 27 16:07:16 2023 +0800
[server] add 'slow_scans' metric for tablet server
As a supplement to the slow scan display, I added
a new metric, 'slow_scans', to facilitate monitoring.
At the same time, I added a test to ensure that the
new metric takes effect.
Change-Id: I51e64bc4602f0e1dd99207f7d4cf8883085eca9a
Reviewed-on: http://gerrit.cloudera.org:8080/19545
Tested-by: Kudu Jenkins
Reviewed-by: Yingchun Lai <la...@apache.org>
---
src/kudu/tserver/scanners.cc | 29 ++++++++++++++++++
src/kudu/tserver/scanners.h | 4 +++
src/kudu/tserver/tablet_server-test.cc | 54 ++++++++++++++++++++++++++++++++++
3 files changed, 87 insertions(+)
diff --git a/src/kudu/tserver/scanners.cc b/src/kudu/tserver/scanners.cc
index d28f1a4aa..c53b331a1 100644
--- a/src/kudu/tserver/scanners.cc
+++ b/src/kudu/tserver/scanners.cc
@@ -91,6 +91,13 @@ METRIC_DEFINE_gauge_size(server, active_scanners,
"Number of scanners that are currently active",
kudu::MetricLevel::kInfo);
+METRIC_DEFINE_gauge_size(server, slow_scans,  // 'server'-entity size gauge; the ScannerManager constructor binds it to CountSlowScans().
+ "Slow Scans",
+ kudu::MetricUnit::kScanners,
+ "Number of slow scanners that are defined by --slow_scanner_threshold_ms "
+ "if --show_slow_scans set to 'true'.",
+ kudu::MetricLevel::kWarn);
+
using kudu::rpc::RemoteUser;
using kudu::tablet::TabletReplica;
using std::string;
@@ -133,6 +140,9 @@ ScannerManager::ScannerManager(const scoped_refptr<MetricEntity>& metric_entity)
METRIC_active_scanners.InstantiateFunctionGauge(
metric_entity, [this]() { return this->CountActiveScanners(); })
->AutoDetach(&metric_detacher_);
+ METRIC_slow_scans.InstantiateFunctionGauge(
+ metric_entity, [this]() { return this->CountSlowScans(); })
+ ->AutoDetach(&metric_detacher_);
}
for (size_t i = 0; i < kNumScannerMapStripes; i++) {
scanner_maps_.push_back(new ScannerMapStripe());
@@ -279,6 +289,25 @@ size_t ScannerManager::CountActiveScanners() const {
return total;
}
+size_t ScannerManager::CountSlowScans() const {
+  // A scanner is counted once more than --slow_scanner_threshold_ms has
+  // elapsed since its start time.
+  size_t num_slow = 0;
+  const MonoTime now = MonoTime::Now();
+  const MonoDelta threshold = MonoDelta::FromMilliseconds(FLAGS_slow_scanner_threshold_ms);
+  for (const ScannerMapStripe* stripe : scanner_maps_) {
+    // Each stripe is read-locked separately while its scanners are examined.
+    shared_lock<RWMutex> l(stripe->lock_);
+    for (const auto& id_and_scanner : stripe->scanners_by_id_) {
+      const SharedScanner& candidate = id_and_scanner.second;
+      const bool within_threshold = candidate->start_time() + threshold >= now;
+      if (!within_threshold) {
+        ++num_slow;
+      }
+    }
+  }
+  return num_slow;
+}
+
void ScannerManager::ListScanners(std::vector<SharedScanner>* scanners) const {
for (const ScannerMapStripe* stripe : scanner_maps_) {
shared_lock<RWMutex> l(stripe->lock_);
diff --git a/src/kudu/tserver/scanners.h b/src/kudu/tserver/scanners.h
index 531a33f68..0fbaa8ec5 100644
--- a/src/kudu/tserver/scanners.h
+++ b/src/kudu/tserver/scanners.h
@@ -107,6 +107,10 @@ class ScannerManager {
// if under concurrent modifications.
size_t CountActiveScanners() const;
+ // Return the number of registered scanners whose start time is more
+ // than --slow_scanner_threshold_ms in the past.
+ size_t CountSlowScans() const;
+
// List all active scanners.
// Note this method will not return a consistent view
// of all active scanners if under concurrent modifications.
diff --git a/src/kudu/tserver/tablet_server-test.cc b/src/kudu/tserver/tablet_server-test.cc
index f023f9fac..4916d796a 100644
--- a/src/kudu/tserver/tablet_server-test.cc
+++ b/src/kudu/tserver/tablet_server-test.cc
@@ -181,6 +181,7 @@ DECLARE_bool(enable_workload_score_for_perf_improvement_ops);
DECLARE_bool(fail_dns_resolution);
DECLARE_bool(rowset_metadata_store_keys);
DECLARE_bool(scanner_unregister_on_invalid_seq_id);
+DECLARE_bool(show_slow_scans);
DECLARE_double(cfile_inject_corruption);
DECLARE_double(env_inject_eio);
DECLARE_double(env_inject_full);
@@ -199,7 +200,9 @@ DECLARE_int32(metrics_retirement_age_ms);
DECLARE_int32(rpc_service_queue_length);
DECLARE_int32(scanner_batch_size_rows);
DECLARE_int32(scanner_gc_check_interval_us);
+DECLARE_int32(scanner_inject_latency_on_each_batch_ms);
DECLARE_int32(scanner_ttl_ms);
+DECLARE_int32(slow_scanner_threshold_ms);
DECLARE_int32(tablet_bootstrap_inject_latency_ms);
DECLARE_int32(tablet_inject_latency_on_apply_write_op_ms);
DECLARE_int32(workload_stats_rate_collection_min_interval_ms);
@@ -226,6 +229,7 @@ METRIC_DECLARE_gauge_uint64(log_block_manager_containers);
METRIC_DECLARE_gauge_size(active_scanners);
METRIC_DECLARE_gauge_size(tablet_active_scanners);
METRIC_DECLARE_gauge_size(num_rowsets_on_disk);
+METRIC_DECLARE_gauge_size(slow_scans);
METRIC_DECLARE_histogram(flush_dms_duration);
METRIC_DECLARE_histogram(op_apply_queue_length);
METRIC_DECLARE_histogram(op_apply_queue_time);
@@ -559,6 +563,7 @@ TEST_F(TabletServerTest, TestWebPages) {
// bugs in the past.
ASSERT_STR_CONTAINS(buf.ToString(), "hybrid_clock_timestamp");
ASSERT_STR_CONTAINS(buf.ToString(), "active_scanners");
+ ASSERT_STR_CONTAINS(buf.ToString(), "slow_scans");
ASSERT_STR_CONTAINS(buf.ToString(), "threads_started");
ASSERT_STR_CONTAINS(buf.ToString(), "code_cache_queries");
#ifdef TCMALLOC_ENABLED
@@ -2283,6 +2288,55 @@ static const ReadMode kReadModes[] = {
INSTANTIATE_TEST_SUITE_P(Params, ExpiredScannerParamTest,
testing::ValuesIn(kReadModes));
+// Fixture for the 'slow_scans' metric test, parameterized over the scan read mode.
+class SlowScansParamTest :
+    public TabletServerTest,
+    public ::testing::WithParamInterface<ReadMode> {
+};
+
+// Opens a scanner, drives one Scan() call with injected per-batch latency and
+// checks that the 'slow_scans' gauge goes 0 -> 1 -> 0 as the scanner becomes
+// slow and then expires.
+TEST_P(SlowScansParamTest, Test) {
+  const ReadMode read_mode = GetParam();
+
+  // Showing slow scans is off by default; switch it on for this test.
+  ASSERT_FALSE(FLAGS_show_slow_scans);
+  FLAGS_show_slow_scans = true;
+  FLAGS_scanner_ttl_ms = 500;
+  // Inject latency on each batch and use a tiny threshold so that the scan
+  // below is classified as slow.
+  FLAGS_scanner_inject_latency_on_each_batch_ms = 50;
+  FLAGS_slow_scanner_threshold_ms = 1;
+
+  constexpr int kNumRows = 10000;
+  InsertTestRowsDirect(0, kNumRows);
+
+  // Get a handle on the slow scans gauge of the server's metric entity.
+  const auto& entity = mini_server_->server()->metric_entity();
+  ASSERT_TRUE(entity);
+  auto slow_scans_gauge = METRIC_slow_scans.InstantiateFunctionGauge(
+      entity, [this]() {
+        return mini_server_->server()->scanner_manager()->CountSlowScans();
+      });
+
+  // No scanner has been opened yet, so nothing can be slow.
+  ASSERT_EQ(0, slow_scans_gauge->value());
+
+  // Open a scanner, then continue it with a second call.
+  ScanResponsePB scan_resp;
+  NO_FATALS(OpenScannerWithAllColumns(&scan_resp, read_mode));
+  RpcController controller;
+  ScanRequestPB scan_req;
+  scan_req.set_scanner_id(scan_resp.scanner_id());
+  scan_req.set_call_seq_id(1);
+  scan_resp.Clear();
+  ASSERT_OK(proxy_->Scan(scan_req, &scan_resp, &controller));
+  // By now the scanner is older than '--slow_scanner_threshold_ms'.
+  ASSERT_EQ(1, slow_scans_gauge->value());
+
+  // Shrink the TTL so that the scanner expires quickly ...
+  FLAGS_scanner_ttl_ms = 1;
+  // ... and wait for the gauge to drop back to zero.
+  ASSERT_EVENTUALLY([&]() {
+    ASSERT_EQ(0, slow_scans_gauge->value());
+  });
+}
+
+INSTANTIATE_TEST_SUITE_P(Params, SlowScansParamTest,
+                         testing::ValuesIn(kReadModes));
+
class ScanCorruptedDeltasParamTest :
public TabletServerTest,
public ::testing::WithParamInterface<ReadMode> {