You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2023/02/08 18:33:12 UTC

[kudu] branch master updated: [clock] output more info in SystemNtp::DumpDiagnostics()

This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new e65ea38a4 [clock] output more info in SystemNtp::DumpDiagnostics()
e65ea38a4 is described below

commit e65ea38a4860c007d93ada9c991bccec903a80b1
Author: Alexey Serbin <al...@apache.org>
AuthorDate: Sun Feb 5 22:11:14 2023 -0800

    [clock] output more info in SystemNtp::DumpDiagnostics()
    
    This patch adds more details to the output of the DumpDiagnostics()
    method for the system clock synchronized with NTP.  In particular,
    the output now contains relevant metrics for the hybrid clock, the
    output of the 'ntpstat' utility (if present), the output of
    'chronyc activity' (if the 'chronyc' CLI is present), and a snapshot
    of the newly introduced 'clock_ntp_status' metric.
    
    As already mentioned, this patch also adds a new 'clock_ntp_status'
    metric that provides information on ntp_gettime()/ntp_adjtime() call
    result.  It's a string containing current timestamp (usec),
    max error (usec), and the result status of the call.
    
    The corresponding HybridClockTest.TestNtpDiagnostics scenario has been
    updated correspondingly.
    
    Change-Id: Ibb74e81ba1e4bc4c60acc6bd49bb458085e28207
    Reviewed-on: http://gerrit.cloudera.org:8080/19478
    Tested-by: Kudu Jenkins
    Reviewed-by: Ashwani Raina <ar...@cloudera.com>
    Reviewed-by: Yifan Zhang <ch...@163.com>
---
 src/kudu/clock/hybrid_clock-test.cc |  9 +++++
 src/kudu/clock/hybrid_clock.cc      |  2 +-
 src/kudu/clock/system_ntp.cc        | 75 ++++++++++++++++++++++++++++++++++++-
 src/kudu/clock/system_ntp.h         | 17 ++++++++-
 4 files changed, 99 insertions(+), 4 deletions(-)

diff --git a/src/kudu/clock/hybrid_clock-test.cc b/src/kudu/clock/hybrid_clock-test.cc
index 80588a723..4c9ec5ee2 100644
--- a/src/kudu/clock/hybrid_clock-test.cc
+++ b/src/kudu/clock/hybrid_clock-test.cc
@@ -577,10 +577,19 @@ TEST_F(HybridClockTest, TestNtpDiagnostics) {
   // error messages that corresponding binaries cannot be found. In either case
   // it proves the tools were attempted to run to collect NTP-related diagnostic
   // information.
+  ASSERT_STR_CONTAINS(s, "ntpstat");
   ASSERT_STR_CONTAINS(s, "ntptime");
   ASSERT_STR_CONTAINS(s, "ntpq");
   ASSERT_STR_CONTAINS(s, "ntpdc");
   ASSERT_STR_CONTAINS(s, "chronyc");
+
+  // Verify the output contains relevant metrics.
+  ASSERT_STR_CONTAINS(s, "clock_ntp_status");
+  ASSERT_STR_CONTAINS(s, "hybrid_clock_extrapolating");
+  ASSERT_STR_CONTAINS(s, "hybrid_clock_error");
+  ASSERT_STR_CONTAINS(s, "hybrid_clock_timestamp");
+  ASSERT_STR_CONTAINS(s, "hybrid_clock_max_errors");
+  ASSERT_STR_CONTAINS(s, "hybrid_clock_extrapolation_intervals");
 }
 #endif // #if defined(KUDU_HAS_SYSTEM_TIME_SOURCE) ...
 
diff --git a/src/kudu/clock/hybrid_clock.cc b/src/kudu/clock/hybrid_clock.cc
index d217c07ed..8c64c23be 100644
--- a/src/kudu/clock/hybrid_clock.cc
+++ b/src/kudu/clock/hybrid_clock.cc
@@ -542,7 +542,7 @@ Status HybridClock::InitWithTimeSource(TimeSource time_source) {
       break;
 #if defined(KUDU_HAS_SYSTEM_TIME_SOURCE)
     case TimeSource::NTP_SYNC_SYSTEM:
-      time_service_.reset(new clock::SystemNtp);
+      time_service_.reset(new clock::SystemNtp(metric_entity_));
       break;
 #endif
     case TimeSource::UNSYNC_SYSTEM:
diff --git a/src/kudu/clock/system_ntp.cc b/src/kudu/clock/system_ntp.cc
index 78d7e9b42..4c2debbc7 100644
--- a/src/kudu/clock/system_ntp.cc
+++ b/src/kudu/clock/system_ntp.cc
@@ -21,6 +21,7 @@
 #include <sys/timex.h>
 
 #include <cerrno>
+#include <functional>
 #include <limits>
 #include <ostream>
 #include <string>
@@ -33,7 +34,9 @@
 #include "kudu/gutil/strings/join.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/util/errno.h"
+#include "kudu/util/jsonwriter.h"
 #include "kudu/util/logging.h"
+#include "kudu/util/metrics.h"
 #include "kudu/util/path_util.h"
 #include "kudu/util/status.h"
 #include "kudu/util/subprocess.h"
@@ -44,6 +47,11 @@ using std::string;
 using std::vector;
 using strings::Substitute;
 
+METRIC_DEFINE_gauge_string(server, clock_ntp_status, "Clock NTP Status String",
+                           kudu::MetricUnit::kState,
+                           "Output of ntp_adjtime()/ntp_gettime() kernel API call",
+                           kudu::MetricLevel::kDebug);
+
 namespace kudu {
 namespace clock {
 
@@ -114,8 +122,9 @@ void TryRun(vector<string> cmd, vector<string>* log) {
 
 } // anonymous namespace
 
-SystemNtp::SystemNtp()
-    : skew_ppm_(std::numeric_limits<int64_t>::max()) {
+SystemNtp::SystemNtp(const scoped_refptr<MetricEntity>& metric_entity)
+    : skew_ppm_(std::numeric_limits<int64_t>::max()),
+      metric_entity_(metric_entity) {
 }
 
 Status SystemNtp::Init() {
@@ -131,6 +140,10 @@ Status SystemNtp::Init() {
   skew_ppm_ = t.tolerance >> 16;
   VLOG(1) << "ntp_adjtime(): tolerance is " << t.tolerance;
 
+  METRIC_clock_ntp_status.InstantiateFunctionGauge(
+      metric_entity_, []() { return ClockNtpStatusForMetrics(); })->
+          AutoDetachToLastValue(&metric_detacher_);
+
   return Status::OK();
 }
 
@@ -164,6 +177,8 @@ Status SystemNtp::WalltimeWithError(uint64_t* now_usec, uint64_t* error_usec) {
 
 void SystemNtp::DumpDiagnostics(vector<string>* log) const {
   LOG_STRING(ERROR, log) << "Dumping NTP diagnostics";
+
+  TryRun({"ntpstat"}, log);
   TryRun({"ntptime"}, log);
   // Gather as much info as possible from both ntpq and ntpdc, even
   // though some of it might be redundant. Different versions of ntp
@@ -185,6 +200,62 @@ void SystemNtp::DumpDiagnostics(vector<string>* log) const {
 
   TryRun({"chronyc", "-n", "tracking"}, log);
   TryRun({"chronyc", "-n", "sources"}, log);
+  TryRun({"chronyc", "-n", "activity"}, log);
+
+  MetricJsonOptions opts;
+  // There are relevant histogram metrics such as 'hybrid_clock_max_errors' and
+  // 'hybrid_clock_extrapolation_intervals' to output.
+  opts.include_raw_histograms = true;
+  // No need to output attributes -- that information could be retrieved from
+  // elsewhere, if needed.
+  opts.include_entity_attributes = false;
+  opts.filters.entity_metrics.emplace_back("clock_ntp_status");
+  opts.filters.entity_metrics.emplace_back("hybrid_clock_extrapolating");
+  opts.filters.entity_metrics.emplace_back("hybrid_clock_error");
+  opts.filters.entity_metrics.emplace_back("hybrid_clock_timestamp");
+  opts.filters.entity_metrics.emplace_back("hybrid_clock_max_errors");
+  opts.filters.entity_metrics.emplace_back("hybrid_clock_extrapolation_intervals");
+
+  // Collect the metrics in JSON pretty-printed format.
+  std::ostringstream buf;
+  JsonWriter writer(&buf, JsonWriter::PRETTY);
+  if (auto s = metric_entity_->WriteAsJson(&writer, opts); !s.ok()) {
+    LOG_STRING(WARNING, log) << "failed to dump relevant clock metrics: " << s.ToString();
+  } else {
+    LOG_STRING(ERROR, log) << buf.str();
+  }
+}
+
+string SystemNtp::ClockNtpStatusForMetrics() {
+  static constexpr const char* const kFmt = "now:$0 maxerror:$1 status:$2";
+
+#ifdef __APPLE__
+  ntptimeval t;
+  if (const auto s = NtpStateToStatus(ntp_gettime(&t)); !s.ok()) {
+    return Substitute(kFmt, "n/a", "n/a", s.ToString());
+  }
+  const uint64_t now_usec = static_cast<uint64_t>(t.time.tv_sec) * 1000000 +
+      t.time.tv_nsec / 1000;
+#else
+  timex t;
+  t.modes = 0; // set mode to 0 for read-only query
+  const int rc = ntp_adjtime(&t);
+
+  if (const auto s = CheckForNtpAdjtimeError(rc); !s.ok()) {
+    return Substitute(kFmt, "n/a", "n/a", s.ToString());
+  }
+  if (const auto s = NtpStateToStatus(rc); !s.ok()) {
+    return Substitute(kFmt, "n/a", "n/a", s.ToString());
+  }
+
+  if (t.status & STA_NANO) {
+    t.time.tv_usec /= 1000;
+  }
+  const uint64_t now_usec = static_cast<uint64_t>(t.time.tv_sec) * 1000000 +
+      t.time.tv_usec;
+#endif // #ifdef __APPLE__ ... #else ...
+
+  return Substitute(kFmt, now_usec, t.maxerror, "ok");
 }
 
 } // namespace clock
diff --git a/src/kudu/clock/system_ntp.h b/src/kudu/clock/system_ntp.h
index 90ec93ac5..b79314d12 100644
--- a/src/kudu/clock/system_ntp.h
+++ b/src/kudu/clock/system_ntp.h
@@ -22,6 +22,8 @@
 
 #include "kudu/clock/time_service.h"
 #include "kudu/gutil/macros.h"
+#include "kudu/gutil/ref_counted.h"
+#include "kudu/util/metrics.h"
 #include "kudu/util/status.h"
 
 namespace kudu {
@@ -35,7 +37,7 @@ namespace clock {
 // to keep the kernel's timekeeping up to date and in sync.
 class SystemNtp : public TimeService {
  public:
-  SystemNtp();
+  explicit SystemNtp(const scoped_refptr<MetricEntity>& metric_entity);
 
   Status Init() override;
 
@@ -48,10 +50,23 @@ class SystemNtp : public TimeService {
   void DumpDiagnostics(std::vector<std::string>* log) const override;
 
  private:
+  // Returns current timestamp, maxerror, and status as returned by the
+  // invocation of ntp_gettime()/ntp_adjtime() NTP kernel API call.
+  static std::string ClockNtpStatusForMetrics();
+
   // The maximum possible clock frequency skew rate reported by the kernel,
   // parts-per-million (PPM).
   int64_t skew_ppm_;
 
+  // Metric entity. Used to fetch information on NTP-related metrics upon
+  // calling DumpDiagnostics().
+  scoped_refptr<MetricEntity> metric_entity_;
+
+  // Metrics are set to detach to their last value. This means that, during
+  // running the destructor, there might be a need to access other class members
+  // declared above. Hence, this member must be declared last.
+  FunctionGaugeDetacher metric_detacher_;
+
   DISALLOW_COPY_AND_ASSIGN(SystemNtp);
 };