You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by jd...@apache.org on 2017/12/04 23:22:47 UTC

kudu git commit: KUDU-2209. HybridClock doesn't handle changes in STA_NANO flag

Repository: kudu
Updated Branches:
  refs/heads/branch-1.5.x d4a47487b -> af119b68d


KUDU-2209. HybridClock doesn't handle changes in STA_NANO flag

Users have occasionally reported spurious crashes due to Kudu thinking
that another node has a time stamp from the future. After some debugging
I realized that the issue is that we currently capture the flag
'STA_NANO' from the kernel only at startup. This flag indicates whether
the kernel's sub-second timestamp is in nanoseconds or microseconds. We
initially assumed this was a static property of the kernel. However it
turns out that this flag can get toggled at runtime by ntp in certain
circumstances. Given this, it was possible for us to interpret a number
of nanoseconds as if it were microseconds, resulting in a timestamp up
to 1000 seconds in the future.

This patch changes the SystemNtp time source to always use the 'adjtime'
call to fetch the clock, and looks at the STA_NANO flag on every such
call rather than only at startup.

I checked the source for the ntp_gettime call that we used to use, and
it turns out it was implemented in terms of the same adjtime() call, so
there should be no performance penalty in this change.

This patch doesn't include tests since it's based on some kernel
functionality. However, I was able to test it as follows:

- wrote a simple program to print the time once a second
- stopped ntp, ran chrony, stopped chrony, and started ntpd
-- this has the side effect of clearing STA_NANO
- waited 10 minutes or so, and eventually ntp reset back to STA_NANO
-- this caused my test program to start printing incorrect timestamps
   as it was interpreting nanosecond values from the kernel as if they
   were microseconds

Change-Id: Iaa9241dea1304076ed5cfaec78779d15b11293ff
Reviewed-on: http://gerrit.cloudera.org:8080/8450
Reviewed-by: Alexey Serbin <as...@cloudera.com>
Tested-by: Alexey Serbin <as...@cloudera.com>
Reviewed-on: http://gerrit.cloudera.org:8080/8699
Reviewed-by: Jean-Daniel Cryans <jd...@apache.org>
Tested-by: Jean-Daniel Cryans <jd...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/af119b68
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/af119b68
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/af119b68

Branch: refs/heads/branch-1.5.x
Commit: af119b68df82c3fe42ccb02acee16ba549a51dfa
Parents: d4a4748
Author: Todd Lipcon <to...@apache.org>
Authored: Wed Nov 1 18:41:07 2017 -0700
Committer: Jean-Daniel Cryans <jd...@apache.org>
Committed: Mon Dec 4 22:48:11 2017 +0000

----------------------------------------------------------------------
 src/kudu/clock/system_ntp.cc | 67 ++++++++++++---------------------------
 src/kudu/clock/system_ntp.h  |  7 +---
 2 files changed, 21 insertions(+), 53 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/af119b68/src/kudu/clock/system_ntp.cc
----------------------------------------------------------------------
diff --git a/src/kudu/clock/system_ntp.cc b/src/kudu/clock/system_ntp.cc
index 9cdac05..d1e8d56 100644
--- a/src/kudu/clock/system_ntp.cc
+++ b/src/kudu/clock/system_ntp.cc
@@ -22,12 +22,9 @@
 
 #include <cerrno>
 #include <ostream>
-#include <string>
 
 #include <glog/logging.h>
 
-#include "kudu/gutil/port.h"
-#include "kudu/gutil/strings/substitute.h"
 #include "kudu/util/errno.h"
 #include "kudu/util/logging.h"
 #include "kudu/util/status.h"
@@ -35,37 +32,21 @@
 namespace kudu {
 namespace clock {
 
-using strings::Substitute;
-
 const double SystemNtp::kAdjtimexScalingFactor = 65536;
-const uint64_t SystemNtp::kNanosPerSec = 1000000;
+const uint64_t SystemNtp::kMicrosPerSec = 1000000;
 
 namespace {
 
-// Returns the clock modes and checks if the clock is synchronized.
-Status GetClockModes(timex* timex) {
-  // this makes ntp_adjtime a read-only call
-  timex->modes = 0;
-  int rc = ntp_adjtime(timex);
-  if (PREDICT_FALSE(rc == TIME_ERROR)) {
-    return Status::ServiceUnavailable(
-        Substitute("Error reading clock. Clock considered unsynchronized. Return code: $0", rc));
-  }
-  // TODO what to do about leap seconds? see KUDU-146
-  if (PREDICT_FALSE(rc != TIME_OK)) {
-    LOG(ERROR) << Substitute("TODO Server undergoing leap second. Return code: $0", rc);
-  }
-  return Status::OK();
-}
-
 // Returns the current time/max error and checks if the clock is synchronized.
-Status GetClockTime(ntptimeval* timeval) {
-  int rc = ntp_gettime(timeval);
+Status CallAdjTime(timex* tx) {
+  // Set mode to 0 to query the current time.
+  tx->modes = 0;
+  int rc = ntp_adjtime(tx);
   switch (rc) {
     case TIME_OK:
       return Status::OK();
     case -1: // generic error
-      return Status::ServiceUnavailable("Error reading clock. ntp_gettime() failed",
+      return Status::ServiceUnavailable("Error reading clock. ntp_adjtime() failed",
                                         ErrnoToString(errno));
     case TIME_ERROR:
       return Status::ServiceUnavailable("Error reading clock. Clock considered unsynchronized");
@@ -80,30 +61,16 @@ Status GetClockTime(ntptimeval* timeval) {
 } // anonymous namespace
 
 Status SystemNtp::Init() {
-  // Read the current time. This will return an error if the clock is not synchronized.
-  uint64_t now_usec;
-  uint64_t error_usec;
-  RETURN_NOT_OK(WalltimeWithError(&now_usec, &error_usec));
-
   timex timex;
-  RETURN_NOT_OK(GetClockModes(&timex));
-  // read whether the STA_NANO bit is set to know whether we'll get back nanos
-  // or micros in timeval.time.tv_usec. See:
-  // http://stackoverflow.com/questions/16063408/does-ntp-gettime-actually-return-nanosecond-precision
-  // set the timeval.time.tv_usec divisor so that we always get micros
-  if (timex.status & STA_NANO) {
-    divisor_ = 1000;
-  } else {
-    divisor_ = 1;
-  }
+  RETURN_NOT_OK(CallAdjTime(&timex));
 
   // Calculate the sleep skew adjustment according to the max tolerance of the clock.
   // Tolerance comes in parts per million but needs to be applied a scaling factor.
   skew_ppm_ = timex.tolerance / kAdjtimexScalingFactor;
 
-  LOG(INFO) << "NTP initialized. Resolution in nanos?: " << (divisor_ == 1000)
-            << " Skew PPM: " << skew_ppm_
-            << " Current error: " << error_usec;
+  LOG(INFO) << "NTP initialized."
+            << " Skew: " << skew_ppm_ << "ppm"
+            << " Current error: " << timex.maxerror <<  "us";
 
   return Status::OK();
 }
@@ -112,10 +79,16 @@ Status SystemNtp::Init() {
 Status SystemNtp::WalltimeWithError(uint64_t *now_usec,
                                     uint64_t *error_usec) {
   // Read the time. This will return an error if the clock is not synchronized.
-  ntptimeval timeval;
-  RETURN_NOT_OK(GetClockTime(&timeval));
-  *now_usec = timeval.time.tv_sec * kNanosPerSec + timeval.time.tv_usec / divisor_;
-  *error_usec = timeval.maxerror;
+  timex tx;
+  RETURN_NOT_OK(CallAdjTime(&tx));
+
+  if (tx.status & STA_NANO) {
+    tx.time.tv_usec /= 1000;
+  }
+  DCHECK_LE(tx.time.tv_usec, 1e6);
+
+  *now_usec = tx.time.tv_sec * kMicrosPerSec + tx.time.tv_usec;
+  *error_usec = tx.maxerror;
   return Status::OK();
 }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/af119b68/src/kudu/clock/system_ntp.h
----------------------------------------------------------------------
diff --git a/src/kudu/clock/system_ntp.h b/src/kudu/clock/system_ntp.h
index 546b2c2..ed66f33 100644
--- a/src/kudu/clock/system_ntp.h
+++ b/src/kudu/clock/system_ntp.h
@@ -50,12 +50,7 @@ class SystemNtp : public TimeService {
   // "scale factor used by adjtimex freq param.  1 ppm = 65536"
   static const double kAdjtimexScalingFactor;
 
-  static const uint64_t kNanosPerSec;
-
-  // The divisor for the 'timeval.time.tv_usec' returned from the kernel.
-  // Despite the name 'tv_usec', in fact in some cases it will return a nanosecond
-  // value, in which case we may need to divide.
-  uint64_t divisor_ = 1;
+  static const uint64_t kMicrosPerSec;
 
   // The skew rate in PPM reported by the kernel.
   uint64_t skew_ppm_ = 0;