You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafficserver.apache.org by am...@apache.org on 2017/01/06 17:56:08 UTC
[trafficserver] branch master updated: TS-5056: Implement
nonrecoverable error mechanism This closes #1224.
This is an automated email from the ASF dual-hosted git repository.
amc pushed a commit to branch master
in repository https://git-dual.apache.org/repos/asf/trafficserver.git
The following commit(s) were added to refs/heads/master by this push:
new 24347df TS-5056: Implement nonrecoverable error mechanism This closes #1224.
24347df is described below
commit 24347df553e91660b794af1079fbf69f986dc5b4
Author: Daniel Xu <dl...@yahoo.com>
AuthorDate: Wed Nov 16 14:21:27 2016 -0600
TS-5056: Implement nonrecoverable error mechanism
This closes #1224.
Change `Emergency()` to terminate the current process with status
code UNRECOVERABLE_EXIT.
Also change traffic_manager to check traffic_server's exit status for
the UNRECOVERABLE_EXIT code. Once TM sees that code, it will not try
to restart TS from that point forward.
This was designed so that traffic_server could call Emergency()
in the event of a nonrecoverable error such as a bad config file.
No amount of restarting TS will fix a bad config, so we might as well
have TM wait for human intervention.
Note that if traffic_cop or traffic_manager calls Emergency(),
nothing unexpected should happen, since for those processes the only
visible change from this patch is the exit status code.
---
cmd/traffic_manager/traffic_manager.cc | 6 +++++-
lib/ts/Diags.cc | 7 ++++++-
lib/ts/ink_error.cc | 32 ++++++++++++++++++++++++++------
lib/ts/ink_error.h | 10 ++++++++++
mgmt/LocalManager.cc | 11 +++++++++++
mgmt/LocalManager.h | 1 +
6 files changed, 59 insertions(+), 8 deletions(-)
diff --git a/cmd/traffic_manager/traffic_manager.cc b/cmd/traffic_manager/traffic_manager.cc
index 87f3bac..ce33925 100644
--- a/cmd/traffic_manager/traffic_manager.cc
+++ b/cmd/traffic_manager/traffic_manager.cc
@@ -799,7 +799,7 @@ main(int argc, const char **argv)
break;
}
- if (lmgmt->run_proxy && !lmgmt->processRunning()) { /* Make sure we still have a proxy up */
+ if (lmgmt->run_proxy && !lmgmt->processRunning() && lmgmt->proxy_recoverable) { /* Make sure we still have a proxy up */
if (sleep_time) {
mgmt_log("Relaunching proxy after %d sec...", sleep_time);
millisleep(1000 * sleep_time); // we use millisleep instead of sleep because it doesnt interfere with signals
@@ -814,6 +814,10 @@ main(int argc, const char **argv)
just_started++;
}
} else { /* Give the proxy a chance to fire up */
+ if (!lmgmt->proxy_recoverable) {
+ mgmt_log("[main] Proxy is un-recoverable. Proxy will not be relaunched.\n");
+ }
+
just_started++;
}
diff --git a/lib/ts/Diags.cc b/lib/ts/Diags.cc
index 85b3465..da8c5f2 100644
--- a/lib/ts/Diags.cc
+++ b/lib/ts/Diags.cc
@@ -552,7 +552,12 @@ Diags::error_va(DiagsLevel level, const SourceLocation *loc, const char *format_
if (cleanup_func) {
cleanup_func();
}
- ink_fatal_va(format_string, ap2);
+
+ // DL_Emergency means the process cannot recover from a reboot
+ if (level == DL_Emergency)
+ ink_emergency_va(format_string, ap2);
+ else
+ ink_fatal_va(format_string, ap2);
}
va_end(ap2);
diff --git a/lib/ts/ink_error.cc b/lib/ts/ink_error.cc
index 7d7bf1e..d38bafb 100644
--- a/lib/ts/ink_error.cc
+++ b/lib/ts/ink_error.cc
@@ -35,12 +35,12 @@
*/
static void
-fatal_va(const char *fmt, va_list ap)
+fatal_va(const char *hdr, const char *fmt, va_list ap)
{
char msg[1024];
- const size_t len = sizeof("FATAL: ") - 1;
+ const size_t len = strlen(hdr);
- strncpy(msg, "FATAL: ", sizeof(msg));
+ strncpy(msg, hdr, sizeof(msg));
vsnprintf(msg + len, sizeof(msg) - len, fmt, ap);
msg[sizeof(msg) - 1] = 0;
@@ -51,7 +51,7 @@ fatal_va(const char *fmt, va_list ap)
void
ink_fatal_va(const char *fmt, va_list ap)
{
- fatal_va(fmt, ap);
+ fatal_va("Fatal: ", fmt, ap);
::exit(70); // 70 corresponds to EX_SOFTWARE in BSD's sysexits. As good a status as any.
}
@@ -61,19 +61,39 @@ ink_fatal(const char *message_format, ...)
va_list ap;
va_start(ap, message_format);
- fatal_va(message_format, ap);
+ fatal_va("Fatal: ", message_format, ap);
va_end(ap);
::exit(70); // 70 corresponds to EX_SOFTWARE in BSD's sysexits. As good a status as any.
}
void
+ink_emergency_va(const char *fmt, va_list ap)
+{
+ fatal_va("Emergency: ", fmt, ap);
+ ::exit(UNRECOVERABLE_EXIT);
+}
+
+void
+ink_emergency(const char *message_format, ...)
+{
+ va_list ap;
+
+ va_start(ap, message_format);
+ ink_emergency_va(message_format, ap);
+ // Should never reach here since ink_emergency_va calls exit()
+ va_end(ap);
+
+ ::exit(UNRECOVERABLE_EXIT);
+}
+
+void
ink_abort(const char *message_format, ...)
{
va_list ap;
va_start(ap, message_format);
- fatal_va(message_format, ap);
+ fatal_va("Fatal: ", message_format, ap);
va_end(ap);
abort();
diff --git a/lib/ts/ink_error.h b/lib/ts/ink_error.h
index d0b7651..34309f9 100644
--- a/lib/ts/ink_error.h
+++ b/lib/ts/ink_error.h
@@ -36,6 +36,16 @@
#include "ts/ink_platform.h"
#include "ts/ink_apidefs.h"
+// This magic exit code is used to signal that the crashing process cannot
+// be recovered from a restart of said process
+//
+// Originally, this was intended to be used as a backchannel mechanism whereby
+// traffic_server can tell traffic_manager via an exit code to stop trying to restart
+// traffic_server b/c (for example) traffic_server has a bad config file
+#define UNRECOVERABLE_EXIT 33
+
+void ink_emergency_va(const char *fmt, va_list ap) TS_NORETURN;
+void ink_emergency(const char *message_format, ...) TS_PRINTFLIKE(1, 2) TS_NORETURN;
void ink_fatal_va(const char *message_format, va_list ap) TS_NORETURN;
void ink_fatal(const char *message_format, ...) TS_PRINTFLIKE(1, 2) TS_NORETURN;
void ink_abort(const char *message_format, ...) TS_PRINTFLIKE(1, 2) TS_NORETURN;
diff --git a/mgmt/LocalManager.cc b/mgmt/LocalManager.cc
index 29fd3ca..835c0f4 100644
--- a/mgmt/LocalManager.cc
+++ b/mgmt/LocalManager.cc
@@ -24,6 +24,7 @@
#include "ts/ink_platform.h"
#include "ts/ink_sock.h"
#include "ts/ink_file.h"
+#include "ts/ink_error.h"
#include "MgmtUtils.h"
#include "ts/I_Layout.h"
#include "LocalManager.h"
@@ -185,6 +186,7 @@ LocalManager::LocalManager(bool proxy_on) : BaseManager(), run_proxy(proxy_on),
syslog_facility = 0;
ccom = nullptr;
+ proxy_recoverable = true;
proxy_started_at = -1;
proxy_launch_count = 0;
manager_started_at = time(nullptr);
@@ -493,6 +495,15 @@ LocalManager::pollMgmtProcessServer()
if (WIFSIGNALED(estatus)) {
int sig = WTERMSIG(estatus);
mgmt_log("[LocalManager::pollMgmtProcessServer] Server Process terminated due to Sig %d: %s\n", sig, strsignal(sig));
+ } else if (WIFEXITED(estatus)) {
+ int return_code = WEXITSTATUS(estatus);
+
+ // traffic_server's exit code will be UNRECOVERABLE_EXIT if it calls
+ // ink_emergency() or ink_emergency_va(). The call signals that traffic_server
+ // cannot be recovered with a reboot. In other words, catastrophic failure.
+ if (return_code == UNRECOVERABLE_EXIT) {
+ proxy_recoverable = false;
+ }
}
if (lmgmt->run_proxy) {
diff --git a/mgmt/LocalManager.h b/mgmt/LocalManager.h
index 6d88423..f82e859 100644
--- a/mgmt/LocalManager.h
+++ b/mgmt/LocalManager.h
@@ -91,6 +91,7 @@ public:
bool clusterOk();
volatile bool run_proxy;
+ volatile bool proxy_recoverable; // false if traffic_server cannot recover with a reboot
volatile time_t manager_started_at;
volatile time_t proxy_started_at;
volatile int proxy_launch_count;
--
To stop receiving notification emails like this one, please contact
['"commits@trafficserver.apache.org" <co...@trafficserver.apache.org>'].