You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafficserver.apache.org by ig...@apache.org on 2013/01/14 18:15:28 UTC

[2/5] git commit: traffic_cop: wait() for children on SIGTERM

traffic_cop: wait() for children on SIGTERM

rework traffic_cop's signal handling to register a handler for
SIGTERM. In this handler we now send SIGTERM to all children
and then wait() for them.
This should allow us to handle restarts in upstart more gracefully.

Finally, we simplify the handling on Linux by no longer parsing
/proc: It's not necessary. Linux is a POSIX system too.


Project: http://git-wip-us.apache.org/repos/asf/trafficserver/repo
Commit: http://git-wip-us.apache.org/repos/asf/trafficserver/commit/f29b2c71
Tree: http://git-wip-us.apache.org/repos/asf/trafficserver/tree/f29b2c71
Diff: http://git-wip-us.apache.org/repos/asf/trafficserver/diff/f29b2c71

Branch: refs/heads/master
Commit: f29b2c71287ba8e440656969198c22039eb7aff8
Parents: 6b90d87
Author: Igor Galić <i....@brainsware.org>
Authored: Tue Oct 30 21:52:42 2012 +0100
Committer: Igor Galić <i....@brainsware.org>
Committed: Mon Jan 14 18:03:14 2013 +0100

----------------------------------------------------------------------
 cop/TrafficCop.cc  |   74 +++++++++++++++++++++++++++++++----------------
 lib/ts/lockfile.cc |   55 ++++++----------------------------
 2 files changed, 59 insertions(+), 70 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/trafficserver/blob/f29b2c71/cop/TrafficCop.cc
----------------------------------------------------------------------
diff --git a/cop/TrafficCop.cc b/cop/TrafficCop.cc
index 6b0a0eb..3360996 100644
--- a/cop/TrafficCop.cc
+++ b/cop/TrafficCop.cc
@@ -45,7 +45,7 @@ union semun
 #endif  // linux check
 
 // For debugging, turn this on.
-// #define TRACE_LOG_COP 1
+#define TRACE_LOG_COP 1
 
 #define OPTIONS_MAX     32
 #define OPTIONS_LEN_MAX 1024
@@ -212,6 +212,7 @@ chown_file_to_admin_user(const char *file) {
     }
   }
 }
+
 static void
 sig_child(int signum)
 {
@@ -239,6 +240,43 @@ sig_child(int signum)
 }
 
 static void
+sig_term(int signum)
+{
+  pid_t pid = 0;
+  int status = 0;
+  int err;
+  pid_t holding_pid;
+
+  //killsig = SIGTERM;
+
+  cop_log_trace("Entering sig_term(%d)\n", signum);
+
+  // safely^W commit suicide.
+  cop_log_trace("Sending signal %d to entire group\n", signum);
+  killpg(0, signum);
+  
+  cop_log_trace("Waiting for children to exit.");
+
+  for (;;) {
+    pid = waitpid(WAIT_ANY, &status, WNOHANG);
+
+    if (pid <= 0) {
+      break;
+    }
+    // TSqa03086 - We can not log the child status signal from
+    //   the signal handler since syslog can deadlock.  Record
+    //   the pid and the status in a global for logging
+    //   next time through the event loop.  We will occasionally
+    //   lose some information if we get two sig childs in rapid
+    //   succession
+    child_pid = pid;
+    child_status = status;
+  }
+  cop_log_trace("Leaving sig_term(%d), exiting traffic_cop\n", signum);
+  exit(0);
+}
+
+static void
 #if defined(solaris)
 sig_fatal(int signum, siginfo_t * t, void *c)
 #else
@@ -1381,18 +1419,7 @@ check_programs()
   Lockfile manager_lf(manager_lockfile);
   err = manager_lf.Open(&holding_pid);
   chown_file_to_admin_user(manager_lockfile);
-#if defined(linux)
-  // if lockfile held, but process doesn't exist, killall and try again
-  if (err == 0) {
-    if (kill(holding_pid, 0) == -1) {
-      cop_log(COP_WARNING, "%s's lockfile is held, but its pid (%d) is missing;"
-              " killing all processes named '%s' and retrying\n", manager_binary, holding_pid, manager_binary);
-      ink_killall(manager_binary, killsig);
-      sleep(1);                 // give signals a chance to be received
-      err = manager_lf.Open(&holding_pid);
-    }
-  }
-#endif
+
   if (err > 0) {
     // 'lockfile_open' returns the file descriptor of the opened
     // lockfile.  We need to close this before spawning the
@@ -1473,18 +1500,7 @@ check_programs()
 
     Lockfile server_lf(server_lockfile);
     err = server_lf.Open(&holding_pid);
-#if defined(linux)
-    // if lockfile held, but process doesn't exist, killall and try again
-    if (err == 0) {
-      if (kill(holding_pid, 0) == -1) {
-        cop_log(COP_WARNING, "%s's lockfile is held, but its pid (%d) is missing;"
-                " killing all processes named '%s' and retrying\n", server_binary, holding_pid, server_binary);
-        ink_killall(server_binary, killsig);
-        sleep(1);               // give signals a chance to be received
-        err = server_lf.Open(&holding_pid);
-      }
-    }
-#endif
+
     if (err > 0) {
       server_lf.Close();
 
@@ -1673,6 +1689,14 @@ init_signals()
   struct sigaction action;
 
   cop_log_trace("Entering init_signals()\n");
+  // Handle the SIGTERM signal: We simply do the same as
+  // in sig_child..
+  action.sa_handler = sig_term;
+  sigemptyset(&action.sa_mask);
+  action.sa_flags = 0;
+
+  sigaction(SIGTERM, &action, NULL);
+
   // Handle the SIGCHLD signal. We simply reap all children that
   // die (which should only be spawned traffic_manager's).
   action.sa_handler = sig_child;

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/f29b2c71/lib/ts/lockfile.cc
----------------------------------------------------------------------
diff --git a/lib/ts/lockfile.cc b/lib/ts/lockfile.cc
index b3d345d..c2d34c4 100644
--- a/lib/ts/lockfile.cc
+++ b/lib/ts/lockfile.cc
@@ -205,54 +205,21 @@ static void
 lockfile_kill_internal(pid_t init_pid, int init_sig, pid_t pid, const char *pname, int sig)
 {
   int err;
-
-#if defined(linux)
-
-  pid_t *pidv = NULL;
-  int pidvcnt;
-
-  // Need to grab pname's pid vector before we issue any kill signals.
-  // Specifically, this prevents the race-condition in which
-  // traffic_manager spawns a new traffic_server while we still think
-  // we're killall'ing the old traffic_server.
-  if (pname) {
-    ink_killall_get_pidv_xmalloc(pname, &pidv, &pidvcnt);
-  }
-
-  if (init_sig > 0) {
-    kill(init_pid, init_sig);
-    // sleep for a bit and give time for the first signal to be
-    // delivered
-    sleep(1);
-  }
-
-  do {
-    if ((err = kill(pid, sig)) == 0) {
-      sleep(1);
-    }
-    if (pname && (pidvcnt > 0)) {
-      ink_killall_kill_pidv(pidv, pidvcnt, sig);
-      sleep(1);
-    }
-  } while ((err == 0) || ((err < 0) && (errno == EINTR)));
-
-  ats_free(pidv);
-
-#else
+  int status;
 
   if (init_sig > 0) {
     kill(init_pid, init_sig);
-    // sleep for a bit and give time for the first signal to be
-    // delivered
-    sleep(1);
+    // Wait for children to exit
+    do {
+      err = waitpid(-1, &status, WNOHANG);
+      if (err == -1) break;
+    } while(!WIFEXITED(status) && !WIFSIGNALED(status));
   }
 
   do {
     err = kill(pid, sig);
   } while ((err == 0) || ((err < 0) && (errno == EINTR)));
 
-#endif  // linux check
-
 }
 
 void
@@ -294,14 +261,12 @@ Lockfile::KillGroup(int sig, int initial_sig, const char *pname)
 
     if ((pid < 0) || (pid == getpid()))
       pid = holding_pid;
-    else
-      pid = -pid;
 
     if (pid != 0) {
-      // We kill the holding_pid instead of the process_group
-      // initially since there is no point trying to get core files
-      // from a group since the core file of one overwrites the core
-      // file of another one
+      // This way, we kill the process_group:
+      pid = -pid;
+      // In order to get core files from each process, please
+      // set your core_pattern appropriately.
       lockfile_kill_internal(holding_pid, initial_sig, pid, pname, sig);
     }
   }