You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafficserver.apache.org by zw...@apache.org on 2016/02/26 17:45:18 UTC

trafficserver git commit: TS-4228 Adds better error handling in the synthetic checks

Repository: trafficserver
Updated Branches:
  refs/heads/master 2cdd1016f -> 9bf5beb36


TS-4228 Adds better error handling in the synthetic checks

In traffic_manager, the thread that handles the request from
traffic_cop (via traffic_server) does not deal well with various
(obscure) error conditions.


Project: http://git-wip-us.apache.org/repos/asf/trafficserver/repo
Commit: http://git-wip-us.apache.org/repos/asf/trafficserver/commit/9bf5beb3
Tree: http://git-wip-us.apache.org/repos/asf/trafficserver/tree/9bf5beb3
Diff: http://git-wip-us.apache.org/repos/asf/trafficserver/diff/9bf5beb3

Branch: refs/heads/master
Commit: 9bf5beb3625038ada8de89850d35dfc561220b77
Parents: 2cdd101
Author: Leif Hedstrom <zw...@apache.org>
Authored: Wed Feb 24 19:44:14 2016 -0700
Committer: Leif Hedstrom <zw...@apache.org>
Committed: Fri Feb 26 09:07:25 2016 -0700

----------------------------------------------------------------------
 cmd/traffic_cop/traffic_cop.cc      | 31 ++++++++++++-------------------
 cmd/traffic_manager/MgmtHandlers.cc | 16 +++++++++++++---
 lib/ts/ink_sock.cc                  |  8 ++++----
 lib/ts/ink_sock.h                   |  4 ++--
 mgmt/Cop.h                          | 27 +++++++++++++++++++++++++++
 5 files changed, 58 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/trafficserver/blob/9bf5beb3/cmd/traffic_cop/traffic_cop.cc
----------------------------------------------------------------------
diff --git a/cmd/traffic_cop/traffic_cop.cc b/cmd/traffic_cop/traffic_cop.cc
index 94cfcd3..8758514 100644
--- a/cmd/traffic_cop/traffic_cop.cc
+++ b/cmd/traffic_cop/traffic_cop.cc
@@ -35,6 +35,7 @@
 #include "RecordsConfig.h"
 #include "ClusterCom.h"
 #include "ts/ink_cap.h"
+#include "Cop.h"
 
 #include <string>
 #include <map>
@@ -110,11 +111,7 @@ static int source_port = 0;
 static int manager_failures = 0;
 static int server_failures = 0;
 static int server_not_found = 0;
-
-static const int sleep_time = 10;          // 10 sec
-static int init_sleep_time = sleep_time;   // 10 sec
-static const int manager_timeout = 3 * 60; //  3 min
-static const int server_timeout = 3 * 60;  //  3 min
+static int init_sleep_time = cop_sleep_time; // 10 sec
 
 // traffic_manager flap detection
 #define MANAGER_FLAP_DETECTION 1
@@ -131,8 +128,6 @@ static ink_hrtime manager_flap_retry_start_time = 0;    // first time we attempt
 // transient syscall error timeout
 #define TRANSIENT_ERROR_WAIT_MS 500
 
-static const int kill_timeout = 1 * 60; //  1 min
-
 static int child_pid = 0;
 static int child_status = 0;
 
@@ -316,12 +311,10 @@ sig_alarm_warn(int signum)
 #endif
 {
   cop_log_trace("Entering sig_alarm_warn(%d)\n", signum);
-  cop_log(COP_WARNING, "unable to kill traffic_server for the last"
-                       " %d seconds\n",
-          kill_timeout);
+  cop_log(COP_WARNING, "unable to kill traffic_server for the last %d seconds\n", cop_kill_timeout);
 
   // Set us up for another alarm
-  alarm(kill_timeout);
+  alarm(cop_kill_timeout);
   cop_log_trace("Leaving sig_alarm_warn(%d)\n", signum);
 }
 
@@ -402,7 +395,7 @@ safe_kill(const char *lockfile_name, const char *pname, bool group)
 
   cop_log_trace("Entering safe_kill(%s, %s, %d)\n", lockfile_name, pname, group);
   set_alarm_warn();
-  alarm(kill_timeout);
+  alarm(cop_kill_timeout);
 
   if (group == true) {
     lockfile.KillGroup(killsig, coresig, pname);
@@ -1017,7 +1010,7 @@ read_manager_string(const char *variable, char *value, size_t val_len)
 
   snprintf(request, sizeof(request), "read %s\n", variable);
 
-  err = test_port(rs_port, request, buffer, 4095, manager_timeout * 1000);
+  err = test_port(rs_port, request, buffer, 4095, cop_manager_timeout * 1000);
   if (err < 0) {
     return err;
   }
@@ -1071,7 +1064,7 @@ read_manager_int(const char *variable, int *value)
 
   snprintf(request, sizeof(request), "read %s\n", variable);
 
-  err = test_port(rs_port, request, buffer, 4095, manager_timeout * 1000);
+  err = test_port(rs_port, request, buffer, 4095, cop_manager_timeout * 1000);
   if (err < 0) {
     return err;
   }
@@ -1236,7 +1229,7 @@ test_server_http_port()
   // servers up on the autoconf port.
   snprintf(request, sizeof(request), "GET http://127.0.0.1:%d/synthetic.txt HTTP/1.0\r\n\r\n", synthetic_port);
 
-  return test_http_port(http_backdoor_port, request, server_timeout * 1000, localhost, localhost);
+  return test_http_port(http_backdoor_port, request, cop_server_timeout * 1000, localhost, localhost);
 }
 
 static int
@@ -1444,7 +1437,7 @@ check_programs()
     // is up, we make sure there is actually a server process
     // running. If there is we test it.
 
-    alarm(2 * manager_timeout);
+    alarm(2 * cop_manager_timeout);
     err = heartbeat_manager();
     alarm(0);
 
@@ -1471,7 +1464,7 @@ check_programs()
         safe_kill(manager_lockfile, manager_binary, true);
       }
     } else {
-      alarm(2 * server_timeout);
+      alarm(2 * cop_server_timeout);
       heartbeat_server();
       alarm(0);
     }
@@ -1566,7 +1559,7 @@ check(void *arg)
     chown_file_to_admin_user(manager_lockfile);
     chown_file_to_admin_user(server_lockfile);
 
-    alarm(2 * (sleep_time + manager_timeout * 2 + server_timeout));
+    alarm(2 * (cop_sleep_time + cop_manager_timeout * 2 + cop_server_timeout));
 
     if (check_no_run() < 0) {
       break;
@@ -1601,7 +1594,7 @@ check(void *arg)
     // Pause to catch our breath. (10 seconds).
     // Use 'millisleep()' because normal 'sleep()' interferes with
     // the SIGALRM signal which we use to heartbeat the cop.
-    millisleep(sleep_time * 1000);
+    millisleep(cop_sleep_time * 1000);
 
     // We do this after the first round of checks, since the first "check" will spawn traffic_manager
     if (!mgmt_init) {

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/9bf5beb3/cmd/traffic_manager/MgmtHandlers.cc
----------------------------------------------------------------------
diff --git a/cmd/traffic_manager/MgmtHandlers.cc b/cmd/traffic_manager/MgmtHandlers.cc
index 913037d..3f667e5 100644
--- a/cmd/traffic_manager/MgmtHandlers.cc
+++ b/cmd/traffic_manager/MgmtHandlers.cc
@@ -36,6 +36,7 @@
 #include "MgmtSocket.h"
 #include "NetworkUtilsRemote.h"
 #include "MIME.h"
+#include "Cop.h"
 
 // INKqa09866
 #include "TSControlMain.h"
@@ -157,14 +158,20 @@ synthetic_thread(void *info)
   // Read the request
   bufp = buffer;
   while (len < strlen(RequestStr)) {
+    if (read_ready(clientFD, cop_server_timeout * 1000) <= 0) {
+      mgmt_log(stderr, "[SyntheticHealthServer] poll() failed, no request to read()");
+      goto error;
+    }
     bytes = read(clientFD, buffer, sizeof(buffer));
-    if (bytes < 0) {
+    if (0 == bytes) {
+      mgmt_log(stderr, "[SyntheticHealthServer] EOF on the socket, likely prematurely closed");
+      goto error;
+    } else if (bytes < 0) {
       if (errno == EINTR || errno == EAGAIN) {
         continue;
       } else {
         mgmt_log(stderr, "[SyntheticHealthServer] Failed to read the request");
         goto error;
-        break;
       }
     } else {
       len += bytes;
@@ -186,6 +193,10 @@ synthetic_thread(void *info)
   // Write it
   bufp = buffer;
   while (len) {
+    if (write_ready(clientFD, cop_server_timeout * 1000) <= 0) {
+      mgmt_log(stderr, "[SyntheticHealthServer] poll() failed, no response to write()");
+      goto error;
+    }
     bytes = write(clientFD, buffer, len);
     if (bytes < 0) {
       if (errno == EINTR || errno == EAGAIN) {
@@ -193,7 +204,6 @@ synthetic_thread(void *info)
       } else {
         mgmt_log(stderr, "[SyntheticHealthServer] Failed to write the response");
         goto error;
-        break;
       }
     } else {
       len -= bytes;

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/9bf5beb3/lib/ts/ink_sock.cc
----------------------------------------------------------------------
diff --git a/lib/ts/ink_sock.cc b/lib/ts/ink_sock.cc
index 3c447a2..2b98ed1 100644
--- a/lib/ts/ink_sock.cc
+++ b/lib/ts/ink_sock.cc
@@ -123,12 +123,12 @@ safe_blocking(int fd)
 }
 
 int
-write_ready(int fd)
+write_ready(int fd, int timeout_msec)
 {
   struct pollfd p;
   p.events = POLLOUT;
   p.fd = fd;
-  int r = poll(&p, 1, 0);
+  int r = poll(&p, 1, timeout_msec);
   if (r <= 0)
     return r;
   if (p.revents & (POLLERR | POLLNVAL))
@@ -139,12 +139,12 @@ write_ready(int fd)
 }
 
 int
-read_ready(int fd)
+read_ready(int fd, int timeout_msec)
 {
   struct pollfd p;
   p.events = POLLIN;
   p.fd = fd;
-  int r = poll(&p, 1, 0);
+  int r = poll(&p, 1, timeout_msec);
   if (r <= 0)
     return r;
   if (p.revents & (POLLERR | POLLNVAL))

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/9bf5beb3/lib/ts/ink_sock.h
----------------------------------------------------------------------
diff --git a/lib/ts/ink_sock.h b/lib/ts/ink_sock.h
index 6e73faa..7c7d66e 100644
--- a/lib/ts/ink_sock.h
+++ b/lib/ts/ink_sock.h
@@ -51,8 +51,8 @@ int safe_clr_fl(int fd, int arg);
 int safe_blocking(int fd);
 int safe_nonblocking(int fd);
 
-int write_ready(int fd);
-int read_ready(int fd);
+int write_ready(int fd, int timeout_msec = 0);
+int read_ready(int fd, int timeout_msec = 0);
 
 char fd_read_char(int fd);
 int fd_read_line(int fd, char *s, int len);

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/9bf5beb3/mgmt/Cop.h
----------------------------------------------------------------------
diff --git a/mgmt/Cop.h b/mgmt/Cop.h
new file mode 100644
index 0000000..d1fab21
--- /dev/null
+++ b/mgmt/Cop.h
@@ -0,0 +1,27 @@
+/** @file
+
+    Timing constants shared by traffic_cop and the traffic_manager synthetic health check.
+
+    @section license License
+
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+static const int cop_sleep_time = 10;          // 10 sec; pause between check rounds in traffic_cop
+static const int cop_manager_timeout = 3 * 60; //  3 min, in seconds; traffic_manager request / heartbeat timeout
+static const int cop_server_timeout = 3 * 60;  //  3 min, in seconds; traffic_server / synthetic check timeout
+static const int cop_kill_timeout = 1 * 60;    //  1 min, in seconds; alarm interval while killing a process