You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafficserver.apache.org by zw...@apache.org on 2016/02/26 17:45:18 UTC
trafficserver git commit: TS-4228 Adds better error handling in the synthetic checks
Repository: trafficserver
Updated Branches:
refs/heads/master 2cdd1016f -> 9bf5beb36
TS-4228 Adds better error handling in the synthetic checks
In traffic_manager, the thread that handles the request from
traffic_cop (via traffic_server) does not deal well with various
(obscure) error conditions.
Project: http://git-wip-us.apache.org/repos/asf/trafficserver/repo
Commit: http://git-wip-us.apache.org/repos/asf/trafficserver/commit/9bf5beb3
Tree: http://git-wip-us.apache.org/repos/asf/trafficserver/tree/9bf5beb3
Diff: http://git-wip-us.apache.org/repos/asf/trafficserver/diff/9bf5beb3
Branch: refs/heads/master
Commit: 9bf5beb3625038ada8de89850d35dfc561220b77
Parents: 2cdd101
Author: Leif Hedstrom <zw...@apache.org>
Authored: Wed Feb 24 19:44:14 2016 -0700
Committer: Leif Hedstrom <zw...@apache.org>
Committed: Fri Feb 26 09:07:25 2016 -0700
----------------------------------------------------------------------
cmd/traffic_cop/traffic_cop.cc | 31 ++++++++++++-------------------
cmd/traffic_manager/MgmtHandlers.cc | 16 +++++++++++++---
lib/ts/ink_sock.cc | 8 ++++----
lib/ts/ink_sock.h | 4 ++--
mgmt/Cop.h | 27 +++++++++++++++++++++++++++
5 files changed, 58 insertions(+), 28 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/trafficserver/blob/9bf5beb3/cmd/traffic_cop/traffic_cop.cc
----------------------------------------------------------------------
diff --git a/cmd/traffic_cop/traffic_cop.cc b/cmd/traffic_cop/traffic_cop.cc
index 94cfcd3..8758514 100644
--- a/cmd/traffic_cop/traffic_cop.cc
+++ b/cmd/traffic_cop/traffic_cop.cc
@@ -35,6 +35,7 @@
#include "RecordsConfig.h"
#include "ClusterCom.h"
#include "ts/ink_cap.h"
+#include "Cop.h"
#include <string>
#include <map>
@@ -110,11 +111,7 @@ static int source_port = 0;
static int manager_failures = 0;
static int server_failures = 0;
static int server_not_found = 0;
-
-static const int sleep_time = 10; // 10 sec
-static int init_sleep_time = sleep_time; // 10 sec
-static const int manager_timeout = 3 * 60; // 3 min
-static const int server_timeout = 3 * 60; // 3 min
+static int init_sleep_time = cop_sleep_time; // 10 sec
// traffic_manager flap detection
#define MANAGER_FLAP_DETECTION 1
@@ -131,8 +128,6 @@ static ink_hrtime manager_flap_retry_start_time = 0; // first time we attempt
// transient syscall error timeout
#define TRANSIENT_ERROR_WAIT_MS 500
-static const int kill_timeout = 1 * 60; // 1 min
-
static int child_pid = 0;
static int child_status = 0;
@@ -316,12 +311,10 @@ sig_alarm_warn(int signum)
#endif
{
cop_log_trace("Entering sig_alarm_warn(%d)\n", signum);
- cop_log(COP_WARNING, "unable to kill traffic_server for the last"
- " %d seconds\n",
- kill_timeout);
+ cop_log(COP_WARNING, "unable to kill traffic_server for the last %d seconds\n", cop_kill_timeout);
// Set us up for another alarm
- alarm(kill_timeout);
+ alarm(cop_kill_timeout);
cop_log_trace("Leaving sig_alarm_warn(%d)\n", signum);
}
@@ -402,7 +395,7 @@ safe_kill(const char *lockfile_name, const char *pname, bool group)
cop_log_trace("Entering safe_kill(%s, %s, %d)\n", lockfile_name, pname, group);
set_alarm_warn();
- alarm(kill_timeout);
+ alarm(cop_kill_timeout);
if (group == true) {
lockfile.KillGroup(killsig, coresig, pname);
@@ -1017,7 +1010,7 @@ read_manager_string(const char *variable, char *value, size_t val_len)
snprintf(request, sizeof(request), "read %s\n", variable);
- err = test_port(rs_port, request, buffer, 4095, manager_timeout * 1000);
+ err = test_port(rs_port, request, buffer, 4095, cop_manager_timeout * 1000);
if (err < 0) {
return err;
}
@@ -1071,7 +1064,7 @@ read_manager_int(const char *variable, int *value)
snprintf(request, sizeof(request), "read %s\n", variable);
- err = test_port(rs_port, request, buffer, 4095, manager_timeout * 1000);
+ err = test_port(rs_port, request, buffer, 4095, cop_manager_timeout * 1000);
if (err < 0) {
return err;
}
@@ -1236,7 +1229,7 @@ test_server_http_port()
// servers up on the autoconf port.
snprintf(request, sizeof(request), "GET http://127.0.0.1:%d/synthetic.txt HTTP/1.0\r\n\r\n", synthetic_port);
- return test_http_port(http_backdoor_port, request, server_timeout * 1000, localhost, localhost);
+ return test_http_port(http_backdoor_port, request, cop_server_timeout * 1000, localhost, localhost);
}
static int
@@ -1444,7 +1437,7 @@ check_programs()
// is up, we make sure there is actually a server process
// running. If there is we test it.
- alarm(2 * manager_timeout);
+ alarm(2 * cop_manager_timeout);
err = heartbeat_manager();
alarm(0);
@@ -1471,7 +1464,7 @@ check_programs()
safe_kill(manager_lockfile, manager_binary, true);
}
} else {
- alarm(2 * server_timeout);
+ alarm(2 * cop_server_timeout);
heartbeat_server();
alarm(0);
}
@@ -1566,7 +1559,7 @@ check(void *arg)
chown_file_to_admin_user(manager_lockfile);
chown_file_to_admin_user(server_lockfile);
- alarm(2 * (sleep_time + manager_timeout * 2 + server_timeout));
+ alarm(2 * (cop_sleep_time + cop_manager_timeout * 2 + cop_server_timeout));
if (check_no_run() < 0) {
break;
@@ -1601,7 +1594,7 @@ check(void *arg)
// Pause to catch our breath. (10 seconds).
// Use 'millisleep()' because normal 'sleep()' interferes with
// the SIGALRM signal which we use to heartbeat the cop.
- millisleep(sleep_time * 1000);
+ millisleep(cop_sleep_time * 1000);
// We do this after the first round of checks, since the first "check" will spawn traffic_manager
if (!mgmt_init) {
http://git-wip-us.apache.org/repos/asf/trafficserver/blob/9bf5beb3/cmd/traffic_manager/MgmtHandlers.cc
----------------------------------------------------------------------
diff --git a/cmd/traffic_manager/MgmtHandlers.cc b/cmd/traffic_manager/MgmtHandlers.cc
index 913037d..3f667e5 100644
--- a/cmd/traffic_manager/MgmtHandlers.cc
+++ b/cmd/traffic_manager/MgmtHandlers.cc
@@ -36,6 +36,7 @@
#include "MgmtSocket.h"
#include "NetworkUtilsRemote.h"
#include "MIME.h"
+#include "Cop.h"
// INKqa09866
#include "TSControlMain.h"
@@ -157,14 +158,20 @@ synthetic_thread(void *info)
// Read the request
bufp = buffer;
while (len < strlen(RequestStr)) {
+ if (read_ready(clientFD, cop_server_timeout * 1000) <= 0) {
+ mgmt_log(stderr, "[SyntheticHealthServer] poll() failed, no request to read()");
+ goto error;
+ }
bytes = read(clientFD, buffer, sizeof(buffer));
- if (bytes < 0) {
+ if (0 == bytes) {
+ mgmt_log(stderr, "[SyntheticHealthServer] EOF on the socket, likely prematurely closed");
+ goto error;
+ } else if (bytes < 0) {
if (errno == EINTR || errno == EAGAIN) {
continue;
} else {
mgmt_log(stderr, "[SyntheticHealthServer] Failed to read the request");
goto error;
- break;
}
} else {
len += bytes;
@@ -186,6 +193,10 @@ synthetic_thread(void *info)
// Write it
bufp = buffer;
while (len) {
+ if (write_ready(clientFD, cop_server_timeout * 1000) <= 0) {
+ mgmt_log(stderr, "[SyntheticHealthServer] poll() failed, no response to write()");
+ goto error;
+ }
bytes = write(clientFD, buffer, len);
if (bytes < 0) {
if (errno == EINTR || errno == EAGAIN) {
@@ -193,7 +204,6 @@ synthetic_thread(void *info)
} else {
mgmt_log(stderr, "[SyntheticHealthServer] Failed to write the response");
goto error;
- break;
}
} else {
len -= bytes;
http://git-wip-us.apache.org/repos/asf/trafficserver/blob/9bf5beb3/lib/ts/ink_sock.cc
----------------------------------------------------------------------
diff --git a/lib/ts/ink_sock.cc b/lib/ts/ink_sock.cc
index 3c447a2..2b98ed1 100644
--- a/lib/ts/ink_sock.cc
+++ b/lib/ts/ink_sock.cc
@@ -123,12 +123,12 @@ safe_blocking(int fd)
}
int
-write_ready(int fd)
+write_ready(int fd, int timeout_msec)
{
struct pollfd p;
p.events = POLLOUT;
p.fd = fd;
- int r = poll(&p, 1, 0);
+ int r = poll(&p, 1, timeout_msec);
if (r <= 0)
return r;
if (p.revents & (POLLERR | POLLNVAL))
@@ -139,12 +139,12 @@ write_ready(int fd)
}
int
-read_ready(int fd)
+read_ready(int fd, int timeout_msec)
{
struct pollfd p;
p.events = POLLIN;
p.fd = fd;
- int r = poll(&p, 1, 0);
+ int r = poll(&p, 1, timeout_msec);
if (r <= 0)
return r;
if (p.revents & (POLLERR | POLLNVAL))
http://git-wip-us.apache.org/repos/asf/trafficserver/blob/9bf5beb3/lib/ts/ink_sock.h
----------------------------------------------------------------------
diff --git a/lib/ts/ink_sock.h b/lib/ts/ink_sock.h
index 6e73faa..7c7d66e 100644
--- a/lib/ts/ink_sock.h
+++ b/lib/ts/ink_sock.h
@@ -51,8 +51,8 @@ int safe_clr_fl(int fd, int arg);
int safe_blocking(int fd);
int safe_nonblocking(int fd);
-int write_ready(int fd);
-int read_ready(int fd);
+int write_ready(int fd, int timeout_msec = 0);
+int read_ready(int fd, int timeout_msec = 0);
char fd_read_char(int fd);
int fd_read_line(int fd, char *s, int len);
http://git-wip-us.apache.org/repos/asf/trafficserver/blob/9bf5beb3/mgmt/Cop.h
----------------------------------------------------------------------
diff --git a/mgmt/Cop.h b/mgmt/Cop.h
new file mode 100644
index 0000000..d1fab21
--- /dev/null
+++ b/mgmt/Cop.h
@@ -0,0 +1,27 @@
+/** @file
+
+ Main entry point for the traffic_cop application.
+
+ @section license License
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+static const int cop_sleep_time = 10; // 10 sec
+static const int cop_manager_timeout = 3 * 60; // 3 min
+static const int cop_server_timeout = 3 * 60; // 3 min
+static const int cop_kill_timeout = 1 * 60; // 1 min