You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafficserver.apache.org by bn...@apache.org on 2022/02/28 21:09:35 UTC
[trafficserver] branch master updated: Add back "DNS: Fix lack of nameserver failover in low use circumstances." (#8705)
This is an automated email from the ASF dual-hosted git repository.
bneradt pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/trafficserver.git
The following commit(s) were added to refs/heads/master by this push:
new 5b4a6a8 Add back "DNS: Fix lack of nameserver failover in low use circumstances." (#8705)
5b4a6a8 is described below
commit 5b4a6a8c679bcdd4dc32e28a75a068ed7b54b94b
Author: Brian Neradt <br...@gmail.com>
AuthorDate: Mon Feb 28 15:07:19 2022 -0600
Add back "DNS: Fix lack of nameserver failover in low use circumstances." (#8705)
This adds back the fix for nameserver failover in low use circumstances
and addresses the regression test instability caused when the test box
has connectivity issues with the DNS server.
---
iocore/dns/DNS.cc | 28 +++++++++++++++++++++-------
iocore/dns/P_DNSProcessor.h | 23 +++++++++++++++--------
src/traffic_server/InkAPITest.cc | 7 +++++++
3 files changed, 43 insertions(+), 15 deletions(-)
diff --git a/iocore/dns/DNS.cc b/iocore/dns/DNS.cc
index e7e9ebd..9ad2ea3 100644
--- a/iocore/dns/DNS.cc
+++ b/iocore/dns/DNS.cc
@@ -524,12 +524,12 @@ DNSHandler::open_con(sockaddr const *target, bool failed, int icon, bool over_tc
}
return false;
} else {
- ns_down[icon] = 0;
if (cur_con.eio.start(pd, &cur_con, EVENTIO_READ) < 0) {
Error("[iocore_dns] open_con: Failed to add %d server to epoll list\n", icon);
} else {
- cur_con.num = icon;
- Debug("dns", "opening connection %s SUCCEEDED for %d", ip_text, icon);
+ cur_con.num = icon;
+ ns_down[icon] = 0;
+ Debug("dns", "opening connection %s on fd %d SUCCEEDED for %d", ip_text, cur_con.fd, icon);
}
ret = true;
}
@@ -729,6 +729,15 @@ void
DNSHandler::failover()
{
Debug("dns", "failover: initiating failover attempt, current name_server=%d", name_server);
+ if (!ns_down[name_server]) {
+ ip_text_buffer buff;
+ // mark this nameserver as down
+ Debug("dns", "failover: Marking nameserver %d as down", name_server);
+ ns_down[name_server] = 1;
+ Warning("connection to DNS server %s lost, marking as down",
+ ats_ip_ntop(&m_res->nsaddr_list[name_server].sa, buff, sizeof(buff)));
+ }
+
// no hope, if we have only one server
if (m_res->nscount > 1) {
ip_text_buffer buff1, buff2;
@@ -766,6 +775,8 @@ DNSHandler::failover()
ip_text_buffer buff;
Warning("failover: connection to DNS server %s lost, retrying", ats_ip_ntop(&ip.sa, buff, sizeof(buff)));
}
+ // Make sure retries are done even if no more requests.
+ this_ethread()->schedule_in(this, DNS_PRIMARY_RETRY_PERIOD);
}
/** Mark one of the nameservers as down. */
@@ -779,6 +790,8 @@ DNSHandler::rr_failure(int ndx)
Debug("dns", "rr_failure: Marking nameserver %d as down", ndx);
ns_down[ndx] = 1;
Warning("connection to DNS server %s lost, marking as down", ats_ip_ntop(&m_res->nsaddr_list[ndx].sa, buff, sizeof(buff)));
+ // Make sure retries are done even if no more requests.
+ this_ethread()->schedule_in(this, DNS_PRIMARY_RETRY_PERIOD);
}
int nscount = m_res->nscount;
@@ -971,6 +984,11 @@ DNSHandler::check_and_reset_tcp_conn()
int
DNSHandler::mainEvent(int event, Event *e)
{
+ // If this was a scheduled retry event, clear the associated flag.
+ if (e && e->cookie == RETRY_COOKIE) {
+ this->nameserver_retry_in_flight_p = false;
+ }
+
recv_dns(event, e);
if (dns_ns_rr) {
if (DNS_CONN_MODE::TCP_RETRY == dns_conn_mode) {
@@ -1017,10 +1035,6 @@ DNSHandler::mainEvent(int event, Event *e)
write_dns(this);
}
- if (std::any_of(ns_down, ns_down + n_con, [](int f) { return f != 0; })) {
- this_ethread()->schedule_at(this, DNS_PRIMARY_RETRY_PERIOD);
- }
-
return EVENT_CONT;
}
diff --git a/iocore/dns/P_DNSProcessor.h b/iocore/dns/P_DNSProcessor.h
index 4c16f32..7d0bba9 100644
--- a/iocore/dns/P_DNSProcessor.h
+++ b/iocore/dns/P_DNSProcessor.h
@@ -169,9 +169,16 @@ struct DNSHandler : public Continuation {
DNSConnection udpcon[MAX_NAMED];
Queue<DNSEntry> entries;
Queue<DNSConnection> triggered;
- int in_flight = 0;
- int name_server = 0;
- int in_write_dns = 0;
+ int in_flight = 0;
+ int name_server = 0;
+ int in_write_dns = 0;
+ /// Rate limiter for down nameserver retries.
+ /// Don't schedule another if there is already one in flight.
+ std::atomic<bool> nameserver_retry_in_flight_p{false};
+ /// Marker for event cookie to indicate it's a nameserver retry event.
+ /// @note Can't be @c constexpr because of the cast.
+ static inline void *const RETRY_COOKIE{reinterpret_cast<void *>(0x2)};
+
HostEnt *hostent_cache = nullptr;
int ns_down[MAX_NAMED];
@@ -212,16 +219,16 @@ struct DNSHandler : public Continuation {
(ink_hrtime)HRTIME_SECONDS(dns_failover_period));
Debug("dns", "\tdelta time is %" PRId64 "", (Thread::get_hrtime() - crossed_failover_number[i]));
}
- return (crossed_failover_number[i] &&
- ((Thread::get_hrtime() - crossed_failover_number[i]) > HRTIME_SECONDS(dns_failover_period)));
+ return ns_down[i] || (crossed_failover_number[i] &&
+ ((Thread::get_hrtime() - crossed_failover_number[i]) > HRTIME_SECONDS(dns_failover_period)));
}
bool
failover_soon(int i)
{
- return (crossed_failover_number[i] &&
- ((Thread::get_hrtime() - crossed_failover_number[i]) >
- (HRTIME_SECONDS(dns_failover_try_period + failover_soon_number[i] * FAILOVER_SOON_RETRY))));
+ return ns_down[i] || (crossed_failover_number[i] &&
+ ((Thread::get_hrtime() - crossed_failover_number[i]) >
+ (HRTIME_SECONDS(dns_failover_try_period + failover_soon_number[i] * FAILOVER_SOON_RETRY))));
}
void recv_dns(int event, Event *e);
diff --git a/src/traffic_server/InkAPITest.cc b/src/traffic_server/InkAPITest.cc
index cd00096..21d61ab 100644
--- a/src/traffic_server/InkAPITest.cc
+++ b/src/traffic_server/InkAPITest.cc
@@ -83,6 +83,8 @@
#define ERROR_BODY "TESTING ERROR PAGE"
#define TRANSFORM_APPEND_STRING "This is a transformed response"
+extern int dns_failover_period;
+
//////////////////////////////////////////////////////////////////////////////
// STRUCTURES
//////////////////////////////////////////////////////////////////////////////
@@ -8525,6 +8527,11 @@ EXCLUSIVE_REGRESSION_TEST(SDK_API_TSHttpConnectIntercept)(RegressionTest *test,
EXCLUSIVE_REGRESSION_TEST(SDK_API_TSHttpConnectServerIntercept)(RegressionTest *test, int /* atype ATS_UNUSED */, int *pstatus)
{
*pstatus = REGRESSION_TEST_INPROGRESS;
+ // These tests don't actually require DNS resolution, but if there are issues
+ // connecting to the DNS server then the test will fail due to error events
+ // from the communication arising. We avoid this by simply extending the DNS
+ // failover period to a very large value.
+ dns_failover_period = 1000;
TSDebug(UTDBG_TAG, "Starting test TSHttpConnectServerIntercept");