You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafficserver.apache.org by bn...@apache.org on 2022/02/28 21:09:35 UTC

[trafficserver] branch master updated: Add back "DNS: Fix lack of nameserver failover in low use circumstances." (#8705)

This is an automated email from the ASF dual-hosted git repository.

bneradt pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/trafficserver.git


The following commit(s) were added to refs/heads/master by this push:
     new 5b4a6a8  Add back "DNS: Fix lack of nameserver failover in low use circumstances." (#8705)
5b4a6a8 is described below

commit 5b4a6a8c679bcdd4dc32e28a75a068ed7b54b94b
Author: Brian Neradt <br...@gmail.com>
AuthorDate: Mon Feb 28 15:07:19 2022 -0600

    Add back "DNS: Fix lack of nameserver failover in low use circumstances." (#8705)
    
    This adds back the fix for nameserver failover in low use circumstances
    and addresses the regression test instability caused when the test box
    has connectivity issues with the DNS server.
---
 iocore/dns/DNS.cc                | 28 +++++++++++++++++++++-------
 iocore/dns/P_DNSProcessor.h      | 23 +++++++++++++++--------
 src/traffic_server/InkAPITest.cc |  7 +++++++
 3 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/iocore/dns/DNS.cc b/iocore/dns/DNS.cc
index e7e9ebd..9ad2ea3 100644
--- a/iocore/dns/DNS.cc
+++ b/iocore/dns/DNS.cc
@@ -524,12 +524,12 @@ DNSHandler::open_con(sockaddr const *target, bool failed, int icon, bool over_tc
     }
     return false;
   } else {
-    ns_down[icon] = 0;
     if (cur_con.eio.start(pd, &cur_con, EVENTIO_READ) < 0) {
       Error("[iocore_dns] open_con: Failed to add %d server to epoll list\n", icon);
     } else {
-      cur_con.num = icon;
-      Debug("dns", "opening connection %s SUCCEEDED for %d", ip_text, icon);
+      cur_con.num   = icon;
+      ns_down[icon] = 0;
+      Debug("dns", "opening connection %s on fd %d SUCCEEDED for %d", ip_text, cur_con.fd, icon);
     }
     ret = true;
   }
@@ -729,6 +729,15 @@ void
 DNSHandler::failover()
 {
   Debug("dns", "failover: initiating failover attempt, current name_server=%d", name_server);
+  if (!ns_down[name_server]) {
+    ip_text_buffer buff;
+    // mark this nameserver as down
+    Debug("dns", "failover: Marking nameserver %d as down", name_server);
+    ns_down[name_server] = 1;
+    Warning("connection to DNS server %s lost, marking as down",
+            ats_ip_ntop(&m_res->nsaddr_list[name_server].sa, buff, sizeof(buff)));
+  }
+
   // no hope, if we have only one server
   if (m_res->nscount > 1) {
     ip_text_buffer buff1, buff2;
@@ -766,6 +775,8 @@ DNSHandler::failover()
     ip_text_buffer buff;
     Warning("failover: connection to DNS server %s lost, retrying", ats_ip_ntop(&ip.sa, buff, sizeof(buff)));
   }
+  // Make sure retries are done even if no more requests.
+  this_ethread()->schedule_in(this, DNS_PRIMARY_RETRY_PERIOD);
 }
 
 /** Mark one of the nameservers as down. */
@@ -779,6 +790,8 @@ DNSHandler::rr_failure(int ndx)
     Debug("dns", "rr_failure: Marking nameserver %d as down", ndx);
     ns_down[ndx] = 1;
     Warning("connection to DNS server %s lost, marking as down", ats_ip_ntop(&m_res->nsaddr_list[ndx].sa, buff, sizeof(buff)));
+    // Make sure retries are done even if no more requests.
+    this_ethread()->schedule_in(this, DNS_PRIMARY_RETRY_PERIOD);
   }
 
   int nscount = m_res->nscount;
@@ -971,6 +984,11 @@ DNSHandler::check_and_reset_tcp_conn()
 int
 DNSHandler::mainEvent(int event, Event *e)
 {
+  // If this was a scheduled retry event, clear the associated flag.
+  if (e && e->cookie == RETRY_COOKIE) {
+    this->nameserver_retry_in_flight_p = false;
+  }
+
   recv_dns(event, e);
   if (dns_ns_rr) {
     if (DNS_CONN_MODE::TCP_RETRY == dns_conn_mode) {
@@ -1017,10 +1035,6 @@ DNSHandler::mainEvent(int event, Event *e)
     write_dns(this);
   }
 
-  if (std::any_of(ns_down, ns_down + n_con, [](int f) { return f != 0; })) {
-    this_ethread()->schedule_at(this, DNS_PRIMARY_RETRY_PERIOD);
-  }
-
   return EVENT_CONT;
 }
 
diff --git a/iocore/dns/P_DNSProcessor.h b/iocore/dns/P_DNSProcessor.h
index 4c16f32..7d0bba9 100644
--- a/iocore/dns/P_DNSProcessor.h
+++ b/iocore/dns/P_DNSProcessor.h
@@ -169,9 +169,16 @@ struct DNSHandler : public Continuation {
   DNSConnection udpcon[MAX_NAMED];
   Queue<DNSEntry> entries;
   Queue<DNSConnection> triggered;
-  int in_flight          = 0;
-  int name_server        = 0;
-  int in_write_dns       = 0;
+  int in_flight    = 0;
+  int name_server  = 0;
+  int in_write_dns = 0;
+  /// Rate limiter for down nameserver retries.
+  /// Don't schedule another if there is already one in flight.
+  std::atomic<bool> nameserver_retry_in_flight_p{false};
+  /// Marker for event cookie to indicate it's a nameserver retry event.
+  /// @note Can't be @c constexpr because of the cast.
+  static inline void *const RETRY_COOKIE{reinterpret_cast<void *>(0x2)};
+
   HostEnt *hostent_cache = nullptr;
 
   int ns_down[MAX_NAMED];
@@ -212,16 +219,16 @@ struct DNSHandler : public Continuation {
             (ink_hrtime)HRTIME_SECONDS(dns_failover_period));
       Debug("dns", "\tdelta time is %" PRId64 "", (Thread::get_hrtime() - crossed_failover_number[i]));
     }
-    return (crossed_failover_number[i] &&
-            ((Thread::get_hrtime() - crossed_failover_number[i]) > HRTIME_SECONDS(dns_failover_period)));
+    return ns_down[i] || (crossed_failover_number[i] &&
+                          ((Thread::get_hrtime() - crossed_failover_number[i]) > HRTIME_SECONDS(dns_failover_period)));
   }
 
   bool
   failover_soon(int i)
   {
-    return (crossed_failover_number[i] &&
-            ((Thread::get_hrtime() - crossed_failover_number[i]) >
-             (HRTIME_SECONDS(dns_failover_try_period + failover_soon_number[i] * FAILOVER_SOON_RETRY))));
+    return ns_down[i] || (crossed_failover_number[i] &&
+                          ((Thread::get_hrtime() - crossed_failover_number[i]) >
+                           (HRTIME_SECONDS(dns_failover_try_period + failover_soon_number[i] * FAILOVER_SOON_RETRY))));
   }
 
   void recv_dns(int event, Event *e);
diff --git a/src/traffic_server/InkAPITest.cc b/src/traffic_server/InkAPITest.cc
index cd00096..21d61ab 100644
--- a/src/traffic_server/InkAPITest.cc
+++ b/src/traffic_server/InkAPITest.cc
@@ -83,6 +83,8 @@
 #define ERROR_BODY "TESTING ERROR PAGE"
 #define TRANSFORM_APPEND_STRING "This is a transformed response"
 
+extern int dns_failover_period;
+
 //////////////////////////////////////////////////////////////////////////////
 // STRUCTURES
 //////////////////////////////////////////////////////////////////////////////
@@ -8525,6 +8527,11 @@ EXCLUSIVE_REGRESSION_TEST(SDK_API_TSHttpConnectIntercept)(RegressionTest *test,
 EXCLUSIVE_REGRESSION_TEST(SDK_API_TSHttpConnectServerIntercept)(RegressionTest *test, int /* atype ATS_UNUSED */, int *pstatus)
 {
   *pstatus = REGRESSION_TEST_INPROGRESS;
+  // These tests don't actually require DNS resolution, but if there are issues
+  // connecting to the DNS server then the test will fail due to error events
+  // from the communication arising. We avoid this by simply extending the DNS
+  // failover period to a very large value.
+  dns_failover_period = 1000;
 
   TSDebug(UTDBG_TAG, "Starting test TSHttpConnectServerIntercept");