You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafficserver.apache.org by zw...@apache.org on 2020/05/20 20:04:57 UTC

[trafficserver] branch 9.0.x updated: Fixes to hostDB to avoid event and memory leaks (#6686)

This is an automated email from the ASF dual-hosted git repository.

zwoop pushed a commit to branch 9.0.x
in repository https://gitbox.apache.org/repos/asf/trafficserver.git


The following commit(s) were added to refs/heads/9.0.x by this push:
     new e05d9e1  Fixes to hostDB to avoid event and memory leaks (#6686)
e05d9e1 is described below

commit e05d9e1ee3f9065fc1621e42b9bfd050a2acf3b9
Author: Susan Hinrichs <sh...@yahoo-inc.com>
AuthorDate: Wed May 20 14:50:51 2020 -0500

    Fixes to hostDB to avoid event and memory leaks (#6686)
    
    Co-authored-by: Susan Hinrichs <sh...@verizonmedia.com>
    (cherry picked from commit 6094492e3efd178298d87629a54d0383d575c9d9)
---
 iocore/hostdb/HostDB.cc           | 60 ++++++++++++++++++++-------------------
 iocore/hostdb/P_HostDBProcessor.h |  1 -
 2 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/iocore/hostdb/HostDB.cc b/iocore/hostdb/HostDB.cc
index 3e6aaad..555ea95 100644
--- a/iocore/hostdb/HostDB.cc
+++ b/iocore/hostdb/HostDB.cc
@@ -43,7 +43,6 @@ HostDBContinuation::Options const HostDBContinuation::DEFAULT_OPTIONS;
 int hostdb_enable                              = true;
 int hostdb_migrate_on_demand                   = true;
 int hostdb_lookup_timeout                      = 30;
-int hostdb_insert_timeout                      = 160;
 int hostdb_re_dns_on_reload                    = false;
 int hostdb_ttl_mode                            = TTL_OBEY;
 unsigned int hostdb_round_robin_max_count      = 16;
@@ -1175,12 +1174,13 @@ HostDBContinuation::dnsEvent(int event, HostEnt *e)
     timeout = nullptr;
   }
   EThread *thread = mutex->thread_holding;
-  if (event == EVENT_INTERVAL) {
+  if (event != DNS_EVENT_LOOKUP) {
+    // This was an event_interval or an event_immediate
+    // Either we timed out, or remove_trigger_pending gave up on us
     if (!action.continuation) {
       // give up on insert, it has been too long
-      // remove_trigger_pending_dns will notify and clean up all requests
-      // including this one.
-      remove_trigger_pending_dns();
+      hostDB.pending_dns_for_hash(hash.hash).remove(this);
+      hostdb_cont_free(this);
       return EVENT_DONE;
     }
     MUTEX_TRY_LOCK(lock, action.mutex, thread);
@@ -1196,8 +1196,6 @@ HostDBContinuation::dnsEvent(int event, HostEnt *e)
       action.continuation->handleEvent(EVENT_HOST_DB_LOOKUP, nullptr);
     }
     action = nullptr;
-    // do not exit yet, wait to see if we can insert into DB
-    timeout = thread->schedule_in(this, HRTIME_SECONDS(hostdb_insert_timeout));
     return EVENT_DONE;
   } else {
     bool failed = !e || !e->good;
@@ -1420,37 +1418,38 @@ HostDBContinuation::dnsEvent(int event, HostEnt *e)
         return EVENT_CONT;
       }
 
-      // We have seen cases were the action.mutex != action.continuation.mutex.
+      // We have seen cases were the action.mutex != action.continuation.mutex.  However, it seems that case
+      // is likely a memory corruption... Thus the introduction of the assert.
       // Since reply_to_cont will call the handler on the action.continuation, it is important that we hold
       // that mutex.
       bool need_to_reschedule = true;
       MUTEX_TRY_LOCK(lock, action.mutex, thread);
       if (lock.is_locked()) {
-        need_to_reschedule = !action.cancelled;
         if (!action.cancelled) {
           if (action.continuation->mutex) {
-            MUTEX_TRY_LOCK(lock2, action.continuation->mutex, thread);
-            if (lock2.is_locked()) {
-              reply_to_cont(action.continuation, r, is_srv());
-              need_to_reschedule = false;
-            }
-          } else {
-            reply_to_cont(action.continuation, r, is_srv());
-            need_to_reschedule = false;
+            ink_release_assert(action.continuation->mutex == action.mutex);
           }
+          reply_to_cont(action.continuation, r, is_srv());
         }
+        need_to_reschedule = false;
       }
+
       if (need_to_reschedule) {
         SET_HANDLER((HostDBContHandler)&HostDBContinuation::probeEvent);
-        // remove_trigger_pending_dns should kick off the current hostDB too
-        // No need to explicitly reschedule
-        remove_trigger_pending_dns();
+        // Will reschedule on affinity thread or current thread
+        timeout = eventProcessor.schedule_in(this, HOST_DB_RETRY_PERIOD);
         return EVENT_CONT;
       }
     }
+
+    // Clean ourselves up
+    hostDB.pending_dns_for_hash(hash.hash).remove(this);
+
     // wake up everyone else who is waiting
     remove_trigger_pending_dns();
 
+    hostdb_cont_free(this);
+
     // all done, or at least scheduled to be all done
     //
     return EVENT_DONE;
@@ -1523,12 +1522,17 @@ HostDBContinuation::probeEvent(int /* event ATS_UNUSED */, Event *e)
   ink_assert(!link.prev && !link.next);
   EThread *t = e ? e->ethread : this_ethread();
 
+  if (timeout) {
+    timeout->cancel(this);
+    timeout = nullptr;
+  }
+
   MUTEX_TRY_LOCK(lock, action.mutex, t);
 
   // Separating lock checks here to make sure things don't break
   // when we check if the action is cancelled.
   if (!lock.is_locked()) {
-    mutex->thread_holding->schedule_in(this, HOST_DB_RETRY_PERIOD);
+    timeout = mutex->thread_holding->schedule_in(this, HOST_DB_RETRY_PERIOD);
     return EVENT_CONT;
   }
 
@@ -1537,13 +1541,8 @@ HostDBContinuation::probeEvent(int /* event ATS_UNUSED */, Event *e)
     return EVENT_DONE;
   }
 
-  // Go ahead and grab the continuation mutex or just grab the action mutex again of there is no continuation mutex
-  MUTEX_TRY_LOCK(lock2, (action.continuation && action.continuation->mutex) ? action.continuation->mutex : action.mutex, t);
-  // Don't continue unless we have both mutexes
-  if (!lock2.is_locked()) {
-    mutex->thread_holding->schedule_in(this, HOST_DB_RETRY_PERIOD);
-    return EVENT_CONT;
-  }
+  //  If the action.continuation->mutex != action.mutex, we have a use after free/realloc
+  ink_release_assert(!action.continuation || action.continuation->mutex == action.mutex);
 
   if (!hostdb_enable || (!*hash.host_name && !hash.ip.isValid())) {
     if (action.continuation) {
@@ -1619,7 +1618,10 @@ HostDBContinuation::remove_trigger_pending_dns()
     if (!affinity_thread || affinity_thread == thread) {
       c->handleEvent(EVENT_IMMEDIATE, nullptr);
     } else {
-      eventProcessor.schedule_imm(c);
+      if (c->timeout) {
+        c->timeout->cancel();
+      }
+      c->timeout = eventProcessor.schedule_imm(c);
     }
   }
 }
diff --git a/iocore/hostdb/P_HostDBProcessor.h b/iocore/hostdb/P_HostDBProcessor.h
index 9739bf6..ac14387 100644
--- a/iocore/hostdb/P_HostDBProcessor.h
+++ b/iocore/hostdb/P_HostDBProcessor.h
@@ -37,7 +37,6 @@
 extern int hostdb_enable;
 extern int hostdb_migrate_on_demand;
 extern int hostdb_lookup_timeout;
-extern int hostdb_insert_timeout;
 extern int hostdb_re_dns_on_reload;
 
 // 0 = obey, 1 = ignore, 2 = min(X,ttl), 3 = max(X,ttl)