You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@trafficserver.apache.org by "Brian Geffon (JIRA)" <ji...@apache.org> on 2015/09/02 04:38:46 UTC

[jira] [Comment Edited] (TS-2193) Trafficserver 4.1 Crash with proxy.config.dns.dedicated_thread = 1

    [ https://issues.apache.org/jira/browse/TS-2193?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14726617#comment-14726617 ] 

Brian Geffon edited comment on TS-2193 at 9/2/15 2:38 AM:
----------------------------------------------------------

The issue here is that ET_DNS threads do not have the thread local that is expected by the per-thread session pools. One easy way to resolve that is to reschedule it to at ET_NET thread. The issue is to make that work I need to take advantage of the "cookie" parameter of the Event class, so it'll require a small modification to UnixEThread to allow the user to specify the cookie and call back the handler later with the cookie instead of the Event pointer. Below is a patch that fixes the crash, [~amc] [~zwoop] please review.

{code}
diff --git a/iocore/eventsystem/I_Event.h b/iocore/eventsystem/I_Event.h
index a61c49c..14056b0 100644
--- a/iocore/eventsystem/I_Event.h
+++ b/iocore/eventsystem/I_Event.h
@@ -212,6 +212,7 @@ public:
   unsigned int immediate : 1;
   unsigned int globally_allocated : 1;
   unsigned int in_heap : 4;
+  bool use_cookie_in_callback;
   int callback_event;

   ink_hrtime timeout_at;
diff --git a/iocore/eventsystem/P_UnixEvent.h b/iocore/eventsystem/P_UnixEvent.h
index a224443..c9e137b 100644
--- a/iocore/eventsystem/P_UnixEvent.h
+++ b/iocore/eventsystem/P_UnixEvent.h
@@ -32,6 +32,7 @@ Event::init(Continuation *c, ink_hrtime atimeout_at, ink_hrtime aperiod)
   period = aperiod;
   immediate = !period && !atimeout_at;
   cancelled = false;
+  use_cookie_in_callback = false;
   return this;
 }

@@ -45,7 +46,7 @@ Event::free()
 TS_INLINE
 Event::Event()
   : ethread(0), in_the_prot_queue(false), in_the_priority_queue(false), immediate(false), globally_allocated(true), in_heap(false),
-    timeout_at(0), period(0)
+    use_cookie_in_callback(false), timeout_at(0), period(0)
 {
 }

diff --git a/iocore/eventsystem/UnixEThread.cc b/iocore/eventsystem/UnixEThread.cc
index fb3af43..ecc3267 100644
--- a/iocore/eventsystem/UnixEThread.cc
+++ b/iocore/eventsystem/UnixEThread.cc
@@ -125,7 +125,7 @@ EThread::process_event(Event *e, int calling_code)
       return;
     }
     Continuation *c_temp = e->continuation;
-    e->continuation->handleEvent(calling_code, e);
+    e->continuation->handleEvent(calling_code, e->use_cookie_in_callback ? e->cookie : e);
     ink_assert(!e->in_the_priority_queue);
     ink_assert(c_temp == e->continuation);
     MUTEX_RELEASE(lock);
diff --git a/iocore/hostdb/HostDB.cc b/iocore/hostdb/HostDB.cc
index 984d137..12cf32e 100644
--- a/iocore/hostdb/HostDB.cc
+++ b/iocore/hostdb/HostDB.cc
@@ -65,6 +65,7 @@ char hostdb_hostfile_path[PATH_NAME_MAX] = "";
 int hostdb_sync_frequency = 120;
 int hostdb_srv_enabled = 0;
 int hostdb_disable_reverse_lookup = 0;
+extern int dns_thread;

 ClassAllocator<HostDBContinuation> hostDBContAllocator("hostDBContAllocator");

@@ -581,18 +582,36 @@ HostDBContinuation::refresh_MD5()
     mutex = hostDB.lock_for_bucket((int)(fold_md5(md5.hash) % hostDB.buckets));
 }

+static void callBackCont(Continuation *cont, int event,  HostDBInfo *r) {
+  if (dns_thread) {
+    /*
+     * TS-3882: Let's make sure we hop back to a net thread because at some point
+     * if you have per-thread server sessions it will attempt to access those pools
+     * that wont exist in a thread local on a DNS thread.
+     */
+    Event *e = eventAllocator.alloc();
+    e->callback_event = event;
+    e->cookie = static_cast<void*>(r);
+    e->init(cont, 0, 0);
+    e->use_cookie_in_callback = true;
+    eventProcessor.schedule(e, ET_NET);
+  } else {
+    cont->handleEvent(event, r);
+  }
+}
+
 static bool
 reply_to_cont(Continuation *cont, HostDBInfo *r, bool is_srv = false)
 {
   if (r == NULL || r->is_srv != is_srv || r->failed()) {
-    cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
+    callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
     return false;
   }

   if (r->reverse_dns) {
     if (!r->hostname()) {
       ink_assert(!"missing hostname");
-      cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
+      callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
       Warning("bogus entry deleted from HostDB: missing hostname");
       hostDB.delete_block(r);
       return false;
@@ -603,7 +622,7 @@ reply_to_cont(Continuation *cont, HostDBInfo *r, bool is_srv = false)
   if (!r->is_srv && r->round_robin) {
     if (!r->rr()) {
       ink_assert(!"missing round-robin");
-      cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
+      callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
       Warning("bogus entry deleted from HostDB: missing round-robin");
       hostDB.delete_block(r);
       return false;
@@ -612,7 +631,7 @@ reply_to_cont(Continuation *cont, HostDBInfo *r, bool is_srv = false)
     Debug("hostdb", "RR of %d with %d good, 1st IP = %s", r->rr()->rrcount, r->rr()->good, ats_ip_ntop(r->ip(), ipb, sizeof ipb));
   }

-  cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, r);
+  callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, r);

   if (!r->full) {
     Warning("bogus entry deleted from HostDB: none");
@@ -752,7 +771,7 @@ HostDBContinuation::insert(unsigned int attl)
     attl = HOST_DB_MAX_TTL;
   r->ip_timeout_interval = attl;
   r->ip_timestamp = hostdb_current_interval;
-  Debug("hostdb", "inserting for: %.*s: (md5: %" PRIx64 ") bucket: %d now: %u timeout: %u ttl: %u", md5.host_len, md5.host_name,
+  Debug("hostdb", "inserting (%p) for: %.*s: (md5: %" PRIx64 ") bucket: %d now: %u timeout: %u ttl: %u", r, md5.host_len, md5.host_name,
         folded_md5, bucket, r->ip_timestamp, r->ip_timeout_interval, attl);
   return r;
 }
{code}

I'm really not thrilled with having to modify the Event class and UnixEThread.


was (Author: briang):
The issue here is that ET_DNS threads do not have the thread local that is expected by the per-thread session pools. One easy way to resolve that is to reschedule it to at ET_NET thread. The issue is to make that work I need to take advantage of the "cookie" parameter of the Event class, so it'll require a small modification to UnixEThread to allow the user to specify the cookie and call back the handler later with the cookie instead of the Event pointer. Below is a patch that fixes the crash, [~amc] [~zwoop] please review.

{code}
diff --git a/iocore/eventsystem/I_Event.h b/iocore/eventsystem/I_Event.h
index a61c49c..14056b0 100644
--- a/iocore/eventsystem/I_Event.h
+++ b/iocore/eventsystem/I_Event.h
@@ -212,6 +212,7 @@ public:
   unsigned int immediate : 1;
   unsigned int globally_allocated : 1;
   unsigned int in_heap : 4;
+  bool use_cookie_in_callback;
   int callback_event;

   ink_hrtime timeout_at;
diff --git a/iocore/eventsystem/P_UnixEvent.h b/iocore/eventsystem/P_UnixEvent.h
index a224443..c9e137b 100644
--- a/iocore/eventsystem/P_UnixEvent.h
+++ b/iocore/eventsystem/P_UnixEvent.h
@@ -32,6 +32,7 @@ Event::init(Continuation *c, ink_hrtime atimeout_at, ink_hrtime aperiod)
   period = aperiod;
   immediate = !period && !atimeout_at;
   cancelled = false;
+  use_cookie_in_callback = false;
   return this;
 }

@@ -45,7 +46,7 @@ Event::free()
 TS_INLINE
 Event::Event()
   : ethread(0), in_the_prot_queue(false), in_the_priority_queue(false), immediate(false), globally_allocated(true), in_heap(false),
-    timeout_at(0), period(0)
+    use_cookie_in_callback(false), timeout_at(0), period(0)
 {
 }

diff --git a/iocore/eventsystem/UnixEThread.cc b/iocore/eventsystem/UnixEThread.cc
index fb3af43..ecc3267 100644
--- a/iocore/eventsystem/UnixEThread.cc
+++ b/iocore/eventsystem/UnixEThread.cc
@@ -125,7 +125,7 @@ EThread::process_event(Event *e, int calling_code)
       return;
     }
     Continuation *c_temp = e->continuation;
-    e->continuation->handleEvent(calling_code, e);
+    e->continuation->handleEvent(calling_code, e->use_cookie_in_callback ? e->cookie : e);
     ink_assert(!e->in_the_priority_queue);
     ink_assert(c_temp == e->continuation);
     MUTEX_RELEASE(lock);
diff --git a/iocore/hostdb/HostDB.cc b/iocore/hostdb/HostDB.cc
index 984d137..12cf32e 100644
--- a/iocore/hostdb/HostDB.cc
+++ b/iocore/hostdb/HostDB.cc
@@ -65,6 +65,7 @@ char hostdb_hostfile_path[PATH_NAME_MAX] = "";
 int hostdb_sync_frequency = 120;
 int hostdb_srv_enabled = 0;
 int hostdb_disable_reverse_lookup = 0;
+extern int dns_thread;

 ClassAllocator<HostDBContinuation> hostDBContAllocator("hostDBContAllocator");

@@ -581,18 +582,36 @@ HostDBContinuation::refresh_MD5()
     mutex = hostDB.lock_for_bucket((int)(fold_md5(md5.hash) % hostDB.buckets));
 }

+static void callBackCont(Continuation *cont, int event,  HostDBInfo *r) {
+  if (dns_thread) {
+    /*
+     * TS-3882: Let's make sure we hop back to a net thread because at some point
+     * if you have per-thread server sessions it will attempt to access those pools
+     * that wont exist in a thread local on a DNS thread.
+     */
+    Event *e = eventAllocator.alloc();
+    e->callback_event = event;
+    e->cookie = static_cast<void*>(r);
+    e->init(cont, 0, 0);
+    e->use_cookie_in_callback = true;
+    eventProcessor.schedule(e, ET_NET);
+  } else {
+    cont->handleEvent(event, r);
+  }
+}
+
 static bool
 reply_to_cont(Continuation *cont, HostDBInfo *r, bool is_srv = false)
 {
   if (r == NULL || r->is_srv != is_srv || r->failed()) {
-    cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
+    callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
     return false;
   }

   if (r->reverse_dns) {
     if (!r->hostname()) {
       ink_assert(!"missing hostname");
-      cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
+      callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
       Warning("bogus entry deleted from HostDB: missing hostname");
       hostDB.delete_block(r);
       return false;
@@ -603,7 +622,7 @@ reply_to_cont(Continuation *cont, HostDBInfo *r, bool is_srv = false)
   if (!r->is_srv && r->round_robin) {
     if (!r->rr()) {
       ink_assert(!"missing round-robin");
-      cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
+      callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
       Warning("bogus entry deleted from HostDB: missing round-robin");
       hostDB.delete_block(r);
       return false;
@@ -612,7 +631,7 @@ reply_to_cont(Continuation *cont, HostDBInfo *r, bool is_srv = false)
     Debug("hostdb", "RR of %d with %d good, 1st IP = %s", r->rr()->rrcount, r->rr()->good, ats_ip_ntop(r->ip(), ipb, sizeof ipb));
   }

-  cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, r);
+  callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, r);

   if (!r->full) {
     Warning("bogus entry deleted from HostDB: none");
@@ -752,7 +771,7 @@ HostDBContinuation::insert(unsigned int attl)
     attl = HOST_DB_MAX_TTL;
   r->ip_timeout_interval = attl;
   r->ip_timestamp = hostdb_current_interval;
-  Debug("hostdb", "inserting for: %.*s: (md5: %" PRIx64 ") bucket: %d now: %u timeout: %u ttl: %u", md5.host_len, md5.host_name,
+  Debug("hostdb", "inserting (%p) for: %.*s: (md5: %" PRIx64 ") bucket: %d now: %u timeout: %u ttl: %u", r, md5.host_len, md5.host_name,
         folded_md5, bucket, r->ip_timestamp, r->ip_timeout_interval, attl);
   return r;
 }
{code}

> Trafficserver 4.1 Crash with proxy.config.dns.dedicated_thread = 1
> ------------------------------------------------------------------
>
>                 Key: TS-2193
>                 URL: https://issues.apache.org/jira/browse/TS-2193
>             Project: Traffic Server
>          Issue Type: Bug
>          Components: DNS
>    Affects Versions: 4.1.2
>            Reporter: Tommy Lee
>            Assignee: Brian Geffon
>              Labels: Crash
>             Fix For: 6.1.0
>
>         Attachments: bt-01.txt
>
>
> Hi all,
>   I've tried to enable DNS Thread without luck.
>   When i set proxy.config.dns.dedicated_thread to 1, it crashes with the information below.
>   The ATS is working in Forward Proxy mode.
>   Thanks in advance.
> --------------------------------------
> traffic.out
> NOTE: Traffic Server received Sig 11: Segmentation fault
> /usr/local/cache-4.1/bin/traffic_server - STACK TRACE: 
> /lib/x86_64-linux-gnu/libpthread.so.0(+0xfcb0)[0x2af714875cb0]
> /usr/local/cache-4.1/bin/traffic_server(_Z16_acquire_sessionP13SessionBucketPK8sockaddrR7INK_MD5P6HttpSM+0x52)[0x51dac2]
> /usr/local/cache-4.1/bin/traffic_server(_ZN18HttpSessionManager15acquire_sessionEP12ContinuationPK8sockaddrPKcP17HttpClientSessionP6HttpSM+0x3d1)[0x51e0f1]
> /usr/local/cache-4.1/bin/traffic_server(_ZN6HttpSM19do_http_server_openEb+0x30c)[0x53644c]
> /usr/local/cache-4.1/bin/traffic_server(_ZN6HttpSM14set_next_stateEv+0x6a0)[0x537560]
> /usr/local/cache-4.1/bin/traffic_server(_ZN6HttpSM14set_next_stateEv+0x57e)[0x53743e]
> /usr/local/cache-4.1/bin/traffic_server(_ZN6HttpSM14set_next_stateEv+0x57e)[0x53743e]
> /usr/local/cache-4.1/bin/traffic_server(_ZN6HttpSM27state_hostdb_reverse_lookupEiPv+0xb9)[0x526b99]
> /usr/local/cache-4.1/bin/traffic_server(_ZN6HttpSM12main_handlerEiPv+0xd8)[0x531be8]
> /usr/local/cache-4.1/bin/traffic_server[0x5d7c8a]
> /usr/local/cache-4.1/bin/traffic_server(_ZN18HostDBContinuation8dnsEventEiP7HostEnt+0x821)[0x5decd1]
> /usr/local/cache-4.1/bin/traffic_server(_ZN8DNSEntry9postEventEiP5Event+0x44)[0x5f7a94]
> /usr/local/cache-4.1/bin/traffic_server[0x5fd382]
> /usr/local/cache-4.1/bin/traffic_server(_ZN10DNSHandler8recv_dnsEiP5Event+0x852)[0x5fee72]
> /usr/local/cache-4.1/bin/traffic_server(_ZN10DNSHandler9mainEventEiP5Event+0x14)[0x5ffd94]
> /usr/local/cache-4.1/bin/traffic_server(_ZN7EThread13process_eventEP5Eventi+0x91)[0x6b2a41]
> /usr/local/cache-4.1/bin/traffic_server(_ZN7EThread7executeEv+0x514)[0x6b3534]
> /usr/local/cache-4.1/bin/traffic_server[0x6b17ea]
> /lib/x86_64-linux-gnu/libpthread.so.0(+0x7e9a)[0x2af71486de9a]
> /lib/x86_64-linux-gnu/libc.so.6(clone+0x6d)[0x2af71558dccd]



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)