You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@trafficserver.apache.org by "Brian Geffon (JIRA)" <ji...@apache.org> on 2015/09/02 04:38:46 UTC
[jira] [Comment Edited] (TS-2193) Trafficserver 4.1 Crash with
proxy.config.dns.dedicated_thread = 1
[ https://issues.apache.org/jira/browse/TS-2193?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14726617#comment-14726617 ]
Brian Geffon edited comment on TS-2193 at 9/2/15 2:38 AM:
----------------------------------------------------------
The issue here is that ET_DNS threads do not have the thread local that is expected by the per-thread session pools. One easy way to resolve that is to reschedule it to at ET_NET thread. The issue is to make that work I need to take advantage of the "cookie" parameter of the Event class, so it'll require a small modification to UnixEThread to allow the user to specify the cookie and call back the handler later with the cookie instead of the Event pointer. Below is a patch that fixes the crash, [~amc] [~zwoop] please review.
{code}
diff --git a/iocore/eventsystem/I_Event.h b/iocore/eventsystem/I_Event.h
index a61c49c..14056b0 100644
--- a/iocore/eventsystem/I_Event.h
+++ b/iocore/eventsystem/I_Event.h
@@ -212,6 +212,7 @@ public:
unsigned int immediate : 1;
unsigned int globally_allocated : 1;
unsigned int in_heap : 4;
+ bool use_cookie_in_callback;
int callback_event;
ink_hrtime timeout_at;
diff --git a/iocore/eventsystem/P_UnixEvent.h b/iocore/eventsystem/P_UnixEvent.h
index a224443..c9e137b 100644
--- a/iocore/eventsystem/P_UnixEvent.h
+++ b/iocore/eventsystem/P_UnixEvent.h
@@ -32,6 +32,7 @@ Event::init(Continuation *c, ink_hrtime atimeout_at, ink_hrtime aperiod)
period = aperiod;
immediate = !period && !atimeout_at;
cancelled = false;
+ use_cookie_in_callback = false;
return this;
}
@@ -45,7 +46,7 @@ Event::free()
TS_INLINE
Event::Event()
: ethread(0), in_the_prot_queue(false), in_the_priority_queue(false), immediate(false), globally_allocated(true), in_heap(false),
- timeout_at(0), period(0)
+ use_cookie_in_callback(false), timeout_at(0), period(0)
{
}
diff --git a/iocore/eventsystem/UnixEThread.cc b/iocore/eventsystem/UnixEThread.cc
index fb3af43..ecc3267 100644
--- a/iocore/eventsystem/UnixEThread.cc
+++ b/iocore/eventsystem/UnixEThread.cc
@@ -125,7 +125,7 @@ EThread::process_event(Event *e, int calling_code)
return;
}
Continuation *c_temp = e->continuation;
- e->continuation->handleEvent(calling_code, e);
+ e->continuation->handleEvent(calling_code, e->use_cookie_in_callback ? e->cookie : e);
ink_assert(!e->in_the_priority_queue);
ink_assert(c_temp == e->continuation);
MUTEX_RELEASE(lock);
diff --git a/iocore/hostdb/HostDB.cc b/iocore/hostdb/HostDB.cc
index 984d137..12cf32e 100644
--- a/iocore/hostdb/HostDB.cc
+++ b/iocore/hostdb/HostDB.cc
@@ -65,6 +65,7 @@ char hostdb_hostfile_path[PATH_NAME_MAX] = "";
int hostdb_sync_frequency = 120;
int hostdb_srv_enabled = 0;
int hostdb_disable_reverse_lookup = 0;
+extern int dns_thread;
ClassAllocator<HostDBContinuation> hostDBContAllocator("hostDBContAllocator");
@@ -581,18 +582,36 @@ HostDBContinuation::refresh_MD5()
mutex = hostDB.lock_for_bucket((int)(fold_md5(md5.hash) % hostDB.buckets));
}
+static void callBackCont(Continuation *cont, int event, HostDBInfo *r) {
+ if (dns_thread) {
+ /*
+ * TS-3882: Let's make sure we hop back to a net thread because at some point
+ * if you have per-thread server sessions it will attempt to access those pools
+ * that wont exist in a thread local on a DNS thread.
+ */
+ Event *e = eventAllocator.alloc();
+ e->callback_event = event;
+ e->cookie = static_cast<void*>(r);
+ e->init(cont, 0, 0);
+ e->use_cookie_in_callback = true;
+ eventProcessor.schedule(e, ET_NET);
+ } else {
+ cont->handleEvent(event, r);
+ }
+}
+
static bool
reply_to_cont(Continuation *cont, HostDBInfo *r, bool is_srv = false)
{
if (r == NULL || r->is_srv != is_srv || r->failed()) {
- cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
+ callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
return false;
}
if (r->reverse_dns) {
if (!r->hostname()) {
ink_assert(!"missing hostname");
- cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
+ callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
Warning("bogus entry deleted from HostDB: missing hostname");
hostDB.delete_block(r);
return false;
@@ -603,7 +622,7 @@ reply_to_cont(Continuation *cont, HostDBInfo *r, bool is_srv = false)
if (!r->is_srv && r->round_robin) {
if (!r->rr()) {
ink_assert(!"missing round-robin");
- cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
+ callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
Warning("bogus entry deleted from HostDB: missing round-robin");
hostDB.delete_block(r);
return false;
@@ -612,7 +631,7 @@ reply_to_cont(Continuation *cont, HostDBInfo *r, bool is_srv = false)
Debug("hostdb", "RR of %d with %d good, 1st IP = %s", r->rr()->rrcount, r->rr()->good, ats_ip_ntop(r->ip(), ipb, sizeof ipb));
}
- cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, r);
+ callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, r);
if (!r->full) {
Warning("bogus entry deleted from HostDB: none");
@@ -752,7 +771,7 @@ HostDBContinuation::insert(unsigned int attl)
attl = HOST_DB_MAX_TTL;
r->ip_timeout_interval = attl;
r->ip_timestamp = hostdb_current_interval;
- Debug("hostdb", "inserting for: %.*s: (md5: %" PRIx64 ") bucket: %d now: %u timeout: %u ttl: %u", md5.host_len, md5.host_name,
+ Debug("hostdb", "inserting (%p) for: %.*s: (md5: %" PRIx64 ") bucket: %d now: %u timeout: %u ttl: %u", r, md5.host_len, md5.host_name,
folded_md5, bucket, r->ip_timestamp, r->ip_timeout_interval, attl);
return r;
}
{code}
I'm really not thrilled with having to modify the Event class and UnixEThread.
was (Author: briang):
The issue here is that ET_DNS threads do not have the thread local that is expected by the per-thread session pools. One easy way to resolve that is to reschedule it to at ET_NET thread. The issue is to make that work I need to take advantage of the "cookie" parameter of the Event class, so it'll require a small modification to UnixEThread to allow the user to specify the cookie and call back the handler later with the cookie instead of the Event pointer. Below is a patch that fixes the crash, [~amc] [~zwoop] please review.
{code}
diff --git a/iocore/eventsystem/I_Event.h b/iocore/eventsystem/I_Event.h
index a61c49c..14056b0 100644
--- a/iocore/eventsystem/I_Event.h
+++ b/iocore/eventsystem/I_Event.h
@@ -212,6 +212,7 @@ public:
unsigned int immediate : 1;
unsigned int globally_allocated : 1;
unsigned int in_heap : 4;
+ bool use_cookie_in_callback;
int callback_event;
ink_hrtime timeout_at;
diff --git a/iocore/eventsystem/P_UnixEvent.h b/iocore/eventsystem/P_UnixEvent.h
index a224443..c9e137b 100644
--- a/iocore/eventsystem/P_UnixEvent.h
+++ b/iocore/eventsystem/P_UnixEvent.h
@@ -32,6 +32,7 @@ Event::init(Continuation *c, ink_hrtime atimeout_at, ink_hrtime aperiod)
period = aperiod;
immediate = !period && !atimeout_at;
cancelled = false;
+ use_cookie_in_callback = false;
return this;
}
@@ -45,7 +46,7 @@ Event::free()
TS_INLINE
Event::Event()
: ethread(0), in_the_prot_queue(false), in_the_priority_queue(false), immediate(false), globally_allocated(true), in_heap(false),
- timeout_at(0), period(0)
+ use_cookie_in_callback(false), timeout_at(0), period(0)
{
}
diff --git a/iocore/eventsystem/UnixEThread.cc b/iocore/eventsystem/UnixEThread.cc
index fb3af43..ecc3267 100644
--- a/iocore/eventsystem/UnixEThread.cc
+++ b/iocore/eventsystem/UnixEThread.cc
@@ -125,7 +125,7 @@ EThread::process_event(Event *e, int calling_code)
return;
}
Continuation *c_temp = e->continuation;
- e->continuation->handleEvent(calling_code, e);
+ e->continuation->handleEvent(calling_code, e->use_cookie_in_callback ? e->cookie : e);
ink_assert(!e->in_the_priority_queue);
ink_assert(c_temp == e->continuation);
MUTEX_RELEASE(lock);
diff --git a/iocore/hostdb/HostDB.cc b/iocore/hostdb/HostDB.cc
index 984d137..12cf32e 100644
--- a/iocore/hostdb/HostDB.cc
+++ b/iocore/hostdb/HostDB.cc
@@ -65,6 +65,7 @@ char hostdb_hostfile_path[PATH_NAME_MAX] = "";
int hostdb_sync_frequency = 120;
int hostdb_srv_enabled = 0;
int hostdb_disable_reverse_lookup = 0;
+extern int dns_thread;
ClassAllocator<HostDBContinuation> hostDBContAllocator("hostDBContAllocator");
@@ -581,18 +582,36 @@ HostDBContinuation::refresh_MD5()
mutex = hostDB.lock_for_bucket((int)(fold_md5(md5.hash) % hostDB.buckets));
}
+static void callBackCont(Continuation *cont, int event, HostDBInfo *r) {
+ if (dns_thread) {
+ /*
+ * TS-3882: Let's make sure we hop back to a net thread because at some point
+ * if you have per-thread server sessions it will attempt to access those pools
+ * that wont exist in a thread local on a DNS thread.
+ */
+ Event *e = eventAllocator.alloc();
+ e->callback_event = event;
+ e->cookie = static_cast<void*>(r);
+ e->init(cont, 0, 0);
+ e->use_cookie_in_callback = true;
+ eventProcessor.schedule(e, ET_NET);
+ } else {
+ cont->handleEvent(event, r);
+ }
+}
+
static bool
reply_to_cont(Continuation *cont, HostDBInfo *r, bool is_srv = false)
{
if (r == NULL || r->is_srv != is_srv || r->failed()) {
- cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
+ callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
return false;
}
if (r->reverse_dns) {
if (!r->hostname()) {
ink_assert(!"missing hostname");
- cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
+ callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
Warning("bogus entry deleted from HostDB: missing hostname");
hostDB.delete_block(r);
return false;
@@ -603,7 +622,7 @@ reply_to_cont(Continuation *cont, HostDBInfo *r, bool is_srv = false)
if (!r->is_srv && r->round_robin) {
if (!r->rr()) {
ink_assert(!"missing round-robin");
- cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
+ callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, NULL);
Warning("bogus entry deleted from HostDB: missing round-robin");
hostDB.delete_block(r);
return false;
@@ -612,7 +631,7 @@ reply_to_cont(Continuation *cont, HostDBInfo *r, bool is_srv = false)
Debug("hostdb", "RR of %d with %d good, 1st IP = %s", r->rr()->rrcount, r->rr()->good, ats_ip_ntop(r->ip(), ipb, sizeof ipb));
}
- cont->handleEvent(is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, r);
+ callBackCont(cont, is_srv ? EVENT_SRV_LOOKUP : EVENT_HOST_DB_LOOKUP, r);
if (!r->full) {
Warning("bogus entry deleted from HostDB: none");
@@ -752,7 +771,7 @@ HostDBContinuation::insert(unsigned int attl)
attl = HOST_DB_MAX_TTL;
r->ip_timeout_interval = attl;
r->ip_timestamp = hostdb_current_interval;
- Debug("hostdb", "inserting for: %.*s: (md5: %" PRIx64 ") bucket: %d now: %u timeout: %u ttl: %u", md5.host_len, md5.host_name,
+ Debug("hostdb", "inserting (%p) for: %.*s: (md5: %" PRIx64 ") bucket: %d now: %u timeout: %u ttl: %u", r, md5.host_len, md5.host_name,
folded_md5, bucket, r->ip_timestamp, r->ip_timeout_interval, attl);
return r;
}
{code}
> Trafficserver 4.1 Crash with proxy.config.dns.dedicated_thread = 1
> ------------------------------------------------------------------
>
> Key: TS-2193
> URL: https://issues.apache.org/jira/browse/TS-2193
> Project: Traffic Server
> Issue Type: Bug
> Components: DNS
> Affects Versions: 4.1.2
> Reporter: Tommy Lee
> Assignee: Brian Geffon
> Labels: Crash
> Fix For: 6.1.0
>
> Attachments: bt-01.txt
>
>
> Hi all,
> I've tried to enable DNS Thread without luck.
> When i set proxy.config.dns.dedicated_thread to 1, it crashes with the information below.
> The ATS is working in Forward Proxy mode.
> Thanks in advance.
> --------------------------------------
> traffic.out
> NOTE: Traffic Server received Sig 11: Segmentation fault
> /usr/local/cache-4.1/bin/traffic_server - STACK TRACE:
> /lib/x86_64-linux-gnu/libpthread.so.0(+0xfcb0)[0x2af714875cb0]
> /usr/local/cache-4.1/bin/traffic_server(_Z16_acquire_sessionP13SessionBucketPK8sockaddrR7INK_MD5P6HttpSM+0x52)[0x51dac2]
> /usr/local/cache-4.1/bin/traffic_server(_ZN18HttpSessionManager15acquire_sessionEP12ContinuationPK8sockaddrPKcP17HttpClientSessionP6HttpSM+0x3d1)[0x51e0f1]
> /usr/local/cache-4.1/bin/traffic_server(_ZN6HttpSM19do_http_server_openEb+0x30c)[0x53644c]
> /usr/local/cache-4.1/bin/traffic_server(_ZN6HttpSM14set_next_stateEv+0x6a0)[0x537560]
> /usr/local/cache-4.1/bin/traffic_server(_ZN6HttpSM14set_next_stateEv+0x57e)[0x53743e]
> /usr/local/cache-4.1/bin/traffic_server(_ZN6HttpSM14set_next_stateEv+0x57e)[0x53743e]
> /usr/local/cache-4.1/bin/traffic_server(_ZN6HttpSM27state_hostdb_reverse_lookupEiPv+0xb9)[0x526b99]
> /usr/local/cache-4.1/bin/traffic_server(_ZN6HttpSM12main_handlerEiPv+0xd8)[0x531be8]
> /usr/local/cache-4.1/bin/traffic_server[0x5d7c8a]
> /usr/local/cache-4.1/bin/traffic_server(_ZN18HostDBContinuation8dnsEventEiP7HostEnt+0x821)[0x5decd1]
> /usr/local/cache-4.1/bin/traffic_server(_ZN8DNSEntry9postEventEiP5Event+0x44)[0x5f7a94]
> /usr/local/cache-4.1/bin/traffic_server[0x5fd382]
> /usr/local/cache-4.1/bin/traffic_server(_ZN10DNSHandler8recv_dnsEiP5Event+0x852)[0x5fee72]
> /usr/local/cache-4.1/bin/traffic_server(_ZN10DNSHandler9mainEventEiP5Event+0x14)[0x5ffd94]
> /usr/local/cache-4.1/bin/traffic_server(_ZN7EThread13process_eventEP5Eventi+0x91)[0x6b2a41]
> /usr/local/cache-4.1/bin/traffic_server(_ZN7EThread7executeEv+0x514)[0x6b3534]
> /usr/local/cache-4.1/bin/traffic_server[0x6b17ea]
> /lib/x86_64-linux-gnu/libpthread.so.0(+0x7e9a)[0x2af71486de9a]
> /lib/x86_64-linux-gnu/libc.so.6(clone+0x6d)[0x2af71558dccd]
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)