You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@httpd.apache.org by Stefan Fritsch <sf...@sfritsch.de> on 2010/11/03 21:12:05 UTC
Proxy regressions
Hi,
some of the recent changes in the proxy code introduced some
regressions:
Two proxy-related tests fail:
t/modules/rewrite.t (Wstat: 0 Tests: 29 Failed: 2)
Failed tests: 23-24
Also, the runtime of the suite seems to have increased greatly for me
(from around 80 to 360 seconds). I haven't made detailed measurements,
but my feeling is that most of the time is spent in the ssl/proxy.t
and modules/proxy.t tests.
And there is a segfault (triggered by the test suite):
Program terminated with signal 11, Segmentation fault.
#0 0xf7715e5d in apr_brigade_cleanup (data=0xa023a28) at
buckets/apr_brigade.c:44
44 buckets/apr_brigade.c: No such file or directory.
in buckets/apr_brigade.c
(gdb) bt full
#0 0xf7715e5d in apr_brigade_cleanup (data=0xa023a28) at
buckets/apr_brigade.c:44
No locals.
#1 0xf70f9d20 in ap_proxy_http_process_response (p=0x9fe55c0,
r=0x9fe5600, backend=0x9fa9510,
worker=0x9fa1908, conf=0x9cb3bb0, server_portstr=0xffc80728
":8529") at mod_proxy_http.c:1931
readbytes = 8
rv = 0
mode = APR_NONBLOCK_READ
finish = 1
rc = 0
c = 0x9fa5698
buffer = "\000\nntent-Type\000
text/html\000\n4bc0\"\000\n11:50:15 GMT\000\ndev OpenSSL/0.9.8o
DAV/2\000\nlete to 127.0.0.1:8529 (localhost)\000\000referer:
\000*)\000\000referer:
\000w\000\000\000G\271X\367\025\000\000\000\364OZ\367\270\346\307\377\211\270X\367\340P\270\t\307\347\307\377\000\000\000\000\000\000\000\000\001\000\000\000$\024\022\367\330\346\307\377\"\322\b\b\340P\270\t\340P\270\t\002\000\000\000$\024\022\367\001\000\000\000\210\034\372\t\210\a\310\377\t\327\b\b\232\332\021\367)\004\000\000;
\000\000\000\a\000\000\000\000\000\000\000"...
buf = 0xa00aeb8 "text/html"
keepchar = 32 ' '
rp = 0xa023a48
e = 0xf6e61e00
bb = 0xa023a28
tmp_bb = 0xa00a988
pass_bb = 0xa023a38
len = 15
backasswards = 0
interim_response = 0
pread_len = 0
save_table = 0xa00a9a0
backend_broke = 0
hop_by_hop_hdrs = {0xf70fac8e "Keep-Alive", 0xf70fb4fd "Proxy-
Authenticate", 0xf70fac99 "TE",
0xf70fac9c "Trailer", 0xf70faca4 "Upgrade", 0x0}
i = 5
te = 0x0
original_status = 200
proxy_status = 200
original_status_line = 0x0
proxy_status_line = 0xa00a998 "200 OK"
origin = 0x9fab6c0
old_timeout = 0
dconf = 0x9cb47c8
do_100_continue = 0
#2 0xf70fa6d5 in proxy_http_handler (r=0x9fe5600, worker=0x9fa1908,
conf=0x9cb3bb0,
url=0x9feefae
"http://localhost:8529/modules/rewrite/foo%20bar.html", proxyname=0x0,
proxyport=0)
at mod_proxy_http.c:2133
locurl = 0x9fef288 "/modules/rewrite/foo%20bar.html"
status = 0
server_portstr =
":8529\000X\367\300U\376\t\320\366\021\367\001\000\000\000\246\223\021\367\250\361\376\t/\000\000"
scheme = 0x9fef230 "http"
proxy_function = 0xf70fb40d "HTTP"
u = 0x9feefb2
"://localhost:8529/modules/rewrite/foo%20bar.html"
backend = 0x9fa9510
is_ssl = 0
c = 0x9fa5698
retry = 0
p = 0x9fe55c0
uri = 0x9fef200
#3 0xf7116278 in proxy_run_scheme_handler (r=0x9fe5600,
worker=0x9fa1908, conf=0x9cb3bb0,
url=0x9feefae
"http://localhost:8529/modules/rewrite/foo%20bar.html", proxyhost=0x0,
proxyport=0)
at mod_proxy.c:2456
pHook = 0x9e68e58
n = 0
rv = -3667972
#4 0xf71126de in proxy_handler (r=0x9fe5600) at mod_proxy.c:1067
url = 0x9feefae
"http://localhost:8529/modules/rewrite/foo%20bar.html"
uri = 0x9feefae
"http://localhost:8529/modules/rewrite/foo%20bar.html"
scheme = 0x9fef168 "http"
p = 0x9feefb2
"://localhost:8529/modules/rewrite/foo%20bar.html"
p2 = 0x1 <Address 0x1 out of bounds>
sconf = 0x9bb97b0
conf = 0x9cb3bb0
proxies = 0x9cb3c48
ents = 0x9cb3c60
i = 0
rc = 167663584
access_status = 0
direct_connect = 0
str = 0x0
maxfwd = -1
balancer = 0x0
worker = 0x9fa1908
attempts = 0
max_attempts = 0
list = 0x9cb3e88
#5 0x08086ff7 in ap_run_handler (r=0x9fe5600) at config.c:165
pHook = 0x9e681c0
n = 2
rv = -1
#6 0x08087716 in ap_invoke_handler (r=0x9fe5600) at config.c:406
handler = 0x0
p = 0xf758fd9c
"U\211\345S\203\354$\350\357'\377\377\201\303LR\001"
result = 0
old_handler = 0xf6e72700 "proxy-server"
ignore = 0x0
#7 0x0809ad8c in ap_process_async_request (r=0x9fe5600) at
http_request.c:316
c = 0x9fa5698
access_status = 0
#8 0x0809ae4c in ap_process_request (r=0x9fe5600) at
http_request.c:361
bb = 0x9fa5698
b = 0x9fe5600
c = 0x9fa5698
rv = 167666464
#9 0x080979f2 in ap_process_http_sync_connection (c=0x9fa5698) at
http_core.c:186
r = 0x9fe5600
cs = 0x9fa5a38
csd = 0x0
mpm_state = 0
#10 0x08097b06 in ap_process_http_connection (c=0x9fa5698) at
http_core.c:227
No locals.
#11 0x0809127b in ap_run_process_connection (c=0x9fa5698) at
connection.c:41
pHook = 0x9e687f0
n = 3
rv = -1
#12 0x08091683 in ap_process_connection (c=0x9fa5698, csd=0x9fa5500)
at connection.c:188
rc = -2
#13 0xf777ef85 in child_main (child_num_arg=1) at prefork.c:667
current_conn = 0x9fa5698
csd = 0x9fa5500
thd = 0x9fa18d8
osthd = 4147828480
ptrans = 0x9fa54c0
allocator = 0x9fa1810
status = 0
i = -1
lr = 0x9b857c0
pollset = 0x9fa1c90
sbh = 0x9fa1c88
bucket_alloc = 0x9fd9590
last_poll_idx = 1
lockfile = 0x0
#14 0xf777f1ca in make_child (s=0x9bb9620, slot=1) at prefork.c:762
pid = 0
#15 0xf777f243 in startup_children (number_to_start=1) at
prefork.c:780
i = 1
#16 0xf777f68b in prefork_run (_pconf=0x9b870a8, plog=0x9bb5418,
s=0x9bb9620) at prefork.c:951
index = -3666824
remaining_children_to_start = 2
rv = 0
#17 0x08070e25 in ap_run_mpm (pconf=0x9b870a8, plog=0x9bb5418,
s=0x9bb9620) at mpm_common.c:92
pHook = 0x9e68930
n = 0
rv = 5
#18 0x0806b873 in main (argc=9, argv=0xffc80e64) at main.c:741
c = 68 'D'
configtestonly = 0
showcompile = 0
confname = 0xffc812ed "/home/sf/apache/perl-
framework/t/conf/httpd.conf"
def_server_root = 0xffc812c8 "/home/sf/apache/perl-
framework/t"
temp_error_log = 0x0
error = 0x0
process = 0x9b85130
pconf = 0x9b870a8
plog = 0x9bb5418
ptemp = 0x9bc7870
pcommands = 0x9b89120
opt = 0x9b891c0
rv = 0
mod = 0x80ad60c
opt_arg = 0xffc8132d "PERL_USEITHREADS"
signal_server = 0
Re: Proxy regressions
Posted by Jeff Trawick <tr...@gmail.com>.
On Wed, Nov 3, 2010 at 4:28 PM, Stefan Fritsch <sf...@sfritsch.de> wrote:
> On Wednesday 03 November 2010, Stefan Fritsch wrote:
>> some of the recent changes in the proxy code introduced some
>> regressions
>
> Strange, I have these problems only with prefork, not with event.
> But with event I get a segfault in the reslist cleanup code.
>
> Can somebody cross-check this?
look familiar?
httpd: misc/apr_reslist.c:161: reslist_cleanup: Assertion `rl->ntotal
== 0' failed.
[Tue Nov 02 07:59:32.863718 2010] [core:notice] [pid 26494:tid
3078747840] child pid 26495 exit signal Aborted (6), possible coredump
in /home/trawick/ApacheCon/23inst
(gdb) where
#0 0x008c1422 in __kernel_vsyscall ()
#1 0x0013a651 in *__GI_raise (sig=6) at
../nptl/sysdeps/unix/sysv/linux/raise.c:64
#2 0x0013da82 in *__GI_abort () at abort.c:92
#3 0x00133718 in *__GI___assert_fail (assertion=0x31c7a2 "rl->ntotal == 0",
file=0x31c780 "misc/apr_reslist.c", line=161, function=0x31c7b2
"reslist_cleanup")
at assert.c:81
#4 0x00315b4c in reslist_cleanup (data_=0x96ec250) at misc/apr_reslist.c:161
#5 0x00dedb5e in run_cleanups (pool=0x96ec210) at memory/unix/apr_pools.c:2308
#6 apr_pool_destroy (pool=0x96ec210) at memory/unix/apr_pools.c:779
#7 0x00dedb40 in apr_pool_destroy (pool=0x96ea208) at
memory/unix/apr_pools.c:776
#8 0x00559959 in clean_child_exit (child_num_arg=<value optimized
out>) at worker.c:379
This is with HEAD of apr-util 1.4.
Re: Proxy regressions
Posted by Stefan Fritsch <sf...@sfritsch.de>.
On Friday 05 November 2010, Joe Orton wrote:
> On Thu, Nov 04, 2010 at 08:57:53PM +0100, Stefan Fritsch wrote:
> > On Thursday 04 November 2010, Jim Jagielski wrote:
> > > Tested so +1
> >
> > Yes, the latest round of fixes seems to have fixed all my
> > problems. Thanks.
>
> I get a bunch of 404s in the aaa.t authz/form tests, did you forget
> to check in t/htdocs/authz/form or is that supposed to be created
> on-the-fly? I haven't dug into it.
>
It's an alias. But it requires mod_session_cookie. I have fixed it to
skip the auth_form tests if that one is missing.
Re: Proxy regressions
Posted by Rainer Jung <ra...@kippdata.de>.
On 06.11.2010 12:03, Stefan Fritsch wrote:
> On Friday 05 November 2010, Jim Jagielski wrote:
>> On Nov 4, 2010, at 4:02 PM, Stefan Fritsch wrote:
>>> On Thursday 04 November 2010, Stefan Fritsch wrote:
>>>> Yes, the latest round of fixes seems to have fixed all my
>>>> problems.
>>>
>>> Oops. Minus the remaining failure already mentioned by Graham:
>>>
>>> t/modules/proxy.t (Wstat: 0 Tests: 15 Failed: 1)
>>>
>>> Failed test: 10
>>
>> FWIW, I can't recreate this:
>>
>> [warning] setting ulimit to allow core files
>> ulimit -c unlimited; /opt/local/bin/perl
>> /Users/jim/src/asf/code/stable/httpd-test/framework/t/TEST
>> /usr/local/apache2/bin/httpd -d
>> /Users/jim/src/asf/code/stable/httpd-test/framework/t -f
>> /Users/jim/src/asf/code/stable/httpd-test/framework/t/conf/httpd.c
>> onf -D APACHE2 using Apache/2.3.9-dev (event MPM)
>
> It works for me with event, too. I only get the failure with prefork.
>
> FWIW, I am using Debian's apr 1.4.2/apr-util 1.3.9 (with some fixes
> backported from 1.3.10).
Me too. Failure only for prefork, not for worker or event. Failure for
apr/apu 1.4.2/1.3.10 and *not* for apr trunk, not even with prefork.
Regards,
Rainer
Re: Proxy regressions
Posted by Stefan Fritsch <sf...@sfritsch.de>.
On Friday 05 November 2010, Jim Jagielski wrote:
> On Nov 4, 2010, at 4:02 PM, Stefan Fritsch wrote:
> > On Thursday 04 November 2010, Stefan Fritsch wrote:
> >> Yes, the latest round of fixes seems to have fixed all my
> >> problems.
> >
> > Oops. Minus the remaining failure already mentioned by Graham:
> >
> > t/modules/proxy.t (Wstat: 0 Tests: 15 Failed: 1)
> >
> > Failed test: 10
>
> FWIW, I can't recreate this:
>
> [warning] setting ulimit to allow core files
> ulimit -c unlimited; /opt/local/bin/perl
> /Users/jim/src/asf/code/stable/httpd-test/framework/t/TEST
> /usr/local/apache2/bin/httpd -d
> /Users/jim/src/asf/code/stable/httpd-test/framework/t -f
> /Users/jim/src/asf/code/stable/httpd-test/framework/t/conf/httpd.c
> onf -D APACHE2 using Apache/2.3.9-dev (event MPM)
It works for me with event, too. I only get the failure with prefork.
FWIW, I am using Debian's apr 1.4.2/apr-util 1.3.9 (with some fixes
backported from 1.3.10).
Re: Proxy regressions
Posted by Graham Leggett <mi...@sharp.fm>.
On 05 Nov 2010, at 10:52 PM, Jim Jagielski wrote:
> FWIW, I can't recreate this:
>
> [warning] setting ulimit to allow core files
> ulimit -c unlimited; /opt/local/bin/perl /Users/jim/src/asf/code/
> stable/httpd-test/framework/t/TEST
> /usr/local/apache2/bin/httpd -d /Users/jim/src/asf/code/stable/
> httpd-test/framework/t -f /Users/jim/src/asf/code/stable/httpd-test/
> framework/t/conf/httpd.conf -D APACHE2
> using Apache/2.3.9-dev (event MPM)
>
> ...
> t/modules/proxy.............ok
> t/modules/proxy_balancer....ok
> ...
>
> proxy.t was maybe a bit slow though...
>
> OS X 10.6.4
I see the issue on MacOSX 10.5.8, wonder if it's a Leopard thing, or
an apr thing?
Regards,
Graham
--
Re: Proxy regressions
Posted by Jim Jagielski <ji...@jaguNET.com>.
On Nov 4, 2010, at 4:02 PM, Stefan Fritsch wrote:
> On Thursday 04 November 2010, Stefan Fritsch wrote:
>> Yes, the latest round of fixes seems to have fixed all my
>> problems.
>
> Oops. Minus the remaining failure already mentioned by Graham:
>
> t/modules/proxy.t (Wstat: 0 Tests: 15 Failed: 1)
> Failed test: 10
>
FWIW, I can't recreate this:
[warning] setting ulimit to allow core files
ulimit -c unlimited; /opt/local/bin/perl /Users/jim/src/asf/code/stable/httpd-test/framework/t/TEST
/usr/local/apache2/bin/httpd -d /Users/jim/src/asf/code/stable/httpd-test/framework/t -f /Users/jim/src/asf/code/stable/httpd-test/framework/t/conf/httpd.conf -D APACHE2
using Apache/2.3.9-dev (event MPM)
...
t/modules/proxy.............ok
t/modules/proxy_balancer....ok
...
proxy.t was maybe a bit slow though...
OS X 10.6.4
Re: Proxy regressions
Posted by Stefan Fritsch <sf...@sfritsch.de>.
On Thursday 04 November 2010, Stefan Fritsch wrote:
> Yes, the latest round of fixes seems to have fixed all my
> problems.
Oops. Minus the remaining failure already mentioned by Graham:
t/modules/proxy.t (Wstat: 0 Tests: 15 Failed: 1)
Failed test: 10
Re: Proxy regressions
Posted by Joe Orton <jo...@redhat.com>.
On Thu, Nov 04, 2010 at 08:57:53PM +0100, Stefan Fritsch wrote:
> On Thursday 04 November 2010, Jim Jagielski wrote:
> > Tested so +1
>
> Yes, the latest round of fixes seems to have fixed all my problems.
> Thanks.
I get a bunch of 404s in the aaa.t authz/form tests, did you forget to
check in t/htdocs/authz/form or is that supposed to be created
on-the-fly? I haven't dug into it.
t/modules/aaa.t ............. 1/38 # Test 23 got: "404" (t/modules/aaa.t at line 117)
# Expected: "302" (form: access without user/env should redirect with 302)
# Test 24 got: <UNDEF> (t/modules/aaa.t at line 125)
# Expected: "/authz/login.html" (form: login without user/env should redirect to login form)
# Test 25 got: "404" (t/modules/aaa.t at line 135)
# Expected: "302" (form: login with wrong passwd should redirect with 302)
# Test 26 got: <UNDEF> (t/modules/aaa.t at line 143)
# Expected: "/authz/login.html" (form: login with wrong passwd should redirect to login form)
...
error_log sez:
127.0.0.1 - udigest [05/Nov/2010:17:28:42 -0400] "GET /authz/digest/index.html HTTP/1.1" 200 26
127.0.0.1 - - [05/Nov/2010:17:28:42 -0400] "GET /authz/digest/index.html HTTP/1.1" 200 26
127.0.0.1 - - [05/Nov/2010:17:28:42 -0400] "GET /authz/digest/index.html.foo HTTP/1.1" 404 225
127.0.0.1 - - [05/Nov/2010:17:28:42 -0400] "GET /authz/form/index.html HTTP/1.1" 404 219
127.0.0.1 - - [05/Nov/2010:17:28:42 -0400] "POST /authz/form/dologin.html HTTP/1.1" 404 221
...
Re: Proxy regressions
Posted by Stefan Fritsch <sf...@sfritsch.de>.
On Thursday 04 November 2010, Jim Jagielski wrote:
> Tested so +1
Yes, the latest round of fixes seems to have fixed all my problems.
Thanks.
Re: Proxy regressions
Posted by Jim Jagielski <ji...@jaguNET.com>.
Tested so +1
On Nov 4, 2010, at 3:36 AM, Ruediger Pluem wrote:
>
>
> On 11/04/2010 01:34 AM, Rainer Jung wrote:
>> On 04.11.2010 00:57, Graham Leggett wrote:
>>> On 03 Nov 2010, at 10:28 PM, Stefan Fritsch wrote:
>>>
>>>> Strange, I have these problems only with prefork, not with event.
>>>> But with event I get a segfault in the reslist cleanup code.
>>>>
>>>> Can somebody cross-check this?
>>>
>>> This smelled like a pool lifetime issue, and looking closer it looked
>>> like we were cleaning up a brigade after we had returned the backend to
>>> the pool. The attached patch stops the crash, and it seems to run at a
>>> sensible speed again (I suspect the late cleanup of the brigade was
>>> sending the code into a spin). Can you give it a try?
>
>> From just looking at the code this seems to be the correct thing to do.
>
>>>
>>> One proxy test still fails though:
>>>
>>> t/modules/proxy.............ok 7/15# Failed test 10 in t/modules/proxy.t
>>> at line 34
>>> t/modules/proxy.............FAILED test 10
>>> Failed 1/15 tests, 93.33% okay
>>>
>>> This fails because the first bucket of the response has been corrupted:
>>>
>>> graham-leggetts-macbook-pro-3:httpd-test minfrin$ curl
>>> http://localhost:8536/reverse/modules/cgi/nph-102.pl
>>> sl/2.3.9-dev OpenS
>>>
>>> I've seen this a few weeks ago and it went away, so I suspect this isn't
>>> proxy that's doing it. Need to dig further.
>>
>> Your patch fixes the crashes for me. All tests pass if the following
>> patch is also applied. Otherwise worker hangs.
>>
>> I don't know, whether that is the right thing to add, especially since
>> "cleared" now has a use very similar to "inreslist". The patch is an
>> addon to r1026665 where the "cleaned" flag was introduced, occasionally
>> set to "1" and tested against "0", but never reset.
>>
>>
>> Index: modules/proxy/proxy_util.c
>> ===================================================================
>> --- modules/proxy/proxy_util.c (revision 1030775)
>> +++ modules/proxy/proxy_util.c (working copy)
>> @@ -2096,6 +2096,7 @@
>> #if APR_HAS_THREADS
>> (*conn)->inreslist = 0;
>> #endif
>> + (*conn)->cleaned = 0;
>>
>> return OK;
>> }
>
> Yeah, this is correct. As I reviewed the patch initially I thought the reset
> would happen through a regular apr_pcalloc for conn, but this is not the case.
> So the reset is the correct thing to do here.
>
> Regards
>
> Rüdiger
>
Re: Proxy regressions
Posted by Rainer Jung <ra...@kippdata.de>.
On 04.11.2010 08:36, Ruediger Pluem wrote:
> On 11/04/2010 01:34 AM, Rainer Jung wrote:
>> On 04.11.2010 00:57, Graham Leggett wrote:
>>> On 03 Nov 2010, at 10:28 PM, Stefan Fritsch wrote:
>>>
>>>> Strange, I have these problems only with prefork, not with event.
>>>> But with event I get a segfault in the reslist cleanup code.
>>>>
>>>> Can somebody cross-check this?
>>>
>>> This smelled like a pool lifetime issue, and looking closer it looked
>>> like we were cleaning up a brigade after we had returned the backend to
>>> the pool. The attached patch stops the crash, and it seems to run at a
>>> sensible speed again (I suspect the late cleanup of the brigade was
>>> sending the code into a spin). Can you give it a try?
>
> From just looking at the code this seems to be the correct thing to do.
OK, I'll let Graham commit that one.
>> Your patch fixes the crashes for me. All tests pass if the following
>> patch is also applied. Otherwise worker hangs.
>>
>> I don't know, whether that is the right thing to add, especially since
>> "cleared" now has a use very similar to "inreslist". The patch is an
>> addon to r1026665 where the "cleaned" flag was introduced, occasionally
>> set to "1" and tested against "0", but never reset.
>>
...
> Yeah, this is correct. As I reviewed the patch initially I thought the reset
> would happen through a regular apr_pcalloc for conn, but this is not the case.
> So the reset is the correct thing to do here.
Committed in r1030850.
Regards,
Rainer
Re: Proxy regressions
Posted by Ruediger Pluem <rp...@apache.org>.
On 11/04/2010 01:34 AM, Rainer Jung wrote:
> On 04.11.2010 00:57, Graham Leggett wrote:
>> On 03 Nov 2010, at 10:28 PM, Stefan Fritsch wrote:
>>
>>> Strange, I have these problems only with prefork, not with event.
>>> But with event I get a segfault in the reslist cleanup code.
>>>
>>> Can somebody cross-check this?
>>
>> This smelled like a pool lifetime issue, and looking closer it looked
>> like we were cleaning up a brigade after we had returned the backend to
>> the pool. The attached patch stops the crash, and it seems to run at a
>> sensible speed again (I suspect the late cleanup of the brigade was
>> sending the code into a spin). Can you give it a try?
From just looking at the code this seems to be the correct thing to do.
>>
>> One proxy test still fails though:
>>
>> t/modules/proxy.............ok 7/15# Failed test 10 in t/modules/proxy.t
>> at line 34
>> t/modules/proxy.............FAILED test 10
>> Failed 1/15 tests, 93.33% okay
>>
>> This fails because the first bucket of the response has been corrupted:
>>
>> graham-leggetts-macbook-pro-3:httpd-test minfrin$ curl
>> http://localhost:8536/reverse/modules/cgi/nph-102.pl
>> sl/2.3.9-dev OpenS
>>
>> I've seen this a few weeks ago and it went away, so I suspect this isn't
>> proxy that's doing it. Need to dig further.
>
> Your patch fixes the crashes for me. All tests pass if the following
> patch is also applied. Otherwise worker hangs.
>
> I don't know, whether that is the right thing to add, especially since
> "cleared" now has a use very similar to "inreslist". The patch is an
> addon to r1026665 where the "cleaned" flag was introduced, occasionally
> set to "1" and tested against "0", but never reset.
>
>
> Index: modules/proxy/proxy_util.c
> ===================================================================
> --- modules/proxy/proxy_util.c (revision 1030775)
> +++ modules/proxy/proxy_util.c (working copy)
> @@ -2096,6 +2096,7 @@
> #if APR_HAS_THREADS
> (*conn)->inreslist = 0;
> #endif
> + (*conn)->cleaned = 0;
>
> return OK;
> }
Yeah, this is correct. As I reviewed the patch initially I thought the reset
would happen through a regular apr_pcalloc for conn, but this is not the case.
So the reset is the correct thing to do here.
Regards
Rüdiger
Re: Proxy regressions
Posted by Rainer Jung <ra...@kippdata.de>.
On 04.11.2010 00:57, Graham Leggett wrote:
> On 03 Nov 2010, at 10:28 PM, Stefan Fritsch wrote:
>
>> Strange, I have these problems only with prefork, not with event.
>> But with event I get a segfault in the reslist cleanup code.
>>
>> Can somebody cross-check this?
>
> This smelled like a pool lifetime issue, and looking closer it looked
> like we were cleaning up a brigade after we had returned the backend to
> the pool. The attached patch stops the crash, and it seems to run at a
> sensible speed again (I suspect the late cleanup of the brigade was
> sending the code into a spin). Can you give it a try?
>
> One proxy test still fails though:
>
> t/modules/proxy.............ok 7/15# Failed test 10 in t/modules/proxy.t
> at line 34
> t/modules/proxy.............FAILED test 10
> Failed 1/15 tests, 93.33% okay
>
> This fails because the first bucket of the response has been corrupted:
>
> graham-leggetts-macbook-pro-3:httpd-test minfrin$ curl
> http://localhost:8536/reverse/modules/cgi/nph-102.pl
> sl/2.3.9-dev OpenS
>
> I've seen this a few weeks ago and it went away, so I suspect this isn't
> proxy that's doing it. Need to dig further.
Your patch fixes the crashes for me. All tests pass if the following
patch is also applied. Otherwise worker hangs.
I don't know, whether that is the right thing to add, especially since
"cleared" now has a use very similar to "inreslist". The patch is an
addon to r1026665 where the "cleaned" flag was introduced, occasionally
set to "1" and tested against "0", but never reset.
Index: modules/proxy/proxy_util.c
===================================================================
--- modules/proxy/proxy_util.c (revision 1030775)
+++ modules/proxy/proxy_util.c (working copy)
@@ -2096,6 +2096,7 @@
#if APR_HAS_THREADS
(*conn)->inreslist = 0;
#endif
+ (*conn)->cleaned = 0;
return OK;
}
Regards,
Rainer
Re: Proxy regressions
Posted by Ruediger Pluem <rp...@apache.org>.
On 11/10/2010 10:31 PM, Jim Jagielski wrote:
> On Nov 10, 2010, at 9:25 AM, Graham Leggett wrote:
>
>> On 10 Nov 2010, at 4:13 PM, Plüm, Rüdiger, VF-Group wrote:
>>
>>> The core input filter in ap_core_input_filter which is used to read
>>> the response from the backend creates the socket bucket from the conn rec
>>> bucket allocator. So the used bucket allocator must live as long
>>> as the conn rec of the backend connection lives.
>>> And the conn rec of the backend connection lives longer than one request
>>> and possibly longer than the frontend connection if the backend has keepalive
>>> enabled, in fact it is reused by the next frontend request that leases the
>>> backend connection from the backend connection pool.
>> In that case, the way to fix this would be to insert the frontend conn_rec's allocator into the backend conn_rec when we take it out of the pool, and then remove this allocator again when we're finished and place the connection back into the pool.
>>
>> This way, all our buckets would live as long as the frontend allocator, and we would be free to return the backend conn_rec back to the pool at any time.
>
> +1... Seems the only way to address this.
>
As long as underlying connection filters like mod_ssl do not buffer any buckets that need to be used
later once we get the connection back from the pool. I know I play devil's advocate here :-).
Regards
Rüdiger
Re: Proxy regressions
Posted by Jim Jagielski <ji...@jaguNET.com>.
On Nov 10, 2010, at 9:25 AM, Graham Leggett wrote:
> On 10 Nov 2010, at 4:13 PM, Plüm, Rüdiger, VF-Group wrote:
>
>> The core input filter in ap_core_input_filter which is used to read
>> the response from the backend creates the socket bucket from the conn rec
>> bucket allocator. So the used bucket allocator must live as long
>> as the conn rec of the backend connection lives.
>> And the conn rec of the backend connection lives longer than one request
>> and possibly longer than the frontend connection if the backend has keepalive
>> enabled, in fact it is reused by the next frontend request that leases the
>> backend connection from the backend connection pool.
>
> In that case, the way to fix this would be to insert the frontend conn_rec's allocator into the backend conn_rec when we take it out of the pool, and then remove this allocator again when we're finished and place the connection back into the pool.
>
> This way, all our buckets would live as long as the frontend allocator, and we would be free to return the backend conn_rec back to the pool at any time.
+1... Seems the only way to address this.
Re: Proxy regressions
Posted by Graham Leggett <mi...@sharp.fm>.
On 10 Nov 2010, at 4:13 PM, Plüm, Rüdiger, VF-Group wrote:
> The core input filter in ap_core_input_filter which is used to read
> the response from the backend creates the socket bucket from the
> conn rec
> bucket allocator. So the used bucket allocator must live as long
> as the conn rec of the backend connection lives.
> And the conn rec of the backend connection lives longer than one
> request
> and possibly longer than the frontend connection if the backend has
> keepalive
> enabled, in fact it is reused by the next frontend request that
> leases the
> backend connection from the backend connection pool.
In that case, the way to fix this would be to insert the frontend
conn_rec's allocator into the backend conn_rec when we take it out of
the pool, and then remove this allocator again when we're finished and
place the connection back into the pool.
This way, all our buckets would live as long as the frontend
allocator, and we would be free to return the backend conn_rec back to
the pool at any time.
Regards,
Graham
--
RE: Proxy regressions
Posted by "Plüm, Rüdiger, VF-Group" <ru...@vodafone.com>.
> -----Original Message-----
> From: "Plüm, Rüdiger, VF-Group"
> Sent: Mittwoch, 10. November 2010 14:55
> To: dev@httpd.apache.org
> Subject: RE: Proxy regressions
>
>
>
> > -----Original Message-----
> > From: Graham Leggett
> > Sent: Mittwoch, 10. November 2010 14:05
> > To: dev@httpd.apache.org
> > Subject: Re: Proxy regressions
> >
> > On 10 Nov 2010, at 1:09 PM, Plüm, Rüdiger, VF-Group wrote:
> >
> > >> The proxy currently creates the allocator in
> > >> ap_proxy_connection_create(), and then passes the
> allocator to the
> > >> various submodules via the ap_run_create_connection() hook, so it
> > >> looks like we just passing the wrong allocator.
> > >
> > > The problem is that we keep the connrec structure in the
> conn pool.
> > > It is not created each time we fetch a connection from the
> > conn pool.
> > > This is required to enable keepalives with SSL backends.
> > > As said if we pass the bucket allocator from the front end
> > connection
> > > we possibly end up with other pool lifetime issues and as
> I speak of
> > > it SSL comes to my mind.
> >
> > This doesn't sound right to me - ideally anything doing a read of
> > anything that will ultimately be sent up the filter stack
> should use
> > the allocator belonging to the frontend connection. When
> the backend
> > connection is returned to the pool, the allocator should be
> removed,
> > and the next allocator inserted when the backend connection is
> > subsequently reused.
> >
> > Currently what we seem to have is data allocated out of a
> pool that
> > has a lifetime completely unrelated to the frontend request,
> > and then
> > we're working around this by trying to keep this unrelated
> > pool alive
> > way longer than it's useful lifetime, and at least as long as the
> > original request. This seems broken to me, we should really
> be using
> > the correct pools all the way through.
>
> As said this sounds doable for http backends, but not for
> https backends
> where we need to keep some data regarding the SSL state in
> the conn rec
> of the backend connection.
The core input filter in ap_core_input_filter which is used to read
the response from the backend creates the socket bucket from the conn rec
bucket allocator. So the used bucket allocator must live as long
as the conn rec of the backend connection lives.
And the conn rec of the backend connection lives longer than one request
and possibly longer than the frontend connection if the backend has keepalive
enabled, in fact it is reused by the next frontend request that leases the
backend connection from the backend connection pool.
Regards
Rüdiger
Re: Proxy regressions
Posted by Graham Leggett <mi...@sharp.fm>.
On 10 Nov 2010, at 3:54 PM, Plüm, Rüdiger, VF-Group wrote:
> As said this sounds doable for http backends, but not for https
> backends
> where we need to keep some data regarding the SSL state in the conn
> rec
> of the backend connection.
This is entirely fine, it's only the contents of the buckets in the
brigade that need to have a lifetime as long as the request, other SSL
specific data can work as it does now, there is no need for it to be
changed.
>> A further issue is with backend servers where keepalive is switched
>> off. Instead of acknowledging the connection close and releasing the
>> connection, we hold the connection open for ages until the client
>> finally acknowledges the request as finished.
>
> Is this a problem of a too long lingering close period on the
> backend server
> blocking the expensive backend server threads?
> I mean in general the backend server is the one who closes the
> connection
> if its keepalive timeout was used up and hence it can close the socket
> from its side. The only thing that comes to mind that could keep it
> blocked is a lingering close. Is this the case here?
We do see this, yes, and it caused two outages in our testing
environment recently caused by the initial version of the patch and
the original pool lifetime bug.
>> In v2.0, it was only saved in the connection if a keepalive was
>> present. If there was no keepalive, it was released immediately.
>
> Which resulted in no connection pooling at all.
Having a connection pool isn't a win if it comes at the cost of
performance drag elsewhere. Ultimately the connections themselves in
our case are far cheaper than the application servers behind them.
Regards,
Graham
--
RE: Proxy regressions
Posted by "Plüm, Rüdiger, VF-Group" <ru...@vodafone.com>.
> -----Original Message-----
> From: Graham Leggett
> Sent: Mittwoch, 10. November 2010 14:05
> To: dev@httpd.apache.org
> Subject: Re: Proxy regressions
>
> On 10 Nov 2010, at 1:09 PM, Plüm, Rüdiger, VF-Group wrote:
>
> >> The proxy currently creates the allocator in
> >> ap_proxy_connection_create(), and then passes the allocator to the
> >> various submodules via the ap_run_create_connection() hook, so it
> >> looks like we just passing the wrong allocator.
> >
> > The problem is that we keep the connrec structure in the conn pool.
> > It is not created each time we fetch a connection from the
> conn pool.
> > This is required to enable keepalives with SSL backends.
> > As said if we pass the bucket allocator from the front end
> connection
> > we possibly end up with other pool lifetime issues and as I speak of
> > it SSL comes to my mind.
>
> This doesn't sound right to me - ideally anything doing a read of
> anything that will ultimately be sent up the filter stack should use
> the allocator belonging to the frontend connection. When the backend
> connection is returned to the pool, the allocator should be removed,
> and the next allocator inserted when the backend connection is
> subsequently reused.
>
> Currently what we seem to have is data allocated out of a pool that
> has a lifetime completely unrelated to the frontend request,
> and then
> we're working around this by trying to keep this unrelated
> pool alive
> way longer than it's useful lifetime, and at least as long as the
> original request. This seems broken to me, we should really be using
> the correct pools all the way through.
As said this sounds doable for http backends, but not for https backends
where we need to keep some data regarding the SSL state in the conn rec
of the backend connection.
>
> >> Right now, we are holding backend connections open for as
> long as it
> >> takes for a frontend connection to acknowledge the request. A
> >> typical
> >> backend could be finished within milliseconds, while the
> >> connection to
> >> the frontend often takes hundreds, sometimes thousands of
> >> milliseconds. While the backend connection is being held open, that
> >> slot cannot be used by anyone else.
> >
> > Used by whom?
>
> Another worker in httpd.
>
> > As said if you put it back in the pool and your pool has the
> > same max size as the number of threads in the process then
> there is
> > some
> > chance that this connection will idle in the pool until the actual
> > thread
> > sent data to the client and fetches the connection from the pool
> > again.
> > As said I can only follow if the max pool size is configured to be
> > smaller
> > than the number of threads in the process. Do you do this?
>
> Yes. Threads in an application server are expensive, while
> threads in
> httpd are cheap.
>
> A further issue is with backend servers where keepalive is switched
> off. Instead of acknowledging the connection close and releasing the
> connection, we hold the connection open for ages until the client
> finally acknowledges the request as finished.
Is this a problem of a too long lingering close period on the backend server
blocking the expensive backend server threads?
I mean in general the backend server is the one who closes the connection
if its keepalive timeout was used up and hence it can close the socket
from its side. The only thing that comes to mind that could keep it
blocked is a lingering close. Is this the case here?
>
> >> This issue is a regression that was introduced in httpd v2.2, httpd
> >> 2.0 released the connection as soon as it was done.
> >
> > Because it had a completely different architecture and the released
> > connection
> > was not usable by anyone else but the same frontend connection
> > because it was stored
> > in the conn structure of the frontend request. So the result with
> > 2.0 is the same
> > as with 2.2.
>
> In v2.0, it was only saved in the connection if a keepalive was
> present. If there was no keepalive, it was released immediately.
Which resulted in no connection pooling at all.
Regards
Rüdiger
Re: Proxy regressions
Posted by Graham Leggett <mi...@sharp.fm>.
On 10 Nov 2010, at 1:09 PM, Plüm, Rüdiger, VF-Group wrote:
>> The proxy currently creates the allocator in
>> ap_proxy_connection_create(), and then passes the allocator to the
>> various submodules via the ap_run_create_connection() hook, so it
> looks like we are just passing the wrong allocator.
>
> The problem is that we keep the connrec structure in the conn pool.
> It is not created each time we fetch a connection from the conn pool.
> This is required to enable keepalives with SSL backends.
> As said if we pass the bucket allocator from the front end connection
> we possibly end up with other pool lifetime issues and as I speak of
> it SSL comes to my mind.
This doesn't sound right to me - ideally anything doing a read of
anything that will ultimately be sent up the filter stack should use
the allocator belonging to the frontend connection. When the backend
connection is returned to the pool, the allocator should be removed,
and the next allocator inserted when the backend connection is
subsequently reused.
Currently what we seem to have is data allocated out of a pool that
has a lifetime completely unrelated to the frontend request, and then
we're working around this by trying to keep this unrelated pool alive
way longer than its useful lifetime, and at least as long as the
original request. This seems broken to me, we should really be using
the correct pools all the way through.
>> Right now, we are holding backend connections open for as long as it
>> takes for a frontend connection to acknowledge the request. A
>> typical
>> backend could be finished within milliseconds, while the
>> connection to
>> the frontend often takes hundreds, sometimes thousands of
>> milliseconds. While the backend connection is being held open, that
>> slot cannot be used by anyone else.
>
> Used by whom?
Another worker in httpd.
> As said if you put it back in the pool and your pool has the
> same max size as the number of threads in the process then there is
> some
> chance that this connection will idle in the pool until the actual
> thread
> sent data to the client and fetches the connection from the pool
> again.
> As said I can only follow if the max pool size is configured to be
> smaller
> than the number of threads in the process. Do you do this?
Yes. Threads in an application server are expensive, while threads in
httpd are cheap.
A further issue is with backend servers where keepalive is switched
off. Instead of acknowledging the connection close and releasing the
connection, we hold the connection open for ages until the client
finally acknowledges the request as finished.
>> This issue is a regression that was introduced in httpd v2.2, httpd
>> 2.0 released the connection as soon as it was done.
>
> Because it had a completely different architecture and the released
> connection
> was not usable by anyone else but the same frontend connection
> because it was stored
> in the conn structure of the frontend request. So the result with
> 2.0 is the same
> as with 2.2.
In v2.0, it was only saved in the connection if a keepalive was
present. If there was no keepalive, it was released immediately.
Regards,
Graham
--
RE: Proxy regressions
Posted by "Plüm, Rüdiger, VF-Group" <ru...@vodafone.com>.
> -----Original Message-----
> From: Graham Leggett
> Sent: Mittwoch, 10. November 2010 11:47
> To: dev@httpd.apache.org
> Subject: Re: Proxy regressions
>
> On 10 Nov 2010, at 11:49 AM, Plüm, Rüdiger, VF-Group wrote:
>
> >> Have we not created a pool lifetime problem for ourselves here?
> >>
> >> In theory, any attempt to read from the backend connection should
> >> create buckets allocated from the r->connection->bucket_alloc
> >> allocator, which should be removed from the backend connection when
> >> the backend connection is returned to the pool.
> >
> > I guess we need a dedicated bucket_allocator at least in
> the beginning
> > as we cannot guarantee that anyone in the create_connection
> hook uses
> > the bucket_allocator to create an object that should
> persist until the
> > connrec of the backend connection dies.
> >
> > Exchanging the allocator later each time we get the connection from
> > the conn pool might create similar risks. But I admit that the latter
> > is only a gut feeling and I simply do not feel well with
> exchanging
> > the
> > allocator. I have no real hard facts why this cannot be done.
>
> The proxy currently creates the allocator in
> ap_proxy_connection_create(), and then passes the allocator to the
> various submodules via the ap_run_create_connection() hook, so it
> looks like we are just passing the wrong allocator.
The problem is that we keep the connrec structure in the conn pool.
It is not created each time we fetch a connection from the conn pool.
This is required to enable keepalives with SSL backends.
As said if we pass the bucket allocator from the front end connection
we possibly end up with other pool lifetime issues and as I speak of
it SSL comes to my mind.
>
> > So without trying to offend anyone, can we see the use case
> for the
> > asap returning
> > again?
>
> Right now, we are holding backend connections open for as long as it
> takes for a frontend connection to acknowledge the request. A
> typical
> backend could be finished within milliseconds, while the
> connection to
> the frontend often takes hundreds, sometimes thousands of
> milliseconds. While the backend connection is being held open, that
> slot cannot be used by anyone else.
Used by whom? As said if you put it back in the pool and your pool has the
same max size as the number of threads in the process then there is some
chance that this connection will idle in the pool until the actual thread
sent data to the client and fetches the connection from the pool again.
As said I can only follow if the max pool size is configured to be smaller
than the number of threads in the process. Do you do this?
Another possibility would be that depending on the request behaviour
on your frontend and the distribution between locally handled requests
(e.g. static content, cache) and backend content it might happen that
the number of actual backend connections in the pool does not increase that
much (aka. to its max size) if the connection is returned to the pool asap.
Do you intend to get this effect?
>
> In addition, when backend keepalives are kept short (as ours
> are), the
> time it takes to serve a frontend request can exceed the keepalive
> timeout, creating unnecessary errors.
Why does this create errors? The connection is released by the backend
because it has delivered all data to the frontend server and has not received
a new request within the keepalive timeframe. So the backend is actually
free to reuse these resources. And the frontend will notice that the backend
has disconnected the next time it fetches the connection again from the pool
and will establish a new connection.
>
> This issue is a regression that was introduced in httpd v2.2, httpd
> 2.0 released the connection as soon as it was done.
Because it had a completely different architecture and the released connection
was not usable by anyone else but the same frontend connection because it was stored
in the conn structure of the frontend request. So the result with 2.0 is the same
as with 2.2.
Regards
Rüdiger
Re: Proxy regressions
Posted by Graham Leggett <mi...@sharp.fm>.
On 10 Nov 2010, at 11:49 AM, Plüm, Rüdiger, VF-Group wrote:
>> Have we not created a pool lifetime problem for ourselves here?
>>
>> In theory, any attempt to read from the backend connection should
>> create buckets allocated from the r->connection->bucket_alloc
>> allocator, which should be removed from the backend connection when
>> the backend connection is returned to the pool.
>
> I guess we need a dedicated bucket_allocator at least in the beginning
> as we cannot guarantee that anyone in the create_connection hook uses
> the bucket_allocator to create an object that should persist until the
> connrec of the backend connection dies.
>
> Exchanging the allocator later each time we get the connection from
> the conn pool might create similar risks. But I admit that the latter
> is only a gut feeling and I simply do not feel well with exchanging
> the
> allocator. I have no real hard facts why this cannot be done.
The proxy currently creates the allocator in
ap_proxy_connection_create(), and then passes the allocator to the
various submodules via the ap_run_create_connection() hook, so it
looks like we are just passing the wrong allocator.
> So without trying to offend anyone, can we see the use case for the
> asap returning
> again?
Right now, we are holding backend connections open for as long as it
takes for a frontend connection to acknowledge the request. A typical
backend could be finished within milliseconds, while the connection to
the frontend often takes hundreds, sometimes thousands of
milliseconds. While the backend connection is being held open, that
slot cannot be used by anyone else.
In addition, when backend keepalives are kept short (as ours are), the
time it takes to serve a frontend request can exceed the keepalive
timeout, creating unnecessary errors.
This issue is a regression that was introduced in httpd v2.2, httpd
2.0 released the connection as soon as it was done.
Regards,
Graham
--
RE: Proxy regressions
Posted by "Plüm, Rüdiger, VF-Group" <ru...@vodafone.com>.
> -----Original Message-----
> From: Graham Leggett [mailto:minfrin@sharp.fm]
> Sent: Mittwoch, 10. November 2010 10:28
> To: dev@httpd.apache.org
> Subject: Re: Proxy regressions
>
> On 10 Nov 2010, at 9:02 AM, Ruediger Pluem wrote:
>
> >> The fix in r1030855 is wrong:
> ap_proxy_buckets_lifetime_transform is
> >> not copying the data but only creates transient buckets
> from the data
> >> in the buckets in bb. If you then destroy bb before
> passing pass_bb,
> >> the data where the buckets in pass_bb point gets freed and
> later gets
> >> overwritten.
> >
> > Good catch. I reviewed the code again and then remembered
> the idea why
> > we use transient buckets: If everything goes well and the data is
> > sent out
> > by ap_pass_brigade no copying of the contents of the buckets is
> > needed.
> > Only if things are buffered somewhere down the chain, the
> according
> > filter
> > needs to set the buckets aside (which causes copying).
> > So I guess with the approach to release the backend connection
> > immediately
> > we will lose this advantage. That is regrettable.
> > I guess an easy solution would be to use heap buckets instead of
> > transient
> > buckets.
>
> Have we not created a pool lifetime problem for ourselves here?
>
> In theory, any attempt to read from the backend connection should
> create buckets allocated from the r->connection->bucket_alloc
> allocator, which should be removed from the backend connection when
> the backend connection is returned to the pool.
I guess we need a dedicated bucket_allocator at least in the beginning
as we cannot guarantee that anyone in the create_connection hook uses
the bucket_allocator to create an object that should persist until the
connrec of the backend connection dies.
Exchanging the allocator later each time we get the connection from
the conn pool might create similar risks. But I admit that the latter
is only a gut feeling and I simply do not feel well with exchanging the
allocator. I have no real hard facts why this cannot be done.
>
> Instead, it seems we create our own bucket_alloc from the backend
> connection's pool, and then we only work because we're holding onto
> the backend until we've stopped writing buckets - far longer than we
> should be, and all along we've been working, but only by accident.
In the light of these regressions I took a step back.
My first thought on returning the backend connection to the connection
pool asap was: This makes sense.
Now after taking a step back I ask myself: Why is this beneficial?
In the default configuration your connection pool to the backend can be as large
as the number of threads you use per process in your MPM.
So not returning it to the conn pool asap does not block any other thread from
getting a connection to the backend from this conn pool.
Only one exception comes to my mind where the asap returning makes sense:
The maximum pool size has been configured to be lower than the number of threads.
So without trying to offend anyone, can we see the use case for the asap returning
again?
Regards
Rüdiger
Re: Proxy regressions
Posted by Graham Leggett <mi...@sharp.fm>.
On 10 Nov 2010, at 9:02 AM, Ruediger Pluem wrote:
>> The fix in r1030855 is wrong: ap_proxy_buckets_lifetime_transform is
>> not copying the data but only creates transient buckets from the data
>> in the buckets in bb. If you then destroy bb before passing pass_bb,
>> the data where the buckets in pass_bb point gets freed and later gets
>> overwritten.
>
> Good catch. I reviewed the code again and then remembered the idea why
> we use transient buckets: If everything goes well and the data is
> sent out
> by ap_pass_brigade no copying of the contents of the buckets is
> needed.
> Only if things are buffered somewhere down the chain, the according
> filter
> needs to set the buckets aside (which causes copying).
> So I guess with the approach to release the backend connection
> immediately
> we will lose this advantage. That is regrettable.
> I guess an easy solution would be to use heap buckets instead of
> transient
> buckets.
Have we not created a pool lifetime problem for ourselves here?
In theory, any attempt to read from the backend connection should
create buckets allocated from the r->connection->bucket_alloc
allocator, which should be removed from the backend connection when
the backend connection is returned to the pool.
Instead, it seems we create our own bucket_alloc from the backend
connection's pool, and then we only work because we're holding onto
the backend until we've stopped writing buckets - far longer than we
should be, and all along we've been working, but only by accident.
Regards,
Graham
--
Re: Proxy regressions
Posted by Ruediger Pluem <rp...@apache.org>.
On 11/09/2010 09:54 PM, Stefan Fritsch wrote:
> On Thursday 04 November 2010, Graham Leggett wrote:
>> On 03 Nov 2010, at 10:28 PM, Stefan Fritsch wrote:
>>> Strange, I have these problems only with prefork, not with event.
>>> But with event I get a segfault in the reslist cleanup code.
>>>
>>> Can somebody cross-check this?
>> This smelled like a pool lifetime issue, and looking closer it
>> looked like we were cleaning up a brigade after we had returned
>> the backend to the pool. The attached patch stops the crash, and
>> it seems to run at a sensible speed again (I suspect the late
>> cleanup of the brigade was sending the code into a spin). Can you
>> give it a try?
>>
>> One proxy test still fails though:
>>
>> t/modules/proxy.............ok 7/15# Failed test 10 in t/modules/
>> proxy.t at line 34
>> t/modules/proxy.............FAILED test 10
>> Failed 1/15 tests, 93.33% okay
>>
>> This fails because the first bucket of the response has been
>> corrupted:
>>
>> graham-leggetts-macbook-pro-3:httpd-test minfrin$ curl
>> http://localhost:8536/reverse/modules/cgi/nph-102.pl sl/2.3.9-dev
>> OpenS
>>
>> I've seen this a few weeks ago and it went away, so I suspect this
>> isn't proxy that's doing it. Need to dig further.
>
> The fix in r1030855 is wrong: ap_proxy_buckets_lifetime_transform is
> not copying the data but only creates transient buckets from the data
> in the buckets in bb. If you then destroy bb before passing pass_bb,
> the data where the buckets in pass_bb point gets freed and later gets
> overwritten.
Good catch. I reviewed the code again and then remembered the idea why
we use transient buckets: If everything goes well and the data is sent out
by ap_pass_brigade no copying of the contents of the buckets is needed.
Only if things are buffered somewhere down the chain, the according filter
needs to set the buckets aside (which causes copying).
So I guess with the approach to release the backend connection immediately
we will lose this advantage. That is regrettable.
I guess an easy solution would be to use heap buckets instead of transient
buckets.
Regards
Rüdiger
Re: Proxy regressions
Posted by Stefan Fritsch <sf...@sfritsch.de>.
On Thursday 04 November 2010, Graham Leggett wrote:
> On 03 Nov 2010, at 10:28 PM, Stefan Fritsch wrote:
> > Strange, I have these problems only with prefork, not with event.
> > But with event I get a segfault in the reslist cleanup code.
> >
> > Can somebody cross-check this?
>
> This smelled like a pool lifetime issue, and looking closer it
> looked like we were cleaning up a brigade after we had returned
> the backend to the pool. The attached patch stops the crash, and
> it seems to run at a sensible speed again (I suspect the late
> cleanup of the brigade was sending the code into a spin). Can you
> give it a try?
>
> One proxy test still fails though:
>
> t/modules/proxy.............ok 7/15# Failed test 10 in t/modules/
> proxy.t at line 34
> t/modules/proxy.............FAILED test 10
> Failed 1/15 tests, 93.33% okay
>
> This fails because the first bucket of the response has been
> corrupted:
>
> graham-leggetts-macbook-pro-3:httpd-test minfrin$ curl
> http://localhost:8536/reverse/modules/cgi/nph-102.pl sl/2.3.9-dev
> OpenS
>
> I've seen this a few weeks ago and it went away, so I suspect this
> isn't proxy that's doing it. Need to dig further.
The fix in r1030855 is wrong: ap_proxy_buckets_lifetime_transform is
not copying the data but only creates transient buckets from the data
in the buckets in bb. If you then destroy bb before passing pass_bb,
the data where the buckets in pass_bb point gets freed and later gets
overwritten.
Cheers,
Stefan
Re: Proxy regressions
Posted by Graham Leggett <mi...@sharp.fm>.
On 03 Nov 2010, at 10:28 PM, Stefan Fritsch wrote:
> Strange, I have these problems only with prefork, not with event.
> But with event I get a segfault in the reslist cleanup code.
>
> Can somebody cross-check this?
This smelled like a pool lifetime issue, and looking closer it looked
like we were cleaning up a brigade after we had returned the backend
to the pool. The attached patch stops the crash, and it seems to run
at a sensible speed again (I suspect the late cleanup of the brigade
was sending the code into a spin). Can you give it a try?
One proxy test still fails though:
t/modules/proxy.............ok 7/15# Failed test 10 in t/modules/
proxy.t at line 34
t/modules/proxy.............FAILED test 10
Failed 1/15 tests, 93.33% okay
This fails because the first bucket of the response has been corrupted:
graham-leggetts-macbook-pro-3:httpd-test minfrin$ curl http://localhost:8536/reverse/modules/cgi/nph-102.pl
sl/2.3.9-dev OpenS
I've seen this a few weeks ago and it went away, so I suspect this
isn't proxy that's doing it. Need to dig further.
Regards,
Graham
--
Re: Proxy regressions
Posted by Stefan Fritsch <sf...@sfritsch.de>.
On Wednesday 03 November 2010, Stefan Fritsch wrote:
> some of the recent changes in the proxy code introduced some
> regressions
Strange, I have these problems only with prefork, not with event.
But with event I get a segfault in the reslist cleanup code.
Can somebody cross-check this?
Re: Proxy regressions
Posted by Rainer Jung <ra...@kippdata.de>.
On 03.11.2010 21:12, Stefan Fritsch wrote:
> some of the recent changes in the proxy code introduced some
> regressions:
>
> Two proxy-related tests fail:
>
> t/modules/rewrite.t (Wstat: 0 Tests: 29 Failed: 2)
> Failed tests: 23-24
Not reproducible with prefork here, using Solaris 10, r1030642 of httpd
trunk and r1004302 apr trunk. But the two tests fail for worker and event:
# testing : request was proxied
# expected: JACKPOT
# received: 500 unexpected EOF before status line seen
not ok 23
# Failed test 23 in t/modules/rewrite.t at line 68
# testing : per-dir proxied rewrite escaping worked
# expected: foo bar
# received: 500 unexpected EOF before status line seen
not ok 24
> Also, the runtime of the suite seems to have increased greatly for me
> (from around 80 to 360 seconds). I haven't made detailed measurements,
> but my feeling is that most of the time is spent in the ssl/proxy.t
> and modules/proxy.t tests.
In my prefork run, t/modules/proxy.t was a bit slow, but t/ssl/proxy.t
was very slow.
For worker, also t/modules/proxy.t was slow, in fact hanging. Typical
stack hangs in apr_reslist_acquire:
#0 0xff0c8d3c in __lwp_park () from /lib/libc.so.1
No symbol table info available.
#1 0xff0c2b94 in cond_sleep_queue () from /lib/libc.so.1
No symbol table info available.
#2 0xff0c2d58 in cond_wait_queue () from /lib/libc.so.1
No symbol table info available.
#3 0xff0c32a0 in cond_wait () from /lib/libc.so.1
No symbol table info available.
#4 0xff0c32dc in pthread_cond_wait () from /lib/libc.so.1
No symbol table info available.
#5 0xff333d9c in apr_reslist_acquire (reslist=0x482758,
resource=0xfd37ba24) at
/shared/build/dev/httpd/sources/apr/trunk/r1004302/util-misc/apr_reslist.c:374
rv = 0
res = (apr_res_t *) 0xfd37ba7f
now = 1288826750667273
#6 0xfe24e784 in ap_proxy_acquire_connection (proxy_function=0xfe1e8618
"HTTP", conn=0xfd37ba24, worker=0x4dc8d8, s=0x3b5ef8)
at
/shared/build/dev/httpd/sources/httpd/trunk/r1030642/modules/proxy/proxy_util.c:2068
rv = -31135876
#7 0xfe1e598c in proxy_http_handler (r=0x4f63d8, worker=0x4dc8d8,
conf=0x4dc7d0, url=0x5429a6
"http://localhost:8529/modules/cgi/nph-102.pl", proxyname=0x0, proxyport=0)
at
/shared/build/dev/httpd/sources/httpd/trunk/r1030642/modules/proxy/mod_proxy_http.c:2054
status = 5202904
server_portstr = ":8541\000$¸ýJ\n", '\0' <repeats 12 times>,
"\001ý7»¬\000\000\000\030"
scheme = 0x543880 "http"
proxy_function = 0xfe1e8618 "HTTP"
backend = (proxy_conn_rec *) 0x0
is_ssl = 0
c = (conn_rec *) 0x4f2688
retry = 0
p = (apr_pool_t *) 0x4f6398
uri = (apr_uri_t *) 0x543850
#8 0xfe245c08 in proxy_run_scheme_handler (r=0x4f63d8, worker=0x4dc8d8,
conf=0x4dc7d0, url=0x5429a6
"http://localhost:8529/modules/cgi/nph-102.pl", proxyhost=0x0,
proxyport=0) at
/shared/build/dev/httpd/sources/httpd/trunk/r1030642/modules/proxy/mod_proxy.c:2451
pHook = (proxy_LINK_scheme_handler_t *) 0x0
n = 1
rv = 5202904
#9 0xfe24974c in proxy_handler (r=0x4f63d8) at
/shared/build/dev/httpd/sources/httpd/trunk/r1030642/modules/proxy/mod_proxy.c:1067
url = 0x5429a6 "http://localhost:8529/modules/cgi/nph-102.pl"
uri = 0x5429a6 "http://localhost:8529/modules/cgi/nph-102.pl"
scheme = 0x5437e8 "http"
sconf = (void *) 0x5429a0
conf = (proxy_server_conf *) 0x4dc7d0
proxies = (apr_array_header_t *) 0x4dc828
ents = (struct proxy_remote *) 0x195098
i = 0
access_status = 0
direct_connect = 0
maxfwd = 5202904
balancer = (proxy_balancer *) 0x0
worker = (proxy_worker *) 0x4dc8d8
attempts = 0
max_attempts = 0
list = (struct dirconn_entry *) 0x1952c0
#10 0x00044b10 in ap_run_handler (r=0x4f63d8) at
/shared/build/dev/httpd/sources/httpd/trunk/r1030642/server/config.c:164
pHook = (ap_LINK_handler_t *) 0x4dc828
n = 3
rv = 5202904
#11 0x00044fa0 in ap_invoke_handler (r=0x4f63d8) at
/shared/build/dev/httpd/sources/httpd/trunk/r1030642/server/config.c:406
handler = 0x4f63d8 ""
result = 5202904
old_handler = 0xfe252098 "proxy-server"
#12 0x00056cfc in ap_process_async_request (r=0x4f63d8) at
/shared/build/dev/httpd/sources/httpd/trunk/r1030642/modules/http/http_request.c:316
i = 502784
t_h = (const apr_array_header_t *) 0x2
t_elt = (const apr_table_entry_t *) 0xa
c = (conn_rec *) 0x4f2688
access_status = 0
#13 0x00056e60 in ap_process_request (r=0x4f63d8) at
/shared/build/dev/httpd/sources/httpd/trunk/r1030642/modules/http/http_request.c:361
bb = (apr_bucket_brigade *) 0xfd4c00e8
b = (apr_bucket *) 0x4f63d8
c = (conn_rec *) 0x4f2688
rv = 502784
...
The data for ap_reslist_acquire is:
#5 0xff333d9c in apr_reslist_acquire (reslist=0x482758,
resource=0xfd37ba24) at
/shared/build/dev/httpd/sources/apr/trunk/r1004302/util-misc/apr_reslist.c:374
374 apr_thread_cond_wait(reslist->avail, reslist->listlock);
(gdb) print *reslist
$1 = {pool = 0x482718, ntotal = 2, nidle = 0, min = 0, smax = 2, hmax =
2, ttl = 0, timeout = 0, constructor = 0xfe24d830 <connection_constructor>,
destructor = 0xfe24d8e0 <connection_destructor>, params = 0x4dc8d8,
avail_list = {next = 0x482780, prev = 0x482780}, free_list = {next =
0x482820, prev = 0x482820},
listlock = 0x4827a8, avail = 0x4827d8}
(gdb) print *resource
$2 = (void *) 0x0
(gdb) print reslist->avail
$3 = (apr_thread_cond_t *) 0x4827d8
(gdb) print *(reslist->avail)
$4 = {pool = 0x482718, cond = {__pthread_cond_flags =
{__pthread_cond_flag = "\000\003\001", __pthread_cond_type = 0,
__pthread_cond_magic = 17238},
__pthread_cond_data = 0}}
(gdb) print *(reslist->listlock)
$5 = {pool = 0x482718, mutex = {__pthread_mutex_flags =
{__pthread_mutex_flag1 = 4, __pthread_mutex_flag2 = 0 '\0',
__pthread_mutex_ceiling = 0 '\0',
__pthread_mutex_type = 0, __pthread_mutex_magic = 19800},
__pthread_mutex_lock = {__pthread_mutex_lock64 = {__pthread_mutex_pad =
"\000\000\000\000\000\000\000"},
__pthread_mutex_lock32 = {__pthread_ownerpid = 0,
__pthread_lockword = 0}, __pthread_mutex_owner64 = 0},
__pthread_mutex_data = 0}}
(gdb) print *reslist->pool
$6 = {parent = 0x3ea3b0, child = 0x490750, sibling = 0x480710, ref =
0x484728, cleanups = 0x482810, free_cleanups = 0x0, allocator = 0x80488,
subprocesses = 0x0,
abort_fn = 0x2ac90 <abort_on_oom>, user_data = 0x0, tag = 0xfe253b00
"proxy_worker_cp", active = 0x482700, self = 0x482700, self_first_avail
= 0x482758 "",
pre_cleanups = 0x0}
(gdb) print *reslist->avail_list->next
$7 = {freed = -133744383980414752, opaque = 0x4dc8d8, link = {next =
0x482780, prev = 0x482780}}
(gdb) print *reslist->free_list->next
$8 = {freed = 1288826743985216, opaque = 0x490790, link = {next =
0x482788, prev = 0x482788}}
(gdb) up
#6 0xfe24e784 in ap_proxy_acquire_connection (proxy_function=0xfe1e8618
"HTTP", conn=0xfd37ba24, worker=0x4dc8d8, s=0x3b5ef8)
at
/shared/build/dev/httpd/sources/httpd/trunk/r1030642/modules/proxy/proxy_util.c:2068
2068 rv = apr_reslist_acquire(worker->cp->res, (void **)conn);
(gdb) print *conn
$9 = (proxy_conn_rec *) 0x0
(gdb) print *worker
$10 = {name = 0x3bc6a0 "http://localhost:8529/", scheme = 0x3bc638
"http", hostname = 0x3bc650 "localhost", route = 0x0, redirect = 0x0, id
= 0, status = 1, lbfactor = 0,
lbset = 0, min = 0, smax = 2, hmax = 2, ttl = 0, retry = 60000000,
timeout = 0, acquire = 0, ping_timeout = 0, conn_timeout = 0,
recv_buffer_size = 0,
io_buffer_size = 8192, port = 8529, keepalive = 0 '\0', disablereuse
= 0 '\0', is_address_reusable = -1, cp = 0x3eab10, s = 0xfd4fa0e8,
opaque = 0x0, context = 0x0,
flush_packets = flush_off, flush_wait = 10000, flusher = 0x0, mutex =
0x3eab20, retry_set = 0, timeout_set = 0, acquire_set = 0,
ping_timeout_set = 0,
conn_timeout_set = 0, recv_buffer_size_set = 0, io_buffer_size_set =
0, keepalive_set = 0, disablereuse_set = 0, apr_hash = 513223906,
our_hash = 3592597298}
(gdb) print *s
$11 = {process = 0x845b0, next = 0x3b0b68, error_fname = 0x0, error_log
= 0x84560, log = {module_levels = 0x0, level = 7}, module_config = 0x3ba0b0,
lookup_defaults = 0x4dc9f8, defn_name = 0x283400
"/shared/build/dev/httpd/test/httpd/trunk/r1030642/solaris10.sparc-trunk-all-shared/worker/t/conf/extra.conf",
defn_line_number = 263, is_virtual = 1 '\001', port = 8541,
server_scheme = 0x0, server_admin = 0x3a8018 "you@example.com",
server_hostname = 0x3ba860 "localhost",
addrs = 0x3ba5f8, timeout = 300000000, keep_alive_timeout = 5000000,
keep_alive_max = 100, keep_alive = 1, names = 0x3b5f70, wild_names =
0x3b5f98, path = 0x0,
pathlen = 0, limit_req_line = 8190, limit_req_fieldsize = 8190,
limit_req_fields = 100, context = 0x0}
I had to kill the processes.
Event was not hanging and not very slow. But note, that the test
includes explicit MPM configs with small number of threads and processes
for worker, but default settings for event!
During the final shutdown, worker and event crash. The data for worker
(couldn't get reasonable data for event):
#0 0xff31e498 in apr_brigade_cleanup (data=0x5050a8) at
/shared/build/dev/httpd/sources/apr/trunk/r1004302/buckets/apr_brigade.c:46
No locals.
#1 0xfe1e3f9c in ap_proxy_http_process_response (p=0x4f6398,
r=0x4f63d8, backend=0x486768, worker=0x3ea810, conf=0x194fe8,
server_portstr=0xfd27ba60 ":8529")
at
/shared/build/dev/httpd/sources/httpd/trunk/r1030642/modules/proxy/mod_proxy_http.c:1931
readbytes = 8
rv = 5263528
mode = APR_NONBLOCK_READ
finish = 1
c = (conn_rec *) 0x4f2680
buffer = "\000\nntent-Type\000
text/html\000\n7ac0\"\000\n15:52:51 GMT\000\ndev OpenSSL/0.9.8o
DAV/2\000\n", '\0' <repeats 409 times>, "\v\232\210", '\0' <repeats 12
times>, "þ%\021Ð\000\000\004)\000\000\000A\000\000\000\a", '\0' <repeats
17 times>, "Thu Nov 04 00:41:34.563204 2010] [proxy:debug] [pid
19304:tid 4] mod_proxy.c(10"...
buf = 0x5050a8 ""
keepchar = -84 '¬'
rp = (request_rec *) 0x5050c8
e = (apr_bucket *) 0x5050a8
bb = (apr_bucket_brigade *) 0x5050a8
tmp_bb = (apr_bucket_brigade *) 0x508018
pass_bb = (apr_bucket_brigade *) 0x5050b8
len = 15
backasswards = 0
interim_response = 0
pread_len = 0
save_table = (apr_table_t *) 0x508030
backend_broke = 0
hop_by_hop_hdrs = {0xfe1e7fd0 "Keep-Alive", 0xfe1e7fe0
"Proxy-Authenticate", 0xfe1e7ff8 "TE", 0xfe1e8000 "Trailer", 0xfe1e8008
"Upgrade", 0x0}
i = 5263532
te = 0x0
original_status = 200
proxy_status = 200
original_status_line = 0x0
origin = (conn_rec *) 0x48ea38
old_timeout = 0
dconf = (proxy_dir_conf *) 0x195c28
do_100_continue = 0
#2 0xfe1e79ac in proxy_http_handler (r=0x4f63d8, worker=0x3ea810,
conf=0x194fe8, url=0x5025be
"http://localhost:8529/modules/rewrite/foo%20bar.html", proxyname=0x0,
proxyport=0) at
/shared/build/dev/httpd/sources/httpd/trunk/r1030642/modules/proxy/mod_proxy_http.c:2133
temp_dir = 0x4faf90 ""
template = 0x4f63d8 ""
status = 0
server_portstr =
":8529\000c\230\000\000\000\000\000\000\000\000\000Oc\230\000P\023¨ý'»¬\000\000\000\030"
scheme = 0x0
proxy_function = 0xfe1e8618 "HTTP"
backend = (proxy_conn_rec *) 0x486768
is_ssl = 0
c = (conn_rec *) 0x4f2680
retry = 0
p = (apr_pool_t *) 0x4f6398
uri = (apr_uri_t *) 0x503f60
#3 0xfe245c08 in proxy_run_scheme_handler (r=0x4f63d8, worker=0x3ea810,
conf=0x194fe8, url=0x5025be
"http://localhost:8529/modules/rewrite/foo%20bar.html",
proxyhost=0x0, proxyport=0) at
/shared/build/dev/httpd/sources/httpd/trunk/r1030642/modules/proxy/mod_proxy.c:2451
pHook = (proxy_LINK_scheme_handler_t *) 0x0
n = 1
rv = 0
#4 0xfe24974c in proxy_handler (r=0x4f63d8) at
/shared/build/dev/httpd/sources/httpd/trunk/r1030642/modules/proxy/mod_proxy.c:1067
url = 0x5025be
"http://localhost:8529/modules/rewrite/foo%20bar.html"
uri = 0x5025be
"http://localhost:8529/modules/rewrite/foo%20bar.html"
scheme = 0x503ec8 "http"
sconf = (void *) 0x5025b8
conf = (proxy_server_conf *) 0x194fe8
proxies = (apr_array_header_t *) 0x195080
ents = (struct proxy_remote *) 0x195098
i = 0
access_status = 0
---Type <return> to continue, or q <return> to quit---
direct_connect = 0
maxfwd = 5202904
balancer = (proxy_balancer *) 0x0
worker = (proxy_worker *) 0x3ea810
attempts = 0
max_attempts = 0
list = (struct dirconn_entry *) 0x1952c0
#5 0x00044b10 in ap_run_handler (r=0x4f63d8) at
/shared/build/dev/httpd/sources/httpd/trunk/r1030642/server/config.c:164
pHook = (ap_LINK_handler_t *) 0x195080
n = 3
rv = 5202904
#6 0x00044fa0 in ap_invoke_handler (r=0x4f63d8) at
/shared/build/dev/httpd/sources/httpd/trunk/r1030642/server/config.c:406
handler = 0x4f63d8 ""
result = 5202904
old_handler = 0xfdaad9f8 "proxy-server"
#7 0x00056cfc in ap_process_async_request (r=0x4f63d8) at
/shared/build/dev/httpd/sources/httpd/trunk/r1030642/modules/http/http_request.c:316
i = 502784
t_h = (const apr_array_header_t *) 0x2
t_elt = (const apr_table_entry_t *) 0xa
c = (conn_rec *) 0x4f2680
access_status = 0
#8 0x00056e60 in ap_process_request (r=0x4f63d8) at
/shared/build/dev/httpd/sources/httpd/trunk/r1030642/modules/http/http_request.c:361
bb = (apr_bucket_brigade *) 0xfd4c01d0
b = (apr_bucket *) 0x4f63d8
c = (conn_rec *) 0x4f2680
rv = 502784
#9 0x00053698 in ap_process_http_connection (c=0x4f2680) at
/shared/build/dev/httpd/sources/httpd/trunk/r1030642/modules/http/http_core.c:186
No locals.
#10 0x0004d58c in ap_run_process_connection (c=0x4f2680) at
/shared/build/dev/httpd/sources/httpd/trunk/r1030642/server/connection.c:41
pHook = (ap_LINK_process_connection_t *) 0x5
n = 4
rv = 5187200
#11 0xfdd84520 in worker_thread (thd=0x3ead10, dummy=0x1) at
/shared/build/dev/httpd/sources/httpd/trunk/r1030642/server/mpm/worker/worker.c:561
process_slot = 0
thread_slot = 1
csd = (apr_socket_t *) 0x4f23c8
bucket_alloc = (apr_bucket_alloc_t *) 0x4f4390
last_ptrans = (apr_pool_t *) 0x0
ptrans = (apr_pool_t *) 0x4f2388
rv = 5187192
is_idle = 0
#12 0xff34abc0 in dummy_worker (opaque=0x3ead10) at
/shared/build/dev/httpd/sources/apr/trunk/r1004302/threadproc/unix/thread.c:142
Regards,
Rainer