You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2007/05/03 14:03:48 UTC
svn commit: r534809 -
/spamassassin/trunk/lib/Mail/SpamAssassin/SpamdForkScaling.pm
Author: jm
Date: Thu May 3 05:03:48 2007
New Revision: 534809
URL: http://svn.apache.org/viewvc?view=rev&rev=534809
Log:
bug 5422: deleting hash entries from the SIGCHLD signal handler is unsafe, causes corruption of the data structure, and results in 'prefork: ordered child N to accept, but they reported state '1', killing rogue' errors. fix
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/SpamdForkScaling.pm
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/SpamdForkScaling.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/SpamdForkScaling.pm?view=diff&rev=534809&r1=534808&r2=534809
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/SpamdForkScaling.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/SpamdForkScaling.pm Thu May 3 05:03:48 2007
@@ -36,7 +36,7 @@
@PFSTATE_VARS = qw(
PFSTATE_ERROR PFSTATE_STARTING PFSTATE_IDLE PFSTATE_BUSY PFSTATE_KILLED
- PFORDER_ACCEPT
+ PFORDER_ACCEPT PFSTATE_GOT_SIGCHLD
);
%EXPORT_TAGS = (
@@ -49,6 +49,7 @@
use constant PFSTATE_IDLE => 1;
use constant PFSTATE_BUSY => 2;
use constant PFSTATE_KILLED => 3;
+use constant PFSTATE_GOT_SIGCHLD => 4;
use constant PFORDER_ACCEPT => 10;
@@ -127,13 +128,28 @@
my ($self, $pid) = @_;
dbg("prefork: child $pid: just exited");
- delete $self->{kids}->{$pid};
+
+ # defer removal from the list until after return from the signal
+ # handler; it seems that we may be corrupting the list structure
+ # by deleting the {kids} hash entry from a sig handler. (bug 5422)
+ $self->set_child_state ($pid, PFSTATE_GOT_SIGCHLD);
# note this for the select()-caller's benefit
$self->{child_just_exited} = 1;
+}
+
+sub post_sigchld_cleanup {
+ my ($self) = @_;
+ my @pids = grep { $self->{kids}->{$_} == PFSTATE_GOT_SIGCHLD }
+ keys %{$self->{kids}};
+ return unless @pids;
- # remove the child from the backchannel list, too
- $self->{backchannel}->delete_socket_for_child($pid);
+ foreach my $pid (@pids) {
+ delete $self->{kids}->{$pid}; # remove from list
+
+ # remove the child from the backchannel list, too
+ $self->{backchannel}->delete_socket_for_child($pid);
+ }
# ensure we recompute, so that we don't try to tell that child to
# accept a request, only to find that it's died in the meantime.
@@ -180,7 +196,8 @@
# I keep misreading this -- so: this says, if the child is starting, or is
# dying, or it has an entry in the {kids} hash, then allow the state to be
# set. otherwise the update can be ignored.
- if ($state == PFSTATE_STARTING || $state == PFSTATE_KILLED || exists $self->{kids}->{$pid})
+ if ($state == PFSTATE_STARTING || $state == PFSTATE_KILLED ||
+ $state == PFSTATE_GOT_SIGCHLD || exists $self->{kids}->{$pid})
{
$self->{kids}->{$pid} = $state;
dbg("prefork: child $pid: entering state $state");
@@ -232,6 +249,9 @@
$self->vec_all(\$rin, $self->{server_fileno}, 0);
}
+ # clean up any fresh zombies before we select()
+ $self->post_sigchld_cleanup();
+
my ($rout, $eout, $nfound, $timeleft, $selerr);
# use alarm to back up select()'s built-in alarm, to debug Theo's bug.
@@ -249,6 +269,9 @@
});
+ # in case any kids exited during select()
+ $self->post_sigchld_cleanup();
+
# bug 4696: under load, the process can go for such a long time without
# being context-switched in, that when it does return the alarm() fires
# before the select() timeout does. Treat this as a select() timeout
@@ -756,6 +779,9 @@
}
elsif ($k == PFSTATE_KILLED) {
$statestr .= 'K';
+ }
+ elsif ($k == PFSTATE_GOT_SIGCHLD) {
+ $statestr .= 'Z';
}
elsif ($k == PFSTATE_ERROR) {
$statestr .= 'E';