You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by si...@apache.org on 2007/06/05 06:27:59 UTC

svn commit: r544370 - /spamassassin/branches/3.2/lib/Mail/SpamAssassin/SpamdForkScaling.pm

Author: sidney
Date: Mon Jun  4 21:27:58 2007
New Revision: 544370

URL: http://svn.apache.org/viewvc?view=rev&rev=544370
Log:
bug 5422: spamd error, prefork: ordered child N to accept...

Modified:
    spamassassin/branches/3.2/lib/Mail/SpamAssassin/SpamdForkScaling.pm

Modified: spamassassin/branches/3.2/lib/Mail/SpamAssassin/SpamdForkScaling.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.2/lib/Mail/SpamAssassin/SpamdForkScaling.pm?view=diff&rev=544370&r1=544369&r2=544370
==============================================================================
--- spamassassin/branches/3.2/lib/Mail/SpamAssassin/SpamdForkScaling.pm (original)
+++ spamassassin/branches/3.2/lib/Mail/SpamAssassin/SpamdForkScaling.pm Mon Jun  4 21:27:58 2007
@@ -36,7 +36,7 @@
 
 @PFSTATE_VARS = qw(
   PFSTATE_ERROR PFSTATE_STARTING PFSTATE_IDLE PFSTATE_BUSY PFSTATE_KILLED
-  PFORDER_ACCEPT 
+  PFORDER_ACCEPT PFSTATE_GOT_SIGCHLD
 );
 
 %EXPORT_TAGS = (
@@ -49,6 +49,7 @@
 use constant PFSTATE_IDLE        => 1;
 use constant PFSTATE_BUSY        => 2;
 use constant PFSTATE_KILLED      => 3;
+use constant PFSTATE_GOT_SIGCHLD => 4;
 
 use constant PFORDER_ACCEPT      => 10;
 
@@ -127,13 +128,28 @@
   my ($self, $pid) = @_;
 
   dbg("prefork: child $pid: just exited");
-  delete $self->{kids}->{$pid};
+
+  # defer removal from the list until after return from the signal
+  # handler; it seems that we may be corrupting the list structure
+  # by deleting the {kids} hash entry from a sig handler. (bug 5422)
+  $self->set_child_state ($pid, PFSTATE_GOT_SIGCHLD);
 
   # note this for the select()-caller's benefit
   $self->{child_just_exited} = 1;
+}
+ 
+sub post_sigchld_cleanup {
+  my ($self) = @_;
+  my @pids = grep { $self->{kids}->{$_} == PFSTATE_GOT_SIGCHLD }
+        keys %{$self->{kids}};
+  return unless @pids;
 
-  # remove the child from the backchannel list, too
-  $self->{backchannel}->delete_socket_for_child($pid);
+  foreach my $pid (@pids) {
+    delete $self->{kids}->{$pid};       # remove from list
+
+    # remove the child from the backchannel list, too
+    $self->{backchannel}->delete_socket_for_child($pid);
+  }
 
   # ensure we recompute, so that we don't try to tell that child to
   # accept a request, only to find that it's died in the meantime.
@@ -180,7 +196,8 @@
   # I keep misreading this -- so: this says, if the child is starting, or is
   # dying, or it has an entry in the {kids} hash, then allow the state to be
   # set.  otherwise the update can be ignored.
-  if ($state == PFSTATE_STARTING || $state == PFSTATE_KILLED || exists $self->{kids}->{$pid})
+  if ($state == PFSTATE_STARTING || $state == PFSTATE_KILLED ||
+        $state == PFSTATE_GOT_SIGCHLD || exists $self->{kids}->{$pid})
   {
     $self->{kids}->{$pid} = $state;
     dbg("prefork: child $pid: entering state $state");
@@ -232,6 +249,9 @@
     $self->vec_all(\$rin, $self->{server_fileno}, 0);
   }
 
+  # clean up any fresh zombies before we select()
+  $self->post_sigchld_cleanup();
+
   my ($rout, $eout, $nfound, $timeleft, $selerr);
 
   # use alarm to back up select()'s built-in alarm, to debug Theo's bug.
@@ -249,6 +269,9 @@
 
   });
 
+  # in case any kids exited during select()
+  $self->post_sigchld_cleanup();
+
   # bug 4696: under load, the process can go for such a long time without
   # being context-switched in, that when it does return the alarm() fires
   # before the select() timeout does.   Treat this as a select() timeout
@@ -756,6 +779,9 @@
     }
     elsif ($k == PFSTATE_KILLED) {
       $statestr .= 'K';
+    }
+    elsif ($k == PFSTATE_GOT_SIGCHLD) {
+      $statestr .= 'Z';
     }
     elsif ($k == PFSTATE_ERROR) {
       $statestr .= 'E';