You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2007/04/03 20:34:22 UTC

svn commit: r525237 - in /spamassassin/trunk/lib/Mail/SpamAssassin: SpamdForkScaling.pm SubProcBackChannel.pm

Author: jm
Date: Tue Apr  3 11:34:21 2007
New Revision: 525237

URL: http://svn.apache.org/viewvc?view=rev&rev=525237
Log:
bug 5313: fix spamd 'prefork: select returned -1, recovering: Bad file descriptor' error.  do this by ensuring FDs are fully cleaned up (and removed from the selector bitvec) if a child is killed, and a few other related cleanups, to fully clarify the code.  also, fix a case where write errors from child to parent were not being caught or logged, due to incorrect use of write() instead of die().  finally, add test instrumentation code, allowing simulation of certain reported system errors.  Note: this checkin also contains additional debugging log messages

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/SpamdForkScaling.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/SubProcBackChannel.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/SpamdForkScaling.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/SpamdForkScaling.pm?view=diff&rev=525237&r1=525236&r2=525237
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/SpamdForkScaling.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/SpamdForkScaling.pm Tue Apr  3 11:34:21 2007
@@ -54,6 +54,23 @@
 
 ###########################################################################
 
+# change to 1 to enable the below test instrumentation points
+use constant SUPPORT_TEST_INSTRUMENTATION => 0;
+
+# test instrumentation point: simulate random child failures in 1 in
+# every N lookups
+our $TEST_MODE_CAUSE_RANDOM_KID_FAILURES = 0;
+
+# test instrumentation point: simulate child->parent and parent->child
+# write failures (needing retries) once in every N syswrite()s
+our $TEST_MODE_CAUSE_RANDOM_WRITE_RETRIES = 0;
+
+# test instrumentation point: simulate ping failures (for unspecified
+# reasons) once in every N pings
+our $TEST_MODE_CAUSE_RANDOM_PING_FAILURES = 0;
+
+###########################################################################
+
 # we use the following protocol between the master and child processes to
 # control when they accept/who accepts: server tells a child to accept with a
 # PF_ACCEPT_ORDER, child responds with "B$pid\n" when it's busy, and "I$pid\n"
@@ -109,6 +126,8 @@
 sub child_exited {
   my ($self, $pid) = @_;
 
+warn "JMD bug5313 child_exited $pid";
+  dbg("prefork: child $pid: just exited");
   delete $self->{kids}->{$pid};
 
   # note this for the select()-caller's benefit
@@ -143,7 +162,14 @@
   kill 'INT' => $pid
     or warn "prefork: kill of failed child $pid failed: $!\n";
 
+warn "JMD bug5313 deleting sock: ".unpack("b*", ${$self->{backchannel}->{selector}});
   $self->{backchannel}->delete_socket_for_child($pid);
+
+  if (defined $sock && defined $sock->fileno()) {
+    $self->{backchannel}->remove_from_selector($sock);
+  }
+
+warn "JMD bug5313 deleting sock: ".unpack("b*", ${$self->{backchannel}->{selector}});
   if ($sock) {
     $sock->close;
   }
@@ -219,7 +245,9 @@
 
   $timer->run(sub {
 
-    $self->{child_just_exited} = 0;
+warn "JMD bug5313 child_just_exited = 0";
+    # right before select() syscall, but after alarm(), eval scope, etc.
+    $self->{child_just_exited} = 0;     
     ($nfound, $timeleft) = select($rout=$rin, undef, $eout=$rin, $tout);
     $selerr = $!;
 
@@ -340,15 +368,23 @@
 
   $self->{server_last_ping} = $now;
 
+  keys %{$self->{backchannel}->{kids}};     # reset each() iterator
   my ($sock, $kid);
   while (($kid, $sock) = each %{$self->{backchannel}->{kids}}) {
     # if the file handle is still defined ping the child
     # bug 4852: if not, we've run into a race condition with the child's
     # SIGCHLD handler... try killing again just in case something else happened
-    if (defined $sock && defined $sock->fileno) {
+
+    if (SUPPORT_TEST_INSTRUMENTATION && $TEST_MODE_CAUSE_RANDOM_PING_FAILURES &&
+              rand $TEST_MODE_CAUSE_RANDOM_PING_FAILURES < 1)
+    {
+      warn "prefork: TEST_MODE_CAUSE_RANDOM_PING_FAILURES simulating ping failure";
+    }
+    elsif (defined $sock && defined $sock->fileno) {
       $self->syswrite_with_retry($sock, PF_PING_ORDER, $kid, 3) and next;
       warn "prefork: write of ping failed to $kid fd=".$sock->fileno.": ".$!;
-    } else {
+    }
+    else {
       warn "prefork: cannot ping $kid, file handle not defined, child likely ".
 	   "to still be processing SIGCHLD handler after killing itself\n";
     }
@@ -372,7 +408,8 @@
     # stop it being select'd
     my $fno = $sock->fileno;
     if (defined $fno) {
-      vec(${$self->{backchannel}->{selector}}, $fno, 1) = 0;
+      $self->{backchannel}->remove_from_selector($sock);
+warn "JMD bug5313 $sock/$fno removed from selector";
       $sock->close();
     }
 
@@ -409,6 +446,13 @@
   if (defined $kid)
   {
     my $sock = $self->{backchannel}->get_socket_for_child($kid);
+
+    if (SUPPORT_TEST_INSTRUMENTATION && $TEST_MODE_CAUSE_RANDOM_KID_FAILURES) {
+      if (rand $TEST_MODE_CAUSE_RANDOM_KID_FAILURES < 1) {
+        $sock = undef; warn "prefork: TEST_MODE_CAUSE_RANDOM_KID_FAILURES simulating no socket for kid $kid";
+      }
+    }
+
     if (!$sock)
     {
       # this should not happen, but if it does, trap it here
@@ -506,7 +550,7 @@
   my ($self, $str) = @_;
   my $sock = $self->{backchannel}->get_parent_socket();
   $self->syswrite_with_retry($sock, $str, 'parent')
-        or write "syswrite() to parent failed: $!";
+        or die "syswrite() to parent failed: $!";
 }
 
 sub wait_for_orders {
@@ -639,7 +683,16 @@
     }
   }
 
-  my $nbytes = $sock->syswrite($buf);
+  my $nbytes;
+  if (SUPPORT_TEST_INSTRUMENTATION && $TEST_MODE_CAUSE_RANDOM_WRITE_RETRIES &&
+            rand $TEST_MODE_CAUSE_RANDOM_WRITE_RETRIES < 1)
+  {
+    warn "prefork: TEST_MODE_CAUSE_RANDOM_WRITE_RETRIES simulating write failure";
+    $nbytes = undef; $! = &Errno::EAGAIN;
+  }
+  else {
+    $nbytes = $sock->syswrite($buf);
+  }
 
   if (!defined $nbytes) {
     unless ((exists &Errno::EAGAIN && $! == &Errno::EAGAIN)

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/SubProcBackChannel.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/SubProcBackChannel.pm?view=diff&rev=525237&r1=525236&r2=525237
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/SubProcBackChannel.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/SubProcBackChannel.pm Tue Apr  3 11:34:21 2007
@@ -99,9 +99,22 @@
 
 sub add_to_selector {
   my ($self, $fh) = @_;
+  if (!defined $fh) {
+    warn "undef fh in add_to_selector"; return;
+  }
   my $fno = fileno($fh);
   $self->{fileno_to_fh}->{$fno} = $fh;
   vec (${$self->{selector}}, $fno, 1) = 1;
+}
+
+sub remove_from_selector {
+  my ($self, $fh) = @_;
+  if (!defined $fh) {
+    warn "undef fh in remove_from_selector"; return;
+  }
+  my $fno = fileno($fh);
+  delete $self->{fileno_to_fh}->{$fno};
+  vec (${$self->{selector}}, $fno, 1) = 0;
 }
 
 sub select_vec_to_fh_list {