You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by do...@apache.org on 2007/09/09 06:46:39 UTC

svn commit: r573948 - /spamassassin/trunk/masses/mass-check

Author: dos
Date: Sat Sep  8 21:46:39 2007
New Revision: 573948

URL: http://svn.apache.org/viewvc?rev=573948&view=rev
Log:
Add a --cs_max_retries option to mass-check that limits the number of times a
message will be retried.  This keeps the mass-check server from dying when the
archive iterator hits an error with a message (for example, when the message
has been removed from the corpus).  Use --noisy to find out which messages are
being skipped.  Behaviour is unchanged if --cs_max_retries is not used.

Modified:
    spamassassin/trunk/masses/mass-check

Modified: spamassassin/trunk/masses/mass-check
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/mass-check?rev=573948&r1=573947&r2=573948&view=diff
==============================================================================
--- spamassassin/trunk/masses/mass-check (original)
+++ spamassassin/trunk/masses/mass-check Sat Sep  8 21:46:39 2007
@@ -66,6 +66,11 @@
 		server, only ask for paths to the messages and not the
 		messages themselves.  useful when the client and server
 		have the same paths to the corpus data.
+  --cs_max_retries N
+		only used in server mode.  set the maximum number of times
+		to retry having a client scan the message.  you need to use
+		this option if it's possible that messages will be removed
+		from your corpus while a scan is in progress.
 
   log options
   -o            write all logs to stdout
@@ -126,8 +131,8 @@
 	    $opt_logmem $opt_after $opt_before $opt_rewrite $opt_deencap
 	    $opt_learn $opt_reuse $opt_lint $opt_cache $opt_noisy $opt_cf
 	    $total_messages $statusevery $opt_cachedir $opt_scanprob
-	    $opt_client $opt_cs_max $opt_cs_timeout $opt_cs_paths_only
-	    $opt_server %postdata %real $svn_revision
+	    $opt_client $opt_cs_max $opt_cs_max_retries $opt_cs_timeout
+	    $opt_cs_paths_only $opt_server %postdata %real $svn_revision
 	    $tmpfd %reuse %orig_conf %reuse_conf $reuse_rules_loaded_p);
 
 use FindBin;
@@ -172,8 +177,8 @@
 	   "rules=s", "restart=i", "loguris",
 	   "deencap=s", "logmem", "learn=i", "reuse", "lint", "cache",
            "cachedir=s", "noisy", "scanprob=f",
-	   "server=s", "cs_max=i", "cs_timeout=i", "cs_paths_only",
-	   "client=s",
+	   "server=s", "cs_max=i", "cs_max_retries=i", "cs_timeout=i",
+	   "cs_paths_only", "client=s",
 	   "before=s" => \&deal_with_before_after,
 	   "after=s" => \&deal_with_before_after,
            'cf=s' => \@{$opt_cf},
@@ -1087,7 +1092,7 @@
 # Returns: scalar path to gzip file
 #
 sub generate_messages {
-  my($msgs, $timestamps, $msgsout, $paths_only) = @_;
+  my($msgs, $timestamps, $msgsout, $paths_only, $retries) = @_;
 
   # Hold the message numbers we'll be sending out
   my @tosend = ();
@@ -1115,10 +1120,37 @@
       delete $timestamps->{$_};
     }
 
+    # skip any messages that we've already retried enough
+    if ($opt_cs_max_retries) {
+      my @goodtosend;
+      for (my $i = 0; $i < @tosend; $i++) {
+        if (exists $retries->{$tosend[$i]} && $retries->{$tosend[$i]} == $opt_cs_max_retries) {
+          if ($opt_noisy) {
+            my $data = $msgsout->{$tosend[$i]}->{'data'};
+            my $path = ($iter->_run_message($data))[3];
+            print 'status: skipping '.
+              (defined $path ? $path : '(unknown message path)')." after $opt_cs_max_retries retries\n";
+          }
+          delete $msgsout->{$tosend[$i]};
+        } else {
+          push @goodtosend, $tosend[$i];
+        }
+      }
+      @tosend = @goodtosend;
+      @goodtosend = ();
+    }
+
     # Ok, we have enough messages so we can stop now.
     last if (@tosend == $msgs);
   }
 
+  # keep track of how many times we've retried a message
+  if ($opt_cs_max_retries) {
+    foreach (@tosend) {
+      $retries->{$_}++;
+    }
+  }
+
   # if we still have the temp file with the input messages open, we'll fill up
   # our message output queue with messages from there.
   if ($tmpfd) {
@@ -1160,15 +1192,23 @@
     # 1- server message number in text format
     # 2- server index string, binary packed format
     # 3- message content -- unless paths_only
-    send_line($gzfd, $num) || die "mass-check: error when writing to gz temp file\n";
-
     my $data = $msgsout->{$num}->{'data'};
-    send_line($gzfd, $data) || die "mass-check: error when writing to gz temp file\n";
-
     if (!$paths_only) {
       my $msg = ($iter->_run_message($data))[4];
+      unless ($msg) {
+        # skip the message on error, the retry limit code will take care of not
+        # getting stuck in a loop trying to send this message
+        warn "mass-check: error getting message to send, skipping message for now\n";
+        die "mass-check: cannot continue without --cs_max_retries N option to handle message errors\n" unless $opt_cs_max_retries;
+        next;
+      }
+      send_line($gzfd, $num) || die "mass-check: error when writing to gz temp file\n";
+      send_line($gzfd, $data) || die "mass-check: error when writing to gz temp file\n";
       send_line($gzfd, join('', @{$msg})) ||
         die "mass-check: error when writing to gz temp file\n";
+    } else {
+      send_line($gzfd, $num) || die "mass-check: error when writing to gz temp file\n";
+      send_line($gzfd, $data) || die "mass-check: error when writing to gz temp file\n";
     }
   }
 
@@ -1499,6 +1539,7 @@
   # Set up our "what messages have been sent out" hashes
   my $timestamps = {};
   my $msgsout = { 'curnum' => 0 };
+  my $retries = {};
 
   # Generate an IO::Select object and put the server socket on the queue
   my $select = IO::Select->new( $serv_socket );
@@ -1548,7 +1589,7 @@
 	      print "client requested ".$postdata->{'max_messages'}." messages\n";
 	    }
 
-	    $messages = generate_messages($msgnum, $timestamps, $msgsout, $postdata->{'paths_only'});
+	    $messages = generate_messages($msgnum, $timestamps, $msgsout, $postdata->{'paths_only'}, $retries);
 	  }
 
           # $messages will contain the path to the gzip file if there are