You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by do...@apache.org on 2007/09/09 06:46:39 UTC
svn commit: r573948 - /spamassassin/trunk/masses/mass-check
Author: dos
Date: Sat Sep 8 21:46:39 2007
New Revision: 573948
URL: http://svn.apache.org/viewvc?rev=573948&view=rev
Log:
add --cs_max_retries mass-check option to limit the number of times a message
will be retried; keeps the mass-check server from dying if the archive iterator
hits an error with a message (e.g. it has been removed from the corpus);
use --noisy to find out which messages are being skipped; no change in
functionality if you don't use the --cs_max_retries option
Modified:
spamassassin/trunk/masses/mass-check
Modified: spamassassin/trunk/masses/mass-check
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/mass-check?rev=573948&r1=573947&r2=573948&view=diff
==============================================================================
--- spamassassin/trunk/masses/mass-check (original)
+++ spamassassin/trunk/masses/mass-check Sat Sep 8 21:46:39 2007
@@ -66,6 +66,11 @@
server, only ask for paths to the messages and not the
messages themselves. useful when the client and server
have the same paths to the corpus data.
+ --cs_max_retries N
+ only used in server mode. set the maximum number of times
+ to retry having a client scan the message. you need to use
+ this option if it's possible that messages will be removed
+ from your corpus while a scan is in progress.
log options
-o write all logs to stdout
@@ -126,8 +131,8 @@
$opt_logmem $opt_after $opt_before $opt_rewrite $opt_deencap
$opt_learn $opt_reuse $opt_lint $opt_cache $opt_noisy $opt_cf
$total_messages $statusevery $opt_cachedir $opt_scanprob
- $opt_client $opt_cs_max $opt_cs_timeout $opt_cs_paths_only
- $opt_server %postdata %real $svn_revision
+ $opt_client $opt_cs_max $opt_cs_max_retries $opt_cs_timeout
+ $opt_cs_paths_only $opt_server %postdata %real $svn_revision
$tmpfd %reuse %orig_conf %reuse_conf $reuse_rules_loaded_p);
use FindBin;
@@ -172,8 +177,8 @@
"rules=s", "restart=i", "loguris",
"deencap=s", "logmem", "learn=i", "reuse", "lint", "cache",
"cachedir=s", "noisy", "scanprob=f",
- "server=s", "cs_max=i", "cs_timeout=i", "cs_paths_only",
- "client=s",
+ "server=s", "cs_max=i", "cs_max_retries=i", "cs_timeout=i",
+ "cs_paths_only", "client=s",
"before=s" => \&deal_with_before_after,
"after=s" => \&deal_with_before_after,
'cf=s' => \@{$opt_cf},
@@ -1087,7 +1092,7 @@
# Returns: scalar path to gzip file
#
sub generate_messages {
- my($msgs, $timestamps, $msgsout, $paths_only) = @_;
+ my($msgs, $timestamps, $msgsout, $paths_only, $retries) = @_;
# Hold the message numbers we'll be sending out
my @tosend = ();
@@ -1115,10 +1120,37 @@
delete $timestamps->{$_};
}
+ # skip any messages that we've already retried enough
+ if ($opt_cs_max_retries) {
+ my @goodtosend;
+ for (my $i = 0; $i < @tosend; $i++) {
+ if (exists $retries->{$tosend[$i]} && $retries->{$tosend[$i]} == $opt_cs_max_retries) {
+ if ($opt_noisy) {
+ my $data = $msgsout->{$tosend[$i]}->{'data'};
+ my $path = ($iter->_run_message($data))[3];
+ print 'status: skipping '.
+ (defined $path ? $path : '(unknown message path)')." after $opt_cs_max_retries retries\n";
+ }
+ delete $msgsout->{$tosend[$i]};
+ } else {
+ push @goodtosend, $tosend[$i];
+ }
+ }
+ @tosend = @goodtosend;
+ @goodtosend = ();
+ }
+
# Ok, we have enough messages so we can stop now.
last if (@tosend == $msgs);
}
+ # keep track of how many times we've retried a message
+ if ($opt_cs_max_retries) {
+ foreach (@tosend) {
+ $retries->{$_}++;
+ }
+ }
+
# if we still have the temp file with the input messages open, we'll fill up
# our message output queue with messages from there.
if ($tmpfd) {
@@ -1160,15 +1192,23 @@
# 1- server message number in text format
# 2- server index string, binary packed format
# 3- message content -- unless paths_only
- send_line($gzfd, $num) || die "mass-check: error when writing to gz temp file\n";
-
my $data = $msgsout->{$num}->{'data'};
- send_line($gzfd, $data) || die "mass-check: error when writing to gz temp file\n";
-
if (!$paths_only) {
my $msg = ($iter->_run_message($data))[4];
+ unless ($msg) {
+ # skip the message on error, the retry limit code will take care of not
+ # getting stuck in a loop trying to send this message
+ warn "mass-check: error getting message to send, skipping message for now\n";
+ die "mass-check: cannot continue without --cs_max_retries N option to handle message errors\n" unless $opt_cs_max_retries;
+ next;
+ }
+ send_line($gzfd, $num) || die "mass-check: error when writing to gz temp file\n";
+ send_line($gzfd, $data) || die "mass-check: error when writing to gz temp file\n";
send_line($gzfd, join('', @{$msg})) ||
die "mass-check: error when writing to gz temp file\n";
+ } else {
+ send_line($gzfd, $num) || die "mass-check: error when writing to gz temp file\n";
+ send_line($gzfd, $data) || die "mass-check: error when writing to gz temp file\n";
}
}
@@ -1499,6 +1539,7 @@
# Set up our "what messages have been sent out" hashes
my $timestamps = {};
my $msgsout = { 'curnum' => 0 };
+ my $retries = {};
# Generate an IO::Select object and put the server socket on the queue
my $select = IO::Select->new( $serv_socket );
@@ -1548,7 +1589,7 @@
print "client requested ".$postdata->{'max_messages'}." messages\n";
}
- $messages = generate_messages($msgnum, $timestamps, $msgsout, $postdata->{'paths_only'});
+ $messages = generate_messages($msgnum, $timestamps, $msgsout, $postdata->{'paths_only'}, $retries);
}
# $messages will contain the path to the gzip file if there are