You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by qu...@apache.org on 2004/05/09 22:17:05 UTC
svn commit: rev 10569 - incubator/spamassassin/trunk/masses/rule-qa
Author: quinlan
Date: Sun May 9 13:17:04 2004
New Revision: 10569
Added:
incubator/spamassassin/trunk/masses/rule-qa/corpus.example
Modified:
incubator/spamassassin/trunk/masses/rule-qa/README.nightly
incubator/spamassassin/trunk/masses/rule-qa/corpus-hourly
incubator/spamassassin/trunk/masses/rule-qa/corpus-nightly
Log:
update of corpus tools
- more retries and reliability stuff for corpus-nightly
- more complete crontab
- updates to summary script
Modified: incubator/spamassassin/trunk/masses/rule-qa/README.nightly
==============================================================================
--- incubator/spamassassin/trunk/masses/rule-qa/README.nightly (original)
+++ incubator/spamassassin/trunk/masses/rule-qa/README.nightly Sun May 9 13:17:04 2004
@@ -15,44 +15,27 @@
$HOME/cvs/spamassassin - the tree (checked out with the correct tag)
$HOME/cvs/spamassassin/corpus - the corpus description (for "mass-check -f")
- - $HOME/.corpus contains various settings:
+ - $HOME/.corpus contains various settings, see corpus.example in this
+ directory.
- # location of corpus file (mass-check -f $corpus)
- corpus=/home/corpus/corpus
+ - A cron job (set the hours for tagtime and corpus-nightly according
+ to your local time; this example is US/Pacific, so adjust
+ appropriately for your timezone)
+
+ The "tagtime" and "corpus-nightly" cron jobs should be run twice a
+ day to handle daylight saving time, since cron does not. They exit
+ unless it's 0800-0859 or 1000-1059 UTC (which means you can run
+ "corpus-nightly" at any other time of day if you want).
- # location for summary results
- html=/home/html/root/users/corpus
+ The "corpus-hourly" script only needs to be run if you are producing
+ optional mass-check summary reports.
- # location of tagtime file
- tagtime=/home/corpus/log/tagtime
-
- # temporary working directory for summary results
- tmp=/home/corpus/tmp
-
- # subversion directory location
- tree=/home/corpus/svn/spamassassin
-
- # rsync username and password
- username=joe
- password=xyzzy
-
- # weekly and nightly mass-check options
- opts_weekly="--restart=500 --tail=15000 --net -j 8 -f /home/corpus/mail/corpus"
- opts_nightly="--restart=500 --tail=15000 -f /home/corpus/mail/corpus"
-
- # weekly and nightly mass-check user_prefs files
- prefs_weekly=/home/corpus/mail/user_prefs.weekly
- prefs_nightly=/home/corpus/mail/user_prefs.nightly
-
- - the following cron job (hours for tagtime and corpus-nightly for your
- local time, this is US/Pacific)
-
-The "tagtime" and "corpus-nightly" cron jobs run twice a day due to
-daylight savings, but exit if it's 0800-0859 or 1000-1059 UTC (which
-means you can "corpus-nightly" any other time of day if you want).
+ "pyzor discover" only needs to be run if you are running Pyzor.
------- start of cut text --------------
-0 1,2 * * * /home/corpus/scripts/corpus-tagtime
-10 1,2 * * * /home/corpus/scripts/corpus-nightly >/home/corpus/log/nightly 2>&1
-30 * * * * /home/corpus/scripts/corpus-hourly >/home/corpus/log/hourly 2>&1
+PATH=/home/corpus/scripts:/usr/local/bin:/usr/local/sbin:/bin:/sbin:/usr/bin:/usr/sbin
+0 1,2 * * * corpus-tagtime
+10 1,2 * * * corpus-nightly >/home/corpus/log/nightly 2>&1
+30 * * * * corpus-hourly >/home/corpus/log/hourly 2>&1
+5 1 * * * pyzor discover >/dev/null 2>/dev/null
------- end ----------------------------
Modified: incubator/spamassassin/trunk/masses/rule-qa/corpus-hourly
==============================================================================
--- incubator/spamassassin/trunk/masses/rule-qa/corpus-hourly (original)
+++ incubator/spamassassin/trunk/masses/rule-qa/corpus-hourly Sun May 9 13:17:04 2004
@@ -73,7 +73,12 @@
close(FIND);
if (! $files) {
print STDERR "no new corpus files\n";
- exit 0;
+ if (rand(24) > 1) {
+ exit 0;
+ }
+ else {
+ print STDERR "updating anyway\n";
+ }
}
}
open(RSYNC, "> rsync.last");
@@ -147,18 +152,26 @@
sub current {
for my $class ("DETAILS", "HTML", "NET") {
- for my $age ("new", "all", "age", "1day", "2day", "7day") {
+# for my $age ("new", "all", "age", "1day", "2day", "7day") {
+ for my $age ("new", "all", "age") {
+ print STDERR "generating $class.$age\n";
+
+ next if ($class eq "NET" && $age !~ /^(?:new|all|age|7day)$/);
+
my @ham = grep { /^ham/ } @files;
my @spam = grep { /^spam/ } @files;
- chdir $opt{corpus};
+ print STDERR "ham: " . join(' ', @ham) . "\n";
+ print STDERR "spam: " . join(' ', @spam) . "\n";
- next if ($class eq "NET" && $age !~ /^(?:new|all|age|7day)$/);
+ chdir $opt{corpus};
# net vs. local
if ($class eq "NET") {
@ham = grep { /-net-/ } @ham;
@spam = grep { /-net-/ } @spam;
+ print STDERR "ham: " . join(' ', @ham) . "\n";
+ print STDERR "spam: " . join(' ', @spam) . "\n";
}
else {
# if both net and local exist, use newer
@@ -173,30 +186,30 @@
}
while (my ($user, $count) = each %ham) {
if ($count > 1) {
- my @matches = grep { m/-$user\.log$/ } @ham;
- my $new;
- for (@matches) {
- if (!defined $new || -M $_ < -M $new) {
- $new = $_;
- }
+ my $nightly = "ham-$user.log";
+ my $weekly = "ham-net-$user.log";
+ if ($revision{$nightly} >= $revision{$weekly}) {
+ @ham = grep { $_ ne $weekly } @ham;
+ }
+ else {
+ @ham = grep { $_ ne $nightly } @ham;
}
- next unless $new;
- @ham = grep { !/-$user\.log$/ || $_ eq $new } @ham;
}
}
while (my ($user, $count) = each %spam) {
if ($count > 1) {
- my @matches = grep { m/-$user\.log$/ } @spam;
- my $new;
- for (@matches) {
- if (!defined $new || -M $_ < -M $new) {
- $new = $_;
- }
+ my $nightly = "spam-$user.log";
+ my $weekly = "spam-net-$user.log";
+ if ($revision{$nightly} >= $revision{$weekly}) {
+ @spam = grep { $_ ne $weekly } @spam;
+ }
+ else {
+ @spam = grep { $_ ne $nightly } @spam;
}
- next unless $new;
- @spam = grep { !/-$user\.log$/ || $_ eq $new } @spam;
}
}
+ print STDERR "ham: " . join(' ', @ham) . "\n";
+ print STDERR "spam: " . join(' ', @spam) . "\n";
}
# age
@@ -210,17 +223,23 @@
}
@spam = grep { $revision{$_} eq $wanted } @spam;
@ham = grep { $revision{$_} eq $wanted } @ham;
+ print STDERR "ham: " . join(' ', @ham) . "\n";
+ print STDERR "spam: " . join(' ', @spam) . "\n";
}
elsif ($age =~ /^(?:new|all|age)$/) {
@ham = grep { -M "$_" < -M $opt{tagtime} } @ham;
@spam = grep { -M "$_" < -M $opt{tagtime} } @spam;
@ham = grep { $revision{$_} eq $revision } @ham;
@spam = grep { $revision{$_} eq $revision } @spam;
+ print STDERR "ham: " . join(' ', @ham) . "\n";
+ print STDERR "spam: " . join(' ', @spam) . "\n";
}
elsif ($age =~ /(\d+)day/) {
my $mtime = $1;
@ham = grep { -M "$_" < $mtime } @ham;
@spam = grep { -M "$_" < $mtime } @spam;
+ print STDERR "ham: " . join(' ', @ham) . "\n";
+ print STDERR "spam: " . join(' ', @spam) . "\n";
}
open(OUT, "> $opt{html}/$class.$age");
Modified: incubator/spamassassin/trunk/masses/rule-qa/corpus-nightly
==============================================================================
--- incubator/spamassassin/trunk/masses/rule-qa/corpus-nightly (original)
+++ incubator/spamassassin/trunk/masses/rule-qa/corpus-nightly Sun May 9 13:17:04 2004
@@ -28,10 +28,11 @@
cd $tree
# find current revision
-rm -f weekly-versions.txt
-rm -f nightly-versions.txt
-wget http://rsync.spamassassin.org/weekly-versions.txt
-wget http://rsync.spamassassin.org/nightly-versions.txt
+rm -f weekly-versions.txt nightly-versions.txt
+if ! wget --tries=120 --wait=30 http://rsync.spamassassin.org/weekly-versions.txt http://rsync.spamassassin.org/nightly-versions.txt; then
+ echo "wget failed" 1>&2
+ exit 0
+fi
if [ -n "$net" ]; then
revision=$(tail -1 weekly-versions.txt|awk '{print $2}')
else
@@ -42,14 +43,18 @@
set +e
retry=0
while true; do
+ killall -TERM svn
+ sleep 10
+ killall -KILL svn
if svn update -r $revision; then
break;
fi
if [ $retry -eq 120 ]; then
+ echo "svn update failed" 1>&2
exit 1
fi
retry=$(( $retry + 1 ))
- sleep 30
+ sleep 20
done
set -e
@@ -75,6 +80,22 @@
uptime
date > test.end
-# submit results
-rsync -CPcvuzb --timeout=120 ham.log $username@rsync.spamassassin.org::corpus/ham-$net$username.log
-rsync -CPcvuzb --timeout=120 spam.log $username@rsync.spamassassin.org::corpus/spam-$net$username.log
+# results name
+mv spam.log spam-$net$username.log
+mv ham.log ham-$net$username.log
+
+# rsync
+set +e
+retry=0
+while true; do
+ if rsync -CPcvuzb --timeout=120 spam-$net$username.log ham-$net$username.log $username@rsync.spamassassin.org::corpus/; then
+ break;
+ fi
+ if [ $retry -eq 120 ]; then
+ echo "rsync failed" 1>&2
+ exit 1
+ fi
+ retry=$(( $retry + 1 ))
+ sleep 30
+done
+set -e
Added: incubator/spamassassin/trunk/masses/rule-qa/corpus.example
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/masses/rule-qa/corpus.example Sun May 9 13:17:04 2004
@@ -0,0 +1,26 @@
+# location of corpus file (mass-check -f $corpus)
+corpus=/home/corpus/corpus
+
+# location for summary results
+html=/home/html/root/users/corpus
+
+# location of tagtime file
+tagtime=/home/corpus/log/tagtime
+
+# temporary working directory for summary results
+tmp=/home/corpus/tmp
+
+# subversion directory location
+tree=/home/corpus/svn/spamassassin
+
+# rsync username and password
+username=joe
+password=xyzzy
+
+# weekly and nightly mass-check options
+opts_weekly="--restart=500 --tail=15000 --net -j 8 -f /home/corpus/mail/corpus"
+opts_nightly="--restart=500 --tail=15000 -f /home/corpus/mail/corpus"
+
+# weekly and nightly mass-check user_prefs files
+prefs_weekly=/home/corpus/mail/user_prefs.weekly
+prefs_nightly=/home/corpus/mail/user_prefs.nightly