You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by qu...@apache.org on 2004/05/09 22:17:05 UTC

svn commit: rev 10569 - incubator/spamassassin/trunk/masses/rule-qa

Author: quinlan
Date: Sun May  9 13:17:04 2004
New Revision: 10569

Added:
   incubator/spamassassin/trunk/masses/rule-qa/corpus.example
Modified:
   incubator/spamassassin/trunk/masses/rule-qa/README.nightly
   incubator/spamassassin/trunk/masses/rule-qa/corpus-hourly
   incubator/spamassassin/trunk/masses/rule-qa/corpus-nightly
Log:
update of corpus tools
- more retries and reliability stuff for corpus-nightly
- more complete crontab
- updates to summary script


Modified: incubator/spamassassin/trunk/masses/rule-qa/README.nightly
==============================================================================
--- incubator/spamassassin/trunk/masses/rule-qa/README.nightly	(original)
+++ incubator/spamassassin/trunk/masses/rule-qa/README.nightly	Sun May  9 13:17:04 2004
@@ -15,44 +15,27 @@
    $HOME/cvs/spamassassin        - the tree (checked out with the correct tag)
    $HOME/cvs/spamassassin/corpus - the corpus description (for "mass-check -f")
 
- - $HOME/.corpus contains various settings:
+ - $HOME/.corpus contains various settings, see corpus.example in this
+   directory.
 
-   # location of corpus file (mass-check -f $corpus)
-   corpus=/home/corpus/corpus
+ - A cron job (set the hours for tagtime and corpus-nightly in your
+   local time; the example below is for US/Pacific, so adjust it
+   appropriately for your timezone)
+
+   The "tagtime" and "corpus-nightly" cron jobs should be run twice a
+   day to handle daylight saving time, since cron does not.  They exit
+   if it's 0800-0859 or 1000-1059 UTC (which means you can run
+   "corpus-nightly" at any other time of day if you want).
 
-   # location for summary results
-   html=/home/html/root/users/corpus
+   The "corpus-hourly" script only needs to be run if you are producing
+   optional mass-check summary reports.
 
-   # location of tagtime file
-   tagtime=/home/corpus/log/tagtime
-
-   # temporary working directory for summary results
-   tmp=/home/corpus/tmp
-
-   # subversion directory location
-   tree=/home/corpus/svn/spamassassin
-
-   # rsync username and password
-   username=joe
-   password=xyzzy
-
-   # weekly and nightly mass-check options
-   opts_weekly="--restart=500 --tail=15000 --net -j 8 -f /home/corpus/mail/corpus"
-   opts_nightly="--restart=500 --tail=15000 -f /home/corpus/mail/corpus"
-
-   # weekly and nightly mass-check user_prefs files
-   prefs_weekly=/home/corpus/mail/user_prefs.weekly
-   prefs_nightly=/home/corpus/mail/user_prefs.nightly
-
- - the following cron job (hours for tagtime and corpus-nightly for your
-   local time, this is US/Pacific)
-
-The "tagtime" and "corpus-nightly" cron jobs run twice a day due to
-daylight savings, but exit if it's 0800-0859 or 1000-1059 UTC (which
-means you can "corpus-nightly" any other time of day if you want).
+   "pyzor discover" only needs to be run if you are running Pyzor.
 
 ------- start of cut text --------------
-0 1,2 * * * /home/corpus/scripts/corpus-tagtime
-10 1,2 * * * /home/corpus/scripts/corpus-nightly >/home/corpus/log/nightly 2>&1
-30 * * * * /home/corpus/scripts/corpus-hourly >/home/corpus/log/hourly 2>&1
+PATH=/home/corpus/scripts:/usr/local/bin:/usr/local/sbin:/bin:/sbin:/usr/bin:/usr/sbin
+0 1,2 * * * corpus-tagtime
+10 1,2 * * * corpus-nightly >/home/corpus/log/nightly 2>&1
+30 * * * * corpus-hourly >/home/corpus/log/hourly 2>&1
+5 1 * * * pyzor discover >/dev/null 2>/dev/null
 ------- end ----------------------------

Modified: incubator/spamassassin/trunk/masses/rule-qa/corpus-hourly
==============================================================================
--- incubator/spamassassin/trunk/masses/rule-qa/corpus-hourly	(original)
+++ incubator/spamassassin/trunk/masses/rule-qa/corpus-hourly	Sun May  9 13:17:04 2004
@@ -73,7 +73,12 @@
 	close(FIND);
 	if (! $files) {
 	    print STDERR "no new corpus files\n";
-	    exit 0;
+	    if (rand(24) > 1) {
+		exit 0;
+	    }
+	    else {
+		print STDERR "updating anyway\n";
+	    }
 	}
     }
     open(RSYNC, "> rsync.last");
@@ -147,18 +152,26 @@
 
 sub current {
     for my $class ("DETAILS", "HTML", "NET") {
-	for my $age ("new", "all", "age", "1day", "2day", "7day") {
+#	for my $age ("new", "all", "age", "1day", "2day", "7day") {
+	for my $age ("new", "all", "age") {
+	    print STDERR "generating $class.$age\n";
+
+	    next if ($class eq "NET" && $age !~ /^(?:new|all|age|7day)$/);
+
 	    my @ham = grep { /^ham/ } @files;
 	    my @spam = grep { /^spam/ } @files;
 
-	    chdir $opt{corpus};
+	    print STDERR "ham: " . join(' ', @ham) . "\n";
+	    print STDERR "spam: " . join(' ', @spam) . "\n";
 
-	    next if ($class eq "NET" && $age !~ /^(?:new|all|age|7day)$/);
+	    chdir $opt{corpus};
 
 	    # net vs. local
 	    if ($class eq "NET") {
 		@ham = grep { /-net-/ } @ham;
 		@spam = grep { /-net-/ } @spam;
+		print STDERR "ham: " . join(' ', @ham) . "\n";
+		print STDERR "spam: " . join(' ', @spam) . "\n";
 	    }
 	    else {
 		# if both net and local exist, use newer
@@ -173,30 +186,30 @@
 		}
 		while (my ($user, $count) = each %ham) {
 		    if ($count > 1) {
-			my @matches = grep { m/-$user\.log$/ } @ham;
-			my $new;
-			for (@matches) {
-			    if (!defined $new || -M $_ < -M $new) {
-				$new = $_;
-			    }
+			my $nightly = "ham-$user.log";
+			my $weekly = "ham-net-$user.log";
+			if ($revision{$nightly} >= $revision{$weekly}) {
+			    @ham = grep { $_ ne $weekly } @ham;
+			}
+			else {
+			    @ham = grep { $_ ne $nightly } @ham;
 			}
-			next unless $new;
-			@ham = grep { !/-$user\.log$/ || $_ eq $new } @ham;
 		    }
 		}
 		while (my ($user, $count) = each %spam) {
 		    if ($count > 1) {
-			my @matches = grep { m/-$user\.log$/ } @spam;
-			my $new;
-			for (@matches) {
-			    if (!defined $new || -M $_ < -M $new) {
-				$new = $_;
-			    }
+			my $nightly = "spam-$user.log";
+			my $weekly = "spam-net-$user.log";
+			if ($revision{$nightly} >= $revision{$weekly}) {
+			    @spam = grep { $_ ne $weekly } @spam;
+			}
+			else {
+			    @spam = grep { $_ ne $nightly } @spam;
 			}
-			next unless $new;
-			@spam = grep { !/-$user\.log$/ || $_ eq $new } @spam;
 		    }
 		}
+		print STDERR "ham: " . join(' ', @ham) . "\n";
+		print STDERR "spam: " . join(' ', @spam) . "\n";
 	    }
 	    
 	    # age
@@ -210,17 +223,23 @@
 		}
 		@spam = grep { $revision{$_} eq $wanted } @spam;
 		@ham = grep { $revision{$_} eq $wanted } @ham;
+		print STDERR "ham: " . join(' ', @ham) . "\n";
+		print STDERR "spam: " . join(' ', @spam) . "\n";
 	    }
 	    elsif ($age =~ /^(?:new|all|age)$/) {
 		@ham = grep { -M "$_" < -M $opt{tagtime} } @ham;
 		@spam = grep { -M "$_" < -M $opt{tagtime} } @spam;
 		@ham = grep { $revision{$_} eq $revision } @ham;
 		@spam = grep { $revision{$_} eq $revision } @spam;
+		print STDERR "ham: " . join(' ', @ham) . "\n";
+		print STDERR "spam: " . join(' ', @spam) . "\n";
 	    }
 	    elsif ($age =~ /(\d+)day/) {
 		my $mtime = $1;
 		@ham = grep { -M "$_" < $mtime } @ham;
 		@spam = grep { -M "$_" < $mtime } @spam;
+		print STDERR "ham: " . join(' ', @ham) . "\n";
+		print STDERR "spam: " . join(' ', @spam) . "\n";
 	    }
 	    
 	    open(OUT, "> $opt{html}/$class.$age");

Modified: incubator/spamassassin/trunk/masses/rule-qa/corpus-nightly
==============================================================================
--- incubator/spamassassin/trunk/masses/rule-qa/corpus-nightly	(original)
+++ incubator/spamassassin/trunk/masses/rule-qa/corpus-nightly	Sun May  9 13:17:04 2004
@@ -28,10 +28,11 @@
 cd $tree
 
 # find current revision
-rm -f weekly-versions.txt
-rm -f nightly-versions.txt
-wget http://rsync.spamassassin.org/weekly-versions.txt
-wget http://rsync.spamassassin.org/nightly-versions.txt
+rm -f weekly-versions.txt nightly-versions.txt
+if ! wget --tries=120 --wait=30 http://rsync.spamassassin.org/weekly-versions.txt http://rsync.spamassassin.org/nightly-versions.txt; then
+	echo "wget failed" 1>&2
+	exit 0
+fi
 if [ -n "$net" ]; then
 	revision=$(tail -1 weekly-versions.txt|awk '{print $2}')
 else
@@ -42,14 +43,18 @@
 set +e
 retry=0
 while true; do
+	killall -TERM svn
+	sleep 10
+	killall -KILL svn
 	if svn update -r $revision; then
 		break;
 	fi
 	if [ $retry -eq 120 ]; then
+		echo "svn update failed" 1>&2
 		exit 1
 	fi
 	retry=$(( $retry + 1 ))
-	sleep 30
+	sleep 20
 done
 set -e
 
@@ -75,6 +80,22 @@
 uptime
 date > test.end
 
-# submit results
-rsync -CPcvuzb --timeout=120 ham.log $username@rsync.spamassassin.org::corpus/ham-$net$username.log
-rsync -CPcvuzb --timeout=120 spam.log $username@rsync.spamassassin.org::corpus/spam-$net$username.log
+# results name
+mv spam.log spam-$net$username.log
+mv ham.log ham-$net$username.log
+
+# rsync
+set +e
+retry=0
+while true; do
+	if rsync -CPcvuzb --timeout=120 spam-$net$username.log ham-$net$username.log $username@rsync.spamassassin.org::corpus/; then
+		break;
+	fi
+	if [ $retry -eq 120 ]; then
+		echo "rsync failed" 1>&2
+		exit 1
+	fi
+	retry=$(( $retry + 1 ))
+	sleep 30
+done
+set -e

Added: incubator/spamassassin/trunk/masses/rule-qa/corpus.example
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/masses/rule-qa/corpus.example	Sun May  9 13:17:04 2004
@@ -0,0 +1,26 @@
+# location of corpus file (mass-check -f $corpus)
+corpus=/home/corpus/corpus
+
+# location for summary results
+html=/home/html/root/users/corpus
+
+# location of tagtime file
+tagtime=/home/corpus/log/tagtime
+
+# temporary working directory for summary results
+tmp=/home/corpus/tmp
+
+# subversion directory location
+tree=/home/corpus/svn/spamassassin
+
+# rsync username and password
+username=joe
+password=xyzzy
+
+# weekly and nightly mass-check options
+opts_weekly="--restart=500 --tail=15000 --net -j 8 -f /home/corpus/mail/corpus"
+opts_nightly="--restart=500 --tail=15000 -f /home/corpus/mail/corpus"
+
+# weekly and nightly mass-check user_prefs files
+prefs_weekly=/home/corpus/mail/user_prefs.weekly
+prefs_nightly=/home/corpus/mail/user_prefs.nightly