You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by pd...@apache.org on 2019/06/01 09:51:32 UTC
svn commit: r1860471 - in /spamassassin/trunk/masses/rule-qa: corpus-cleanup
corpus-hourly
Author: pds
Date: Sat Jun 1 09:51:31 2019
New Revision: 1860471
URL: http://svn.apache.org/viewvc?rev=1860471&view=rev
Log:
Bug 7715
Added:
spamassassin/trunk/masses/rule-qa/corpus-cleanup
Modified:
spamassassin/trunk/masses/rule-qa/corpus-hourly
Added: spamassassin/trunk/masses/rule-qa/corpus-cleanup
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-qa/corpus-cleanup?rev=1860471&view=auto
==============================================================================
--- spamassassin/trunk/masses/rule-qa/corpus-cleanup (added)
+++ spamassassin/trunk/masses/rule-qa/corpus-cleanup Sat Jun 1 09:51:31 2019
@@ -0,0 +1,177 @@
+#!/usr/bin/perl
+
+# Recipient of the "early runners" report sent by email_beforenine().
+my $email_to = 'pds@apache.org';
+#my $email_to = 'ruleqa@spamassassin.apache.org';
+
+use strict;
+use Getopt::Long;
+
+# Directory holding the uploaded mass-check logs; set with --dir=PATH.
+our ( $corpusdir );
+GetOptions(
+  "dir=s" => \$corpusdir,
+) or die "usage: $0 --dir=CORPUSDIR\n";
+
+# Every later step builds its paths from $corpusdir, so refuse to run without
+# a usable directory rather than silently doing nothing.
+die "usage: $0 --dir=CORPUSDIR\n"
+  unless defined $corpusdir && length $corpusdir;
+die "$0: corpus directory '$corpusdir' does not exist\n"
+  unless -d $corpusdir;
+
+use File::Path;
+use File::Copy;
+use Time::ParseDate;
+use Cwd;
+use POSIX qw(nice strftime);
+
+# Raise our nice value by 15 so the cleanup runs at low priority.
+nice(15);
+
+# State shared by the subs below.  read_files() fills the per-file and
+# per-revision tables; rename_corpus() fills %before_nine.
+my (%revision, %logs_by_rev, %is_net_revision, %dateline, %time);
+my @files;
+my $time_start = time;
+my (%revision_date, %before_nine);
+
+# Retention periods in seconds: 9 days for revisions that have a "-net-"
+# log, 3 days for all others (see cleanup_old).
+my $delete_weekly  = 9 * 24 * 60 * 60;
+my $delete_nightly = 3 * 24 * 60 * 60;
+
+# Main sequence: tag new logs, index them, expire old ones, then report.
+rename_corpus();
+read_files();
+cleanup_old();
+email_beforenine();
+
+# Tag freshly uploaded logs with the SVN revision they were generated from.
+#
+# Every recent "spam-*.log" / "ham-*.log" in $corpusdir that has no ".r<rev>"
+# suffix yet is scanned for its leading '#' header lines:
+#   "# Date: YYYYMMDDTHHMMSSZ"  - a run started before 09:00 UTC on that date
+#                                 is recorded in %before_nine (file => epoch)
+#   "# SVN revision: <digits>"  - the file is renamed to "<name>.r<rev>.log"
+# Files that cannot be read, or that carry no usable revision, are left
+# untouched.  Takes no arguments; problems are reported with warn().
+sub rename_corpus {
+  my $dh;
+  unless (opendir($dh, $corpusdir)) {
+    warn "cannot read directory $corpusdir: $!";
+    return;
+  }
+  my @rfiles = sort readdir($dh);
+  closedir($dh);
+
+  # plain files, modified within the last 10 days, not yet revision-tagged
+  @rfiles = grep {
+    /^(?:spam|ham)-(?:net-)?[-\w]+\.log$/ && !(/\.r[0-9]+\.log$/) && -f "$corpusdir/$_" && -M _ < 10
+  } @rfiles;
+
+  foreach my $file (@rfiles) {
+    my $fh;
+    unless (open($fh, '<', "$corpusdir/$file")) {
+      warn "cannot read $corpusdir/$file: $!";
+      next;   # nothing to parse, so nothing to rename either
+    }
+
+    my $rev;
+    while (my $line = <$fh>) {
+      last if $line !~ /^#/;   # metadata ends at the first non-comment line
+      if ($line =~ m/^# Date:\s*(\S+)/) {
+        my $date_line = $1;
+        my ($yyyy, $mm, $dd, $h, $m, $s) = $date_line =~ /(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)Z/;
+
+        # only compare when the stamp really had the expected layout
+        if (defined $yyyy) {
+          my $timet = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} ${h}:${m}:${s} GMT+0",
+            GMT => 1, PREFER_PAST => 1);
+
+          my $timetgt = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} 09:00:00 GMT+0",
+            GMT => 1, PREFER_PAST => 1);
+
+          if (defined $timet && defined $timetgt && $timet < $timetgt) {
+            $before_nine{$file} = $timet;
+          }
+        }
+      }
+      if ($line =~ m/^# SVN revision:\s*(\S+)/) {
+        my $value = $1;
+        # The value becomes part of a file name, and read_files only picks
+        # up "*.r<digits>.log", so accept nothing but digits.
+        if ($value =~ /^[0-9]+$/) {
+          $rev = $value;
+        }
+        else {
+          warn "ignoring unusable SVN revision '$value' in $corpusdir/$file\n";
+        }
+      }
+    }
+
+    close($fh);
+
+    if ($rev) {
+      my $newfile = $file;
+      $newfile =~ s/\.log$/.r$rev.log/;
+      rename("$corpusdir/$file", "$corpusdir/$newfile")
+        or warn "cannot rename $corpusdir/$file to $corpusdir/$newfile: $!";
+    }
+
+  }
+}
+
+# Index the revision-tagged logs ("*.r<digits>.log") found in $corpusdir.
+#
+# For each recent log, the leading '#' header lines are scanned and these
+# file-level tables are filled:
+#   %dateline         file => raw value of the "# Date:" line
+#   %time             file => epoch of 09:00 UTC on the date of that line
+#   %revision         file => value of the "# SVN revision:" line
+#   %logs_by_rev      revision => [ files carrying that revision ]
+#   %is_net_revision  revision => 1 if any of its files has "-net-" in its name
+#   %revision_date    revision => smallest %time value among its files
+# @files is replaced with the list of logs considered.  Takes no arguments;
+# unreadable files are reported with warn() and skipped.
+sub read_files {
+  my $dh;
+  unless (opendir($dh, $corpusdir)) {
+    warn "cannot read directory $corpusdir: $!";
+    @files = ();
+    return;
+  }
+  @files = sort readdir($dh);
+  closedir($dh);
+
+  # plain files, modified within the last 10 days, already revision-tagged
+  @files = grep {
+    /^(?:spam|ham)-(?:net-)?[-\w]+\.r[0-9]+\.log$/ && -f "$corpusdir/$_" && -M _ < 10
+  } @files;
+
+  foreach my $file (@files) {
+    my $fh;
+    unless (open($fh, '<', "$corpusdir/$file")) {
+      warn "cannot read $corpusdir/$file: $!";
+      next;
+    }
+
+    while (my $line = <$fh>) {
+      last if $line !~ /^#/;   # metadata ends at the first non-comment line
+      if ($line =~ m/^# Date:\s*(\S+)/) {
+        $dateline{$file} = $1;
+        my ($yyyy, $mm, $dd, $h, $m, $s) = $dateline{$file} =~ /(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)Z/;
+
+        # Only the calendar date is used: every log is dated 09:00 UTC of
+        # the day it was started.  Stamps in another layout are ignored.
+        if (defined $yyyy) {
+          my $timetgt = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} 09:00:00 GMT+0",
+            GMT => 1, PREFER_PAST => 1);
+
+          $time{$file} = $timetgt;
+        }
+      }
+      if ($line =~ m/^# SVN revision:\s*(\S+)/) {
+        my $rev = $1;
+        $revision{$file} = $rev;
+
+        $logs_by_rev{$rev} ||= [ ];
+        push (@{$logs_by_rev{$rev}}, $file);
+
+        if ($file =~ /-net-/) {
+          $is_net_revision{$rev} = 1;
+        }
+      }
+    }
+    close($fh);
+
+    if ($time{$file} && $revision{$file}) {
+      my $rev = $revision{$file};
+
+      # keep the earliest date seen for any file of this revision
+      if (!defined $revision_date{$rev} || $time{$file} < $revision_date{$rev}) {
+        $revision_date{$rev} = $time{$file};
+      }
+    }
+  }
+}
+
+# Delete the logs of revisions that have aged out.
+#
+# A revision expires once its date in %revision_date is more than
+# $delete_weekly seconds before $time_start when it is flagged in
+# %is_net_revision, or more than $delete_nightly seconds otherwise.  All
+# files listed for an expired revision in %logs_by_rev are removed from
+# $corpusdir.  Takes no arguments; failed deletions are reported with warn().
+sub cleanup_old {
+  my @cleanup = ();
+
+  foreach my $revision (keys %revision_date) {
+    # set the retention period based on whether this is a net revision
+    my $max_age = $is_net_revision{$revision} ? $delete_weekly : $delete_nightly;
+    my $target_date = $time_start - $max_age;
+
+    next unless $revision_date{$revision} < $target_date;
+    push(@cleanup, @{ $logs_by_rev{$revision} || [] });
+  }
+
+  # a file may be listed more than once; delete (and complain) only once
+  my %seen;
+  foreach my $file (@cleanup) {
+    next if $seen{$file}++;
+    my $path = "$corpusdir/$file";
+    unlink($path) or warn "cannot delete $path: $!";
+  }
+}
+
+# Mail a report of the logs recorded in %before_nine (file name => epoch
+# start time) to $email_to through /usr/sbin/sendmail.
+#
+# Does nothing when %before_nine is empty.  Takes no arguments; a sendmail
+# that cannot be started, or that exits with an error, is reported with warn().
+sub email_beforenine {
+  return unless %before_nine;
+
+  my $from = 'automc@sa-vm1.apache.org';
+  my $subject = '[corpus-cleanup] Early runners';
+  my $message = "The following files were submitted by early runners:\n\n";
+  # sorted so the report is stable from run to run
+  foreach my $file (sort keys %before_nine) {
+    # The fields come from gmtime, so the offset is spelled out as +0000;
+    # %z would print the local offset of this host instead.
+    my $time = strftime("%F %R:%S +0000", gmtime($before_nine{$file}));
+    $message .= "$file - Started at $time\n";
+  }
+  $message .= "\nPlease run automasscheck after 0900 UTC";
+
+  my $mail;
+  unless (open($mail, '|-', '/usr/sbin/sendmail', '-t')) {
+    warn "cannot run /usr/sbin/sendmail: $!";
+    return;
+  }
+
+  # Email Header
+  print $mail "To: $email_to\n";
+  print $mail "From: $from\n";
+  print $mail "Subject: $subject\n";
+  print $mail "MIME-Version: 1.0\n";
+  print $mail "Content-Type: text/plain; charset=UTF-8\n";
+  print $mail "Content-Transfer-Encoding: 8bit\n";
+  print $mail "\n";
+  # Email Body
+  print $mail $message;
+
+  # closing a pipe waits for the child, so delivery errors surface here
+  close($mail) or warn "sendmail failed (exit status $?)";
+
+}
Modified: spamassassin/trunk/masses/rule-qa/corpus-hourly
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-qa/corpus-hourly?rev=1860471&r1=1860470&r2=1860471&view=diff
==============================================================================
--- spamassassin/trunk/masses/rule-qa/corpus-hourly (original)
+++ spamassassin/trunk/masses/rule-qa/corpus-hourly Sat Jun 1 09:51:31 2019
@@ -31,10 +31,12 @@ my %revision = ();
my %logs_by_rev = ();
my %is_net_revision = ();
my %time = ();
+my %revision_date = ();
my @files;
my @tmps = ();
my $skip = '';
my $time_start = time;
+$time_start -= ($time_start % 3600);
my $output_revpath;
&configure;
@@ -130,20 +132,26 @@ sub locate {
closedir(CORPUS);
@files = grep {
- /^(?:spam|ham)-(?:net-)?[-\w]+\.log$/ && -f "$corpusdir/$_" && -M _ < 10
+ /^(?:spam|ham)-(?:net-)?[-\w]+\.r[0-9]+\.log$/ && -f "$corpusdir/$_" && -M _ < 10
} @files;
foreach my $file (@files) {
# my $time = 0;
my $tag = 0;
+ my $revtime;
open(FILE, "$corpusdir/$file") or warn "cannot read $corpusdir/$file";
while (my $line = <FILE>) {
last if $line !~ /^#/;
- if ($line =~ /, on (... ... .. )(..)(:..:.. ... ....)/) {
- my ($datepre, $hh, $datepost) = ($1,$2,$3);
-
- my $timet = Time::ParseDate::parsedate($datepre.$hh.$datepost,
- GMT => 1, PREFER_PAST => 1);
+ if ($line =~ m/^# Date:\s*(\S+)/) {
+ my $date_line = $1;
+ my ($yyyy, $mm, $dd, $h, $m, $s) = $date_line =~ /(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)Z/;
+
+ my $timet = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} ${h}:${m}:${s} GMT+0",
+ GMT => 1, PREFER_PAST => 1);
+
+ $revtime = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} 09:00:00 GMT+0",
+ GMT => 1, PREFER_PAST => 1);
+
$time{$file} = $timet;
print "$corpusdir/$file: time=$timet\n";
@@ -166,6 +174,14 @@ sub locate {
}
}
close(FILE);
+ if ($revtime) {
+ my $rev = $revision{$file};
+ $revision_date{$rev} = $revtime unless defined $revision_date{$rev};
+
+ if ($revtime < $revision_date{$rev}) {
+ $revision_date{$rev} = $revtime;
+ }
+ }
# if (!$time) {
# $skip .= "# skipped $_: time is between 0800 UTC and 0900 UTC\n";
# }
@@ -189,10 +205,10 @@ sub sort_all {
}
sub time_filter {
- my ($after, $before) = @_;
+ my ($target, $after, $before) = @_;
if (/time=(\d+)/) {
- return (($time_start - $1 >= WEEK * $after) &&
- ($time_start - $1 < WEEK * $before));
+ return (($target - $1 >= WEEK * $after) &&
+ ($target - $1 < WEEK * $before));
}
return 0;
}
@@ -245,10 +261,10 @@ sub gen_class {
my %ham;
for my $file (@spam) {
- $spam{$1}++ if ($file =~ m/-(\w[-\w]+)\.log$/);
+ $spam{$1}++ if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
}
for my $file (@ham) {
- $ham{$1}++ if ($file =~ m/-(\w[-\w]+)\.log$/);
+ $ham{$1}++ if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
}
while (my ($user, $count) = each %ham) {
if ($count > 1) {
@@ -301,15 +317,14 @@ sub gen_class {
return;
}
- my $time = $time{$ham[0]}; # use the ham file's time
+ my $time = $revision_date{$rev};
my $dir = create_outputdir($rev, $time);
my $fname = "$dir/$class.$age";
-
# now, if the target file already exists, check to see if it's newer
# than all the sources, make-style
if (-f $fname) {
- my $targetfreshness = (-M $fname);
+ my $targetfreshness = (-M $fname) + (6*3600);
my $needsrebuild = 0;
foreach my $srcfile (@spam, @ham) {
@@ -347,6 +362,12 @@ sub gen_class {
warn "cannot rename $zf.$$ to $zf";
$bytes += (-s $zf);
}
+ # reduce the number of times that the raw logs are copied over
+ my $tmpfname = "$fname.$$";
+ open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname";
+ print OUT "$$ : $time_start";
+ close(OUT);
+ rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";
}
else {
my $tmpfname = "$fname.$$";
@@ -378,10 +399,10 @@ sub gen_class {
my @output;
for my $file (@spam) {
- $spam{$1} = $file if ($file =~ m/-(\w[-\w]+)\.log$/);
+ $spam{$1} = $file if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
}
for my $file (@ham) {
- $ham{$1} = $file if ($file =~ m/-(\w[-\w]+)\.log$/);
+ $ham{$1} = $file if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
}
unlink "$opt{tmp}/ham.log.$$";
unlink "$opt{tmp}/spam.log.$$";
@@ -423,7 +444,7 @@ sub gen_class {
for my $file (@array) {
open(IN, $file) or warn "cannot read $file";
while (<IN>) {
- print TMP $_ if time_filter($after, $before);
+ print TMP $_ if time_filter($time, $after, $before);
}
close(IN);
}