You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by pd...@apache.org on 2019/06/01 09:51:32 UTC

svn commit: r1860471 - in /spamassassin/trunk/masses/rule-qa: corpus-cleanup corpus-hourly

Author: pds
Date: Sat Jun  1 09:51:31 2019
New Revision: 1860471

URL: http://svn.apache.org/viewvc?rev=1860471&view=rev
Log:
Bug 7715

Added:
    spamassassin/trunk/masses/rule-qa/corpus-cleanup
Modified:
    spamassassin/trunk/masses/rule-qa/corpus-hourly

Added: spamassassin/trunk/masses/rule-qa/corpus-cleanup
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-qa/corpus-cleanup?rev=1860471&view=auto
==============================================================================
--- spamassassin/trunk/masses/rule-qa/corpus-cleanup (added)
+++ spamassassin/trunk/masses/rule-qa/corpus-cleanup Sat Jun  1 09:51:31 2019
@@ -0,0 +1,177 @@
+#!/usr/bin/perl
+
+# Recipient of the "early runners" report that email_beforenine sends.
+my $email_to = 'pds@apache.org';
+#my $email_to = 'ruleqa@spamassassin.apache.org';
+
+use strict;
+use Getopt::Long;
+
+# --dir=PATH  directory that holds the corpus log files (required).
+our ( $corpusdir );
+GetOptions(
+    "dir=s" => \$corpusdir,
+) or die "usage: $0 --dir=<corpus directory>\n";
+
+# Without a usable directory every later pass would silently do nothing,
+# so fail loudly instead.
+die "usage: $0 --dir=<corpus directory>\n"
+  unless defined $corpusdir && length $corpusdir;
+die "$0: corpus directory '$corpusdir' does not exist\n"
+  unless -d $corpusdir;
+
+use File::Path;
+use File::Copy;
+use Time::ParseDate;
+use Cwd;
+use POSIX qw(nice strftime);
+
+# Raise our nice value so the cleanup yields CPU to other processes.
+nice(15);
+
+# Moment the script started; all age comparisons are made against it.
+my $time_start = time;
+
+# Age limits in seconds: 9 days for revisions that have a "-net-" log,
+# 3 days for all other revisions.
+my $delete_weekly = 9 * 24 * 60 * 60;
+my $delete_nightly = 3 * 24 * 60 * 60;
+
+# Revision-tagged log file names found by read_files.
+my @files;
+
+# Keyed by log file name:
+#   %revision     SVN revision from the log header
+#   %dateline     raw value of the "# Date:" header
+#   %time         epoch time of 09:00 GMT on the log's date
+#   %before_nine  start time of logs stamped earlier than 09:00 GMT
+my (%revision, %dateline, %time, %before_nine);
+
+# Keyed by SVN revision:
+#   %logs_by_rev      array ref of the log files carrying that revision
+#   %is_net_revision  true when one of those files has "-net-" in its name
+#   %revision_date    earliest %time value among those files
+my (%logs_by_rev, %is_net_revision, %revision_date);
+
+# Run the passes in order: tag new logs with their revision, index the
+# tagged logs, delete expired revisions, then report early runners.
+rename_corpus();
+read_files();
+cleanup_old();
+email_beforenine();
+
+# Rename untagged corpus logs so that the SVN revision recorded in their
+# header becomes part of the file name: "<name>.log" -> "<name>.r<rev>.log".
+#
+# Only plain files in $corpusdir are considered, and only when they match
+# (spam|ham)-[net-]<name>.log, do not already end in ".r<digits>.log", and
+# were modified less than 10 days before the script started.
+#
+# While reading the leading "#" header lines, a file whose "# Date:" stamp
+# (YYYYMMDDThhmmssZ) is earlier than 09:00 GMT on that same day is recorded
+# in %before_nine, keyed by its name before renaming, with the stamp as
+# epoch seconds.
+#
+# Takes no arguments; the return value is not meaningful.  Unreadable
+# directories or files and failed renames are reported with warn and
+# skipped.
+sub rename_corpus {
+  my $dh;
+  unless (opendir($dh, $corpusdir)) {
+    warn "cannot open directory $corpusdir: $!";
+    return;
+  }
+  my @rfiles = sort readdir($dh);
+  closedir($dh);
+
+  @rfiles = grep {
+    /^(?:spam|ham)-(?:net-)?[-\w]+\.log$/ && !(/\.r[0-9]+\.log$/) && -f "$corpusdir/$_" && -M _ < 10
+  } @rfiles;
+
+  foreach my $file (@rfiles) {
+    my $fh;
+    unless (open($fh, '<', "$corpusdir/$file")) {
+      # nothing can be learned from an unreadable file, so leave it alone
+      warn "cannot read $corpusdir/$file: $!";
+      next;
+    }
+
+    my $rev;
+    while (my $line = <$fh>) {
+      # the header is the leading run of comment lines
+      last if $line !~ /^#/;
+      if ($line =~ m/^# Date:\s*(\S+)/) {
+        my $date_line = $1;
+        # only hand the stamp to parsedate when it has the expected shape
+        if (my ($yyyy, $mm, $dd, $h, $m, $s) = $date_line =~ /(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)Z/) {
+          my $timet = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} ${h}:${m}:${s} GMT+0",
+                    GMT => 1, PREFER_PAST => 1);
+
+          my $timetgt = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} 09:00:00 GMT+0",
+                    GMT => 1, PREFER_PAST => 1);
+
+          if (defined $timet && defined $timetgt && $timet < $timetgt) {
+            $before_nine{$file} = $timet;
+          }
+        }
+      }
+      if ($line =~ m/^# SVN revision:\s*(\S+)/) {
+        # if the header repeats, the last value wins
+        $rev = $1;
+      }
+    }
+    close($fh);
+
+    # NOTE(review): the revision string is used in the new file name as-is;
+    # confirm that log headers can be trusted to contain only a number.
+    if ($rev) {
+      my $newfile = $file;
+      $newfile =~ s/\.log$/.r$rev.log/;
+      rename("$corpusdir/$file", "$corpusdir/$newfile")
+        or warn "cannot rename $corpusdir/$file to $corpusdir/$newfile: $!";
+    }
+  }
+}
+
+# Index the revision-tagged corpus logs in $corpusdir.
+#
+# Considers plain files named (spam|ham)-[net-]<name>.r<digits>.log that
+# were modified less than 10 days before the script started, and reads the
+# leading "#" header lines of each one.  Fills these file-level tables:
+#   @files            the matching file names, sorted
+#   %dateline         file => raw "# Date:" value
+#   %time             file => epoch time of 09:00 GMT on the log's date
+#   %revision         file => SVN revision from the header
+#   %logs_by_rev      revision => array ref of files with that revision
+#   %is_net_revision  revision => 1 when one of its files contains "-net-"
+#   %revision_date    revision => earliest %time value among its files
+#
+# Each file is registered under one revision only (the last
+# "# SVN revision:" header seen), so a repeated header cannot queue the
+# same file twice.  Takes no arguments; the return value is not meaningful.
+sub read_files {
+  my $dh;
+  unless (opendir($dh, $corpusdir)) {
+    warn "cannot open directory $corpusdir: $!";
+    @files = ();
+    return;
+  }
+  @files = sort readdir($dh);
+  closedir($dh);
+
+  @files = grep {
+    /^(?:spam|ham)-(?:net-)?[-\w]+\.r[0-9]+\.log$/ && -f "$corpusdir/$_" && -M _ < 10
+  } @files;
+
+  foreach my $file (@files) {
+    my $fh;
+    unless (open($fh, '<', "$corpusdir/$file")) {
+      warn "cannot read $corpusdir/$file: $!";
+      next;
+    }
+
+    my $rev;
+    while (my $line = <$fh>) {
+      # the header is the leading run of comment lines
+      last if $line !~ /^#/;
+      if ($line =~ m/^# Date:\s*(\S+)/) {
+        $dateline{$file} = $1;
+        # the time of day is deliberately replaced by 09:00 GMT so every
+        # log from the same day gets the same timestamp
+        if (my ($yyyy, $mm, $dd) = $dateline{$file} =~ /(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)Z/) {
+          $time{$file} = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} 09:00:00 GMT+0",
+                    GMT => 1, PREFER_PAST => 1);
+        }
+      }
+      if ($line =~ m/^# SVN revision:\s*(\S+)/) {
+        $rev = $1;
+      }
+    }
+    close($fh);
+
+    next unless defined $rev;
+
+    $revision{$file} = $rev;
+    push(@{$logs_by_rev{$rev}}, $file);
+    $is_net_revision{$rev} = 1 if $file =~ /-net-/;
+
+    # remember the earliest date seen for this revision
+    if ($time{$file} && $rev) {
+      if (!defined $revision_date{$rev} || $time{$file} < $revision_date{$rev}) {
+        $revision_date{$rev} = $time{$file};
+      }
+    }
+  }
+}
+
+# Delete the log files of every revision that has expired.
+#
+# A revision expires when its date in %revision_date is more than
+# $delete_weekly seconds before $time_start if %is_net_revision marks it,
+# or more than $delete_nightly seconds before $time_start otherwise.  All
+# files listed for an expired revision in %logs_by_rev are removed from
+# $corpusdir.
+#
+# Takes no arguments; the return value is not meaningful.  A file that
+# cannot be deleted is reported with warn and skipped.
+sub cleanup_old {
+  my %seen;
+
+  foreach my $revision (keys %revision_date) {
+    # pick the age limit based on whether this is a net revision
+    my $max_age = $is_net_revision{$revision} ? $delete_weekly : $delete_nightly;
+    next unless $revision_date{$revision} < $time_start - $max_age;
+
+    foreach my $file (@{$logs_by_rev{$revision} || []}) {
+      my $path = "$corpusdir/$file";
+      # a file may be listed more than once; delete it only once
+      next if $seen{$path}++;
+      unlink($path) or warn "cannot delete $path: $!";
+    }
+  }
+}
+
+# Mail a report of the logs recorded in %before_nine to $email_to.
+#
+# Does nothing when %before_nine is empty.  Otherwise lists each file name
+# with its start time, formatted in UTC, and pipes the message to
+# "/usr/sbin/sendmail -t".
+#
+# Takes no arguments; the return value is not meaningful.  Failure to
+# start sendmail, or a non-zero exit from it, is reported with warn.
+sub email_beforenine {
+  return unless %before_nine;
+
+  my $from = 'automc@sa-vm1.apache.org';
+  my $subject = '[corpus-cleanup] Early runners';
+  my $message = "The following files were submitted by early runners:\n\n";
+  foreach my $file (sort keys %before_nine) {
+    # The fields come from gmtime, so the offset is written literally:
+    # strftime's %z would print the host's local offset instead.
+    my $started = strftime("%Y-%m-%d %H:%M:%S +0000", gmtime($before_nine{$file}));
+    $message .= "$file - Started at $started\n";
+  }
+  $message .= "\nPlease run automasscheck after 0900 UTC";
+
+  # list form: sendmail is run directly, without a shell
+  my $mail;
+  unless (open($mail, '|-', '/usr/sbin/sendmail', '-t')) {
+    warn "cannot run /usr/sbin/sendmail: $!";
+    return;
+  }
+
+  # Email Header
+  print $mail "To: $email_to\n";
+  print $mail "From: $from\n";
+  print $mail "Subject: $subject\n";
+  print $mail "MIME-Version: 1.0\n";
+  print $mail "Content-Type: text/plain; charset=UTF-8\n";
+  print $mail "Content-Transfer-Encoding: 8bit\n";
+  print $mail "\n";
+  # Email Body
+  print $mail $message;
+
+  # closing a pipe waits for the child; false means it failed
+  close($mail)
+    or warn "sendmail did not complete successfully (status=$?, error=$!)";
+}

Modified: spamassassin/trunk/masses/rule-qa/corpus-hourly
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-qa/corpus-hourly?rev=1860471&r1=1860470&r2=1860471&view=diff
==============================================================================
--- spamassassin/trunk/masses/rule-qa/corpus-hourly (original)
+++ spamassassin/trunk/masses/rule-qa/corpus-hourly Sat Jun  1 09:51:31 2019
@@ -31,10 +31,12 @@ my %revision = ();
 my %logs_by_rev = ();
 my %is_net_revision = ();
 my %time = ();
+my %revision_date = ();
 my @files;
 my @tmps = ();
 my $skip = '';
 my $time_start = time;
+$time_start -= ($time_start % 3600);
 my $output_revpath;
 
 &configure;
@@ -130,20 +132,26 @@ sub locate {
   closedir(CORPUS);
 
   @files = grep {
-    /^(?:spam|ham)-(?:net-)?[-\w]+\.log$/ && -f "$corpusdir/$_" && -M _ < 10 
+    /^(?:spam|ham)-(?:net-)?[-\w]+\.r[0-9]+\.log$/ && -f "$corpusdir/$_" && -M _ < 10 
   } @files;
 
   foreach my $file (@files) {
     # my $time = 0;
     my $tag = 0;
+    my $revtime;
     open(FILE, "$corpusdir/$file") or warn "cannot read $corpusdir/$file";
     while (my $line = <FILE>) {
       last if $line !~ /^#/;
-      if ($line =~ /, on (... ... .. )(..)(:..:.. ... ....)/) {
-        my ($datepre, $hh, $datepost) = ($1,$2,$3);
-        
-        my $timet = Time::ParseDate::parsedate($datepre.$hh.$datepost,
-                    GMT => 1, PREFER_PAST => 1);
+      if ($line =~ m/^# Date:\s*(\S+)/) {
+        my $date_line = $1;
+        my ($yyyy, $mm, $dd, $h, $m, $s) = $date_line =~ /(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)Z/;
+
+        my $timet = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} ${h}:${m}:${s} GMT+0",
+                  GMT => 1, PREFER_PAST => 1);
+
+        $revtime = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} 09:00:00 GMT+0",
+                  GMT => 1, PREFER_PAST => 1);
+
         $time{$file} = $timet;
         print "$corpusdir/$file: time=$timet\n";
 
@@ -166,6 +174,14 @@ sub locate {
       }
     }
     close(FILE);
+    if ($revtime) {
+      my $rev = $revision{$file};
+      $revision_date{$rev} = $revtime unless defined $revision_date{$rev};
+
+      if ($revtime < $revision_date{$rev}) {
+        $revision_date{$rev} = $revtime;
+      }
+    }
     # if (!$time) {
     # $skip .= "# skipped $_: time is between 0800 UTC and 0900 UTC\n";
     # }
@@ -189,10 +205,10 @@ sub sort_all {
 }
 
 sub time_filter {
-  my ($after, $before) = @_;
+  my ($target, $after, $before) = @_;
   if (/time=(\d+)/) {
-	return (($time_start - $1 >= WEEK * $after) &&
-		($time_start - $1 < WEEK * $before));
+	return (($target - $1 >= WEEK * $after) &&
+		($target - $1 < WEEK * $before));
   }
   return 0;
 }
@@ -245,10 +261,10 @@ sub gen_class {
     my %ham;
     
     for my $file (@spam) {
-      $spam{$1}++ if ($file =~ m/-(\w[-\w]+)\.log$/);
+      $spam{$1}++ if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
     }
     for my $file (@ham) {
-      $ham{$1}++ if ($file =~ m/-(\w[-\w]+)\.log$/);
+      $ham{$1}++ if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
     }
     while (my ($user, $count) = each %ham) {
       if ($count > 1) {
@@ -301,15 +317,14 @@ sub gen_class {
     return;
   }
 
-  my $time = $time{$ham[0]};        # use the ham file's time
+  my $time = $revision_date{$rev};
   my $dir = create_outputdir($rev, $time);
 
   my $fname = "$dir/$class.$age";
-
   # now, if the target file already exists, check to see if it's newer
   # than all the sources, make-style
   if (-f $fname) {
-    my $targetfreshness = (-M $fname);
+    my $targetfreshness = (-M $fname) + (6*3600);
     my $needsrebuild = 0;
 
     foreach my $srcfile (@spam, @ham) {
@@ -347,6 +362,12 @@ sub gen_class {
                     warn "cannot rename $zf.$$ to $zf";
       $bytes += (-s $zf);
     }
+    # reduce the number of times that the raw logs are copied over
+    my $tmpfname = "$fname.$$";
+    open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname";
+    print OUT "$$ : $time_start";
+    close(OUT);
+    rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";
   }
   else {
     my $tmpfname = "$fname.$$";
@@ -378,10 +399,10 @@ sub gen_class {
       my @output;
       
       for my $file (@spam) {
-        $spam{$1} = $file if ($file =~ m/-(\w[-\w]+)\.log$/);
+        $spam{$1} = $file if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
       }
       for my $file (@ham) {
-        $ham{$1} = $file if ($file =~ m/-(\w[-\w]+)\.log$/);
+        $ham{$1} = $file if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
       }
       unlink "$opt{tmp}/ham.log.$$";
       unlink "$opt{tmp}/spam.log.$$";
@@ -423,7 +444,7 @@ sub gen_class {
           for my $file (@array) {
             open(IN, $file) or warn "cannot read $file";
             while (<IN>) {
-              print TMP $_ if time_filter($after, $before);
+              print TMP $_ if time_filter($time, $after, $before);
             }
             close(IN);
           }