You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2007/07/31 14:53:22 UTC

svn commit: r561316 - /spamassassin/trunk/masses/rule-qa/corpus-hourly

Author: jm
Date: Tue Jul 31 05:53:18 2007
New Revision: 561316

URL: http://svn.apache.org/viewvc?view=rev&rev=561316
Log:
Impose a global maximum message age on all mass-check logs processed by the rule-QA scripts: 5 years for ham, 6 months for spam.

Modified:
    spamassassin/trunk/masses/rule-qa/corpus-hourly

Modified: spamassassin/trunk/masses/rule-qa/corpus-hourly
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-qa/corpus-hourly?view=diff&rev=561316&r1=561315&r2=561316
==============================================================================
--- spamassassin/trunk/masses/rule-qa/corpus-hourly (original)
+++ spamassassin/trunk/masses/rule-qa/corpus-hourly Tue Jul 31 05:53:18 2007
@@ -33,6 +33,13 @@
 # http://wiki.apache.org/spamassassin/DateRev for more details.
 use constant DATEREV_ADJ => - (8 * 60 * 60);
 
+# what's the max age of mail we will accept data from? (in weeks)
+# TODO: maybe this should be in ~/.corpus
+my $OLDEST_HAM_WEEKS    = 52 * 5;       # 5 years
+my $OLDEST_SPAM_WEEKS    = 6 * 4;       # 6 months
+
+# ---------------------------------------------------------------------------
+
 my $configuration = "$ENV{HOME}/.corpus";
 my %opt;
 my %revision = ();
@@ -238,15 +245,24 @@
   return $n;
 }
 
-sub time_filter {
-  my ($after, $before) = @_;
-  if (/time=(\d+)/) {
-	return (($time_start - $1 >= WEEK * $after) &&
-		($time_start - $1 < WEEK * $before));
+sub time_filter_fileset {
+  my ($fileary, $outname, $after, $before) = @_;
+  open(TMP, "> $outname") or warn "cannot write $outname";
+  for my $file (@{$fileary}) {
+    open(IN, $file) or warn "cannot read $file";
+
+    while (<IN>) {
+      if (/time=(\d+)/) {
+        print TMP if (($time_start - $1 >= WEEK * $after) &&
+                      ($time_start - $1 < WEEK * $before));
+      }
+    }
+    close(IN);
   }
-  return 0;
+  close (TMP);
 }
 
+
 sub current {
   my $classes = $opt{output_classes};
   $classes ||= "DETAILS.new DETAILS.all DETAILS.age HTML.new HTML.all HTML.age NET.new NET.all NET.age";
@@ -406,20 +422,29 @@
       chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
       for my $user (sort keys %spam) {
         next unless $ham{$user};
-        system("cat $corpusdir/$ham{$user} >> $opt{tmp}/ham.log.$$");
-        system("cat $corpusdir/$spam{$user} >> $opt{tmp}/spam.log.$$");
-        open(IN, "./hit-frequencies -TxpaP $flags $corpusdir/$spam{$user} $corpusdir/$ham{$user} |");
+
+        time_filter_fileset([ "$corpusdir/$ham{$user}" ],
+                "$opt{tmp}/ham.log.$$", $OLDEST_HAM_WEEKS, 0);
+        time_filter_fileset([ "$corpusdir/$spam{$user}" ],
+                "$opt{tmp}/spam.log.$$", $OLDEST_SPAM_WEEKS, 0);
+
+        open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
         while(<IN>) {
           chomp;
           push @output, "$_:$user\n";
         }
         close(IN);
+
+        system("cat $opt{tmp}/ham.log.$$ >> $opt{tmp}/hamall.log.$$");
+        system("cat $opt{tmp}/spam.log.$$ >> $opt{tmp}/spamall.log.$$");
       }
-      open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
+
+      open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spamall.log.$$ $opt{tmp}/hamall.log.$$ |");
       while(<IN>) {
         push @output, $_;
       }
       close(IN);
+
       for (sort sort_all @output) { print OUT; }
     }
     elsif ($age eq "age") {
@@ -429,18 +454,9 @@
         my ($after, $before) = split(/-/, $which);
         # get and filter logs
         chdir $corpusdir;
-        for my $type (("ham", "spam")) {
-          open(TMP, "> $opt{tmp}/$type.log.$$");
-          my @array = ($type eq "ham") ? @ham : @spam;
-          for my $file (@array) {
-            open(IN, $file) or warn "cannot read $file";
-            while (<IN>) {
-              print TMP $_ if time_filter($after, $before);
-            }
-            close(IN);
-          }
-          close (TMP);
-        }
+        time_filter_fileset(\@ham, "$opt{tmp}/ham.log.$$", $after, $before);
+        time_filter_fileset(\@spam, "$opt{tmp}/spam.log.$$", $after, $before);
+
         # print out by age
         chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
         open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
@@ -454,8 +470,8 @@
     }
     elsif (@ham && @spam) {
       # get logs
-      system("cat " . join(" ", @ham) . " > $opt{tmp}/ham.log.$$");
-      system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$");
+      time_filter_fileset(\@ham, "$opt{tmp}/ham.log.$$", $OLDEST_HAM_WEEKS, 0);
+      time_filter_fileset(\@spam, "$opt{tmp}/spam.log.$$", $OLDEST_SPAM_WEEKS, 0);
 
       chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
       open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
@@ -558,4 +574,3 @@
     warn "'$cmd' failed";
   }
 }
-