You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2005/10/25 02:51:37 UTC

svn commit: r328203 - /spamassassin/trunk/masses/rule-qa/corpus-hourly

Author: jm
Date: Mon Oct 24 17:51:35 2005
New Revision: 328203

URL: http://svn.apache.org/viewcvs?rev=328203&view=rev
Log:
add LOGS gzipping

Modified:
    spamassassin/trunk/masses/rule-qa/corpus-hourly

Modified: spamassassin/trunk/masses/rule-qa/corpus-hourly
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/rule-qa/corpus-hourly?rev=328203&r1=328202&r2=328203&view=diff
==============================================================================
--- spamassassin/trunk/masses/rule-qa/corpus-hourly (original)
+++ spamassassin/trunk/masses/rule-qa/corpus-hourly Mon Oct 24 17:51:35 2005
@@ -14,6 +14,7 @@
 
 
 use File::Path;
+use File::Copy;
 use Time::ParseDate;
 use Cwd;
 use POSIX qw(nice strftime);
@@ -317,116 +318,133 @@
     }
   }
 
-  my $tmpfname = "$fname.$$";
-  open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname";
-  print OUT "# ham results used for $rev $class $age: " . join(" ", @ham) . "\n";
-  print OUT "# spam results used for $rev $class $age: " . join(" ", @spam) . "\n";
-  for (@ham) {
-    print OUT "# $_ was at r$revision{$_}\n";
-  }
-  for (@spam) {
-    print OUT "# $_ was at r$revision{$_}\n";
-  }
-
-  push (@tmps, $tmpfname);
-
   my $when = scalar localtime time;
   print qq{creating: $fname
-started $when...
-};
-
-  my $flags = "";
-  $flags = "-t net -s 1" if $class eq "NET";
-  $flags = "-M HTML_MESSAGE" if $class eq "HTML";
-  $flags = "-o" if $class eq "OVERLAP";
-  if ($opt{rules_dir}) {
-    $flags .= " -c '$opt{rules_dir}'";
+  started $when...
+  };
+  my $bytes = 0;
+
+  if ($class eq 'LOGS') {
+    foreach my $f (@ham, @spam) {
+      $f =~ s/[^-_A-Za-z0-9]+/_/gs;    # sanitize!
+
+      system("gzip -c < $f > $fname-$f.gz.$$");
+      if ($? >> 8 != 0) {
+        warn "gzip -c < $f > $fname-$f.gz.$$ failed";
+      }
+
+      rename("$fname-$f.gz.$$", "$fname-$f.gz") or
+                    warn "cannot rename $fname-$f.gz.$$ to $fname-$f.gz";
+      $bytes += (-s "$fname-$f");
+    }
   }
+  else {
+    my $tmpfname = "$fname.$$";
 
-  if ($age eq "all") {
-    my %spam;
-    my %ham;
-    my @output;
-    
-    for my $file (@spam) {
-      $spam{$1} = $file if ($file =~ m/-(\w[-\w]+)\.log$/);
+    open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname";
+    print OUT "# ham results used for $rev $class $age: " . join(" ", @ham) . "\n";
+    print OUT "# spam results used for $rev $class $age: " . join(" ", @spam) . "\n";
+    for (@ham) {
+      print OUT "# $_ was at r$revision{$_}\n";
     }
-    for my $file (@ham) {
-      $ham{$1} = $file if ($file =~ m/-(\w[-\w]+)\.log$/);
+    for (@spam) {
+      print OUT "# $_ was at r$revision{$_}\n";
     }
-    unlink "$opt{tmp}/ham.log.$$";
-    unlink "$opt{tmp}/spam.log.$$";
 
-    if (scalar keys %spam <= 0 || scalar keys %ham <= 0) {
-      warn "no files found for $class.$age";
-      return;
+    push (@tmps, $tmpfname);
+
+    my $flags = "";
+    $flags = "-t net -s 1" if $class eq "NET";
+    $flags = "-M HTML_MESSAGE" if $class eq "HTML";
+    $flags = "-o" if $class eq "OVERLAP";
+    if ($opt{rules_dir}) {
+      $flags .= " -c '$opt{rules_dir}'";
     }
 
-    chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
-    for my $user (sort keys %spam) {
-      next unless $ham{$user};
-      system("cat $corpusdir/$ham{$user} >> $opt{tmp}/ham.log.$$");
-      system("cat $corpusdir/$spam{$user} >> $opt{tmp}/spam.log.$$");
-      open(IN, "./hit-frequencies -xpa $flags $corpusdir/$spam{$user} $corpusdir/$ham{$user} |");
+    if ($age eq "all") {
+      my %spam;
+      my %ham;
+      my @output;
+      
+      for my $file (@spam) {
+        $spam{$1} = $file if ($file =~ m/-(\w[-\w]+)\.log$/);
+      }
+      for my $file (@ham) {
+        $ham{$1} = $file if ($file =~ m/-(\w[-\w]+)\.log$/);
+      }
+      unlink "$opt{tmp}/ham.log.$$";
+      unlink "$opt{tmp}/spam.log.$$";
+
+      if (scalar keys %spam <= 0 || scalar keys %ham <= 0) {
+        warn "no files found for $class.$age";
+        return;
+      }
+
+      chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
+      for my $user (sort keys %spam) {
+        next unless $ham{$user};
+        system("cat $corpusdir/$ham{$user} >> $opt{tmp}/ham.log.$$");
+        system("cat $corpusdir/$spam{$user} >> $opt{tmp}/spam.log.$$");
+        open(IN, "./hit-frequencies -xpa $flags $corpusdir/$spam{$user} $corpusdir/$ham{$user} |");
+        while(<IN>) {
+          chomp;
+          push @output, "$_:$user\n";
+        }
+        close(IN);
+      }
+      open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
       while(<IN>) {
-        chomp;
-        push @output, "$_:$user\n";
+        push @output, $_;
       }
       close(IN);
+      for (sort sort_all @output) { print OUT; }
     }
-    open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
-    while(<IN>) {
-      push @output, $_;
-    }
-    close(IN);
-    for (sort sort_all @output) { print OUT; }
-  }
-  elsif ($age eq "age") {
-    my @output;
-
-    for my $which (("0-1", "1-2", "2-3", "3-6")) {
-      my ($after, $before) = split(/-/, $which);
-      # get and filter logs
-      chdir $corpusdir;
-      for my $type (("ham", "spam")) {
-        open(TMP, "> $opt{tmp}/$type.log.$$");
-        my @array = ($type eq "ham") ? @ham : @spam;
-        for my $file (@array) {
-          open(IN, $file) or warn "cannot read $file";
-          while (<IN>) {
-            print TMP $_ if time_filter($after, $before);
+    elsif ($age eq "age") {
+      my @output;
+
+      for my $which (("0-1", "1-2", "2-3", "3-6")) {
+        my ($after, $before) = split(/-/, $which);
+        # get and filter logs
+        chdir $corpusdir;
+        for my $type (("ham", "spam")) {
+          open(TMP, "> $opt{tmp}/$type.log.$$");
+          my @array = ($type eq "ham") ? @ham : @spam;
+          for my $file (@array) {
+            open(IN, $file) or warn "cannot read $file";
+            while (<IN>) {
+              print TMP $_ if time_filter($after, $before);
+            }
+            close(IN);
           }
-          close(IN);
+          close (TMP);
         }
-        close (TMP);
+        # print out by age
+        chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
+        open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
+        while(<IN>) {
+          chomp;
+          push @output, "$_:$which\n";
+        }
+        close(IN);
       }
-      # print out by age
+      for (sort sort_all @output) { print OUT; }
+    }
+    elsif (@ham && @spam) {
+      # get logs
+      system("cat " . join(" ", @ham) . " > $opt{tmp}/ham.log.$$");
+      system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$");
+
       chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
       open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
-      while(<IN>) {
-        chomp;
-        push @output, "$_:$which\n";
-      }
+      while(<IN>) { print(OUT); }
       close(IN);
     }
-    for (sort sort_all @output) { print OUT; }
-  }
-  elsif (@ham && @spam) {
-    # get logs
-    system("cat " . join(" ", @ham) . " > $opt{tmp}/ham.log.$$");
-    system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$");
 
-    chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
-    open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
-    while(<IN>) { print(OUT); }
-    close(IN);
+    $bytes = (-s OUT);
+    close(OUT);
+    rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";
   }
 
-  my $bytes = (-s OUT);
-  close(OUT);
-
-  rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";
-
   $when = scalar localtime time;
   print qq{created: $bytes bytes, finished at $when
 URL:
@@ -444,7 +462,6 @@
 
   # print "output dir: $dir\n";
   if (!-d $dir) {
-    my $mode = oct($opt{html_mode});
     my $prevu = umask 0;
     mkpath([$dir], 0, oct($opt{html_mode})) or warn "failed to mkdir $dir";
     umask $prevu;