You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2005/11/02 05:16:42 UTC

svn commit: r330188 - in /spamassassin/trunk: MANIFEST build/automc/buildbot_ready build/automc/run_preflight masses/hit-frequencies masses/plugins/ masses/plugins/01_rule_timing.cf masses/plugins/HitFreqsRuleTiming.pm masses/rule-qa/corpus-hourly

Author: jm
Date: Tue Nov  1 20:16:37 2005
New Revision: 330188

URL: http://svn.apache.org/viewcvs?rev=330188&view=rev
Log:
add rule timing to hit-frequencies, -T switch.  Thanks to John Gardiner Myers for the code

Added:
    spamassassin/trunk/masses/plugins/
    spamassassin/trunk/masses/plugins/01_rule_timing.cf
    spamassassin/trunk/masses/plugins/HitFreqsRuleTiming.pm
Modified:
    spamassassin/trunk/MANIFEST
    spamassassin/trunk/build/automc/buildbot_ready
    spamassassin/trunk/build/automc/run_preflight
    spamassassin/trunk/masses/hit-frequencies
    spamassassin/trunk/masses/rule-qa/corpus-hourly

Modified: spamassassin/trunk/MANIFEST
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/MANIFEST?rev=330188&r1=330187&r2=330188&view=diff
==============================================================================
--- spamassassin/trunk/MANIFEST (original)
+++ spamassassin/trunk/MANIFEST Tue Nov  1 20:16:37 2005
@@ -140,6 +140,8 @@
 masses/overlap
 masses/parse-rules-for-masses
 masses/perceptron.c
+masses/plugins/01_rule_timing.cf
+masses/plugins/HitFreqsRuleTiming.pm
 masses/post-ga-analysis.pl
 masses/remove-ids-from-mclog
 masses/rewrite-cf-with-new-scores

Modified: spamassassin/trunk/build/automc/buildbot_ready
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/build/automc/buildbot_ready?rev=330188&r1=330187&r2=330188&view=diff
==============================================================================
--- spamassassin/trunk/build/automc/buildbot_ready (original)
+++ spamassassin/trunk/build/automc/buildbot_ready Tue Nov  1 20:16:37 2005
@@ -84,7 +84,7 @@
   chdir("masses") or die;
 
   print "FAST FREQS REPORT:\n\n";
-  system ("$perl hit-frequencies -c tstrules -x -p -s 0");
+  system ("$perl hit-frequencies -c tstrules -x -p -T -s 0");
 
   print "\n\nBUILDING SLOW FREQS REPORT:\n\n";
 

Modified: spamassassin/trunk/build/automc/run_preflight
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/build/automc/run_preflight?rev=330188&r1=330187&r2=330188&view=diff
==============================================================================
--- spamassassin/trunk/build/automc/run_preflight (original)
+++ spamassassin/trunk/build/automc/run_preflight Tue Nov  1 20:16:37 2005
@@ -8,8 +8,6 @@
   die "no perl path found in ARGV!";
 }
 
-chdir "masses" or die;
-
 my $slavename = "generic";
 
 my $pwd = `pwd`;
@@ -22,11 +20,16 @@
 #
 system ("renice +19 $$");
 
-# just the sandbox rules
+# cd to masses
+#
+chdir "masses" or die;
+
+# just the sandbox rules and the timing plugin
 #
 system ("rm -rf tstrules");
 run "mkdir tstrules";
 run "cp ../rules/70_sandbox.cf tstrules";
+run "cp plugins/*.* tstrules";
 
 # this is run in a chroot jail, just in case there's hostile
 # rule code in there...

Modified: spamassassin/trunk/masses/hit-frequencies
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/hit-frequencies?rev=330188&r1=330187&r2=330188&view=diff
==============================================================================
--- spamassassin/trunk/masses/hit-frequencies (original)
+++ spamassassin/trunk/masses/hit-frequencies Tue Nov  1 20:16:37 2005
@@ -19,16 +19,16 @@
 use strict;
 use FindBin;
 use Getopt::Std;
-getopts("fm:M:X:l:L:pxhc:at:s:io");
+getopts("fm:M:X:l:L:pxhc:at:s:ioT");
 
 use vars qw {
   $opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c
-  $opt_a $opt_t $opt_s $opt_i $sorting $opt_o 
+  $opt_a $opt_t $opt_s $opt_i $sorting $opt_o $opt_T
 };
 
 sub usage {
   die "hit-frequencies [-c rules dir] [-f] [-m RE] [-M RE] [-X RE] [-l LC]
-                [-s SC] [-a] [-p] [-x] [-i] [-o] [spam log] [ham log]
+                [-s SC] [-a] [-p] [-x] [-i] [-T] [-o] [spam log] [ham log]
 
     -c p   use p as the rules directory
     -f     falses. count only false-negative or false-positive matches
@@ -43,6 +43,7 @@
     -x     extended output, with S/O ratio and scores
     -s SC  which scoreset to use
     -i     use IG (information gain) for ranking
+    -T     display rule times. implies -x, -p
     -o     display hit overlaps against all other rules
 
     options -l and -L are mutually exclusive.
@@ -80,6 +81,8 @@
 my %ranking = ();
 my $ok_lang = '';
 
+my %rule_times = ();
+
 readscores($cffile);
 
 $ok_lang = lc ($opt_l || $opt_L || '');
@@ -111,13 +114,29 @@
 my $sorting = $opt_i ? "IG" : "RANK";
 
 if ($opt_p) {
-  if ($opt_f) {
-    printf "%7s %7s %7s  %6s  %6s  %6s  %s\n",
-  	"OVERALL%", "FNEG%", "FPOS%", "S/O", $sorting, "SCORE", "NAME";
-  } else {
-    printf "%7s %7s  %7s  %6s  %6s  %6s  %s\n",
-  	"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
-  }
+  printf "%7s  %7s  %7s  %6s  %6s  %6s  %s\n",
+  	"MSECS", $opt_f?"FNEG%":"SPAM%", $opt_f?"FPO%":"HAM%",
+        "S/O", $sorting, "SCORE", "NAME";
+
+  printf "%7d  %7d  %7d  %7.3f %6.2f  %6.2f  (all messages)\n",
+  	0, $hdr_spam, $hdr_ham,
+        soratio ($num_spam,$num_ham), 0, 0;
+
+  $hdr_all ||= 0.00001;     # avoid div by 0 in the next 2 statements
+  $hdr_spam = ($num_spam / $hdr_all) * 100.0;
+  $hdr_ham = ($num_ham / $hdr_all) * 100.0;
+  $hdr_all = 100.0;             # this is obvious
+
+  printf "%7.5f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  (all messages as %%)\n",
+  	0, $hdr_spam, $hdr_ham,
+        soratio ($num_spam,$num_ham), 0, 0;
+
+}
+elsif ($opt_p) {
+  printf "%8s %7s  %7s  %6s  %6s  %6s  %s\n",
+  	"OVERALL%", $opt_f?"FNEG%":"SPAM%", $opt_f?"FPO%":"HAM%",
+        "S/O", $sorting, "SCORE", "NAME";
+
   printf "%7d  %7d  %7d  %7.3f %6.2f  %6.2f  (all messages)\n",
   	$hdr_all, $hdr_spam, $hdr_ham,
         soratio ($num_spam,$num_ham), 0, 0;
@@ -131,7 +150,8 @@
   	$hdr_all, $hdr_spam, $hdr_ham,
         soratio ($num_spam,$num_ham), 0, 0;
 
-} elsif ($opt_x) {
+}
+elsif ($opt_x) {
   printf "%7s %7s  %7s  %6s  %6s %6s  %s\n",
   	"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
   printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  (all messages)\n",
@@ -291,6 +311,10 @@
   }
 }
 
+if ($opt_T) {
+  read_timings();
+}
+
 foreach $test (sort { $ranking{$b} <=> $ranking{$a} } @tests) {
   next unless (exists $rules{$test});           # only valid tests
   next if (!$opt_a && $rules{$test}->{issubrule});
@@ -333,7 +357,13 @@
     $soratio{$test} = soratio ($fsadj, $fnadj);
   }
 
-  if ($opt_p) {
+  if ($opt_T) {
+    printf "%7.5f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  %s\n",
+  	$rule_times{$test}||0, $fs, $fn, $soratio, $ranking{$test},
+        $scores{$test}||0,
+        $test;
+
+  } elsif ($opt_p) {
     printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  %s\n",
   	$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}||0, $test;
 
@@ -570,5 +600,25 @@
   } else {
       return 0.5;		# no results -> not effective
   }
+}
+
+sub read_timings {
+  if (!open (IN, "<timing.log")) {
+    warn "hit-frequencies: cannot read 'timing.log', timings will be 0";
+    return;
+  }
+  my $ver = <IN>;
+  if ($ver !~ /^v1/) {
+    warn "hit-frequencies: unknown version in 'timing.log', timings will be 0";
+    close IN;
+    return;
+  }
+  while (<IN>) {
+    if (/^T\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/) {
+      my ($name, $duration, $max, $runs) = ($1,$2,$3,$4);
+      $rule_times{$name} = ($duration / ($runs||0.00001)) * 1000;
+    }
+  }
+  close IN;
 }
 

Added: spamassassin/trunk/masses/plugins/01_rule_timing.cf
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/plugins/01_rule_timing.cf?rev=330188&view=auto
==============================================================================
--- spamassassin/trunk/masses/plugins/01_rule_timing.cf (added)
+++ spamassassin/trunk/masses/plugins/01_rule_timing.cf Tue Nov  1 20:16:37 2005
@@ -0,0 +1,22 @@
+# config file to load the timing plugin
+#
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+#
+###########################################################################
+
+loadplugin HitFreqsRuleTiming HitFreqsRuleTiming.pm
+

Added: spamassassin/trunk/masses/plugins/HitFreqsRuleTiming.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/plugins/HitFreqsRuleTiming.pm?rev=330188&view=auto
==============================================================================
--- spamassassin/trunk/masses/plugins/HitFreqsRuleTiming.pm (added)
+++ spamassassin/trunk/masses/plugins/HitFreqsRuleTiming.pm Tue Nov  1 20:16:37 2005
@@ -0,0 +1,102 @@
+# HitFreqsRuleTiming - SpamAssassin rule timing plugin
+# (derived from attachment 3055 on bug 4517)
+#
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+
+package HitFreqsRuleTiming;
+
+use Mail::SpamAssassin::Plugin;
+use Mail::SpamAssassin::Logger;
+use strict;
+use warnings;
+
+use Time::HiRes qw(gettimeofday tv_interval);
+
+use vars qw(@ISA);
+@ISA = qw(Mail::SpamAssassin::Plugin);
+
+sub new {
+    my $class = shift;
+    my $mailsaobject = shift;
+
+    $class = ref($class) || $class;
+    my $self = $class->SUPER::new($mailsaobject);
+    $mailsaobject->{rule_timing} = {
+      duration => { },
+      runs => { },
+      max => { },
+    };
+    bless ($self, $class);
+}
+
+sub start_rules {
+    my ($self, $options) = @_;
+
+    $options->{permsgstatus}->{RuleTimingStart} = [gettimeofday()];
+}
+
+sub ran_rule {
+    my @now = gettimeofday();
+    my ($self, $options) = @_;
+
+    my $permsg = $options->{permsgstatus};
+    my $mailsa = $permsg->{main};
+    my $name = $options->{rulename};
+
+    my $duration = tv_interval($permsg->{RuleTimingStart}, \@now);
+    @{$permsg->{RuleTimingStart}} = @now;
+
+    unless ($mailsa->{rule_timing}{duration}{$name}) {
+        $mailsa->{rule_timing}{duration}{$name} = 0;
+        $mailsa->{rule_timing}{max}{$name} = 0;
+    }
+
+    # TODO: record all runs and compute std dev
+
+    $mailsa->{rule_timing}{runs}{$name}++;
+    $mailsa->{rule_timing}{duration}{$name} += $duration;
+    $mailsa->{rule_timing}{max}{$name} = $duration
+        if $duration > $mailsa->{rule_timing}{max}{$name};
+}
+
+sub finish {
+    my $self = shift;
+    my $mailsa = $self->{main};
+
+    # take a ref to speed up the sorting
+    my $dur_ref = $mailsa->{rule_timing}{duration};
+
+    my $s = '';
+    foreach my $rule (sort {
+        $dur_ref->{$b} <=> $dur_ref->{$a}
+      } keys %{$dur_ref})
+    {
+        $s .= sprintf "T %30s %8.3f %8.3f %4d\n", $rule,
+            $mailsa->{rule_timing}{duration}->{$rule},
+            $mailsa->{rule_timing}{max}->{$rule},
+            $mailsa->{rule_timing}{runs}->{$rule};
+    }
+
+    open (OUT, ">timing.log") or warn "cannot write to timing.log";
+    print OUT "v1\n";       # forward compatibility
+    print OUT $s;
+    close OUT or warn "cannot write to timing.log";
+
+    $self->SUPER::finish();
+}
+
+1;

Modified: spamassassin/trunk/masses/rule-qa/corpus-hourly
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/rule-qa/corpus-hourly?rev=330188&r1=330187&r2=330188&view=diff
==============================================================================
--- spamassassin/trunk/masses/rule-qa/corpus-hourly (original)
+++ spamassassin/trunk/masses/rule-qa/corpus-hourly Tue Nov  1 20:16:37 2005
@@ -386,14 +386,14 @@
         next unless $ham{$user};
         system("cat $corpusdir/$ham{$user} >> $opt{tmp}/ham.log.$$");
         system("cat $corpusdir/$spam{$user} >> $opt{tmp}/spam.log.$$");
-        open(IN, "./hit-frequencies -xpa $flags $corpusdir/$spam{$user} $corpusdir/$ham{$user} |");
+        open(IN, "./hit-frequencies -Txpa $flags $corpusdir/$spam{$user} $corpusdir/$ham{$user} |");
         while(<IN>) {
           chomp;
           push @output, "$_:$user\n";
         }
         close(IN);
       }
-      open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
+      open(IN, "./hit-frequencies -Txpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
       while(<IN>) {
         push @output, $_;
       }
@@ -421,7 +421,7 @@
         }
         # print out by age
         chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
-        open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
+        open(IN, "./hit-frequencies -Txpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
         while(<IN>) {
           chomp;
           push @output, "$_:$which\n";
@@ -436,7 +436,7 @@
       system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$");
 
       chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
-      open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
+      open(IN, "./hit-frequencies -Txpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
       while(<IN>) { print(OUT); }
       close(IN);
     }