You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by du...@apache.org on 2006/12/03 21:49:55 UTC
svn commit: r481885 - in /spamassassin/trunk/masses: extract-results fp-fn-statistics logs-to-c mk-baseline-results runGA

Author: duncf
Date: Sun Dec  3 12:49:54 2006
New Revision: 481885

URL: http://svn.apache.org/viewvc?view=rev&rev=481885
Log:
Split the --count functionality out of logs-to-c and put it in
fp-fn-statistics. Clean both scripts up, make them both run with "use
strict". Correct other scripts which call either of them.

Modified:
    spamassassin/trunk/masses/extract-results
    spamassassin/trunk/masses/fp-fn-statistics
    spamassassin/trunk/masses/logs-to-c
    spamassassin/trunk/masses/mk-baseline-results
    spamassassin/trunk/masses/runGA

Modified: spamassassin/trunk/masses/extract-results
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/extract-results?view=diff&rev=481885&r1=481884&r2=481885
==============================================================================
--- spamassassin/trunk/masses/extract-results (original)
+++ spamassassin/trunk/masses/extract-results Sun Dec  3 12:49:54 2006
@@ -1,7 +1,7 @@
 #!/usr/bin/perl
 
-# This script extracts the confusion matrix (tp, tn, fp, fn) from the output
-# of the logs-to-c program.
+# This script extracts the confusion matrix (tp, tn, fp, fn) from the
+# output of the fp-fn-statistics program.
 #
 # This is used by the validate-model script to aggregate the results of a
 # cross validation for analysis with the compare-models script.

Modified: spamassassin/trunk/masses/fp-fn-statistics
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/fp-fn-statistics?view=diff&rev=481885&r1=481884&r2=481885
==============================================================================
--- spamassassin/trunk/masses/fp-fn-statistics (original)
+++ spamassassin/trunk/masses/fp-fn-statistics Sun Dec  3 12:49:54 2006
@@ -1,3 +1,181 @@
-#!/bin/sh
+#!/usr/bin/perl -w
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+
+use Getopt::Long;
+use strict;
+
+use vars qw($opt_cffile $opt_lambda $opt_threshold $opt_scoreset
+	    $opt_spam $opt_ham $opt_fplog $opt_fnlog);
+
+$opt_cffile = "../rules";
+$opt_threshold = 5;
+$opt_spam = 'spam.log';
+$opt_ham = 'ham.log';
+$opt_scoreset = 0;
+
+GetOptions("cffile=s", "lambda=f", "threshold=f", "spam=s",
+	   "ham=s", "scoreset=i", "fplog=s", "fnlog=s");
+
+# If desired, report false positives and false negatives for analysis; a log that was asked for must open
+if (defined $opt_fnlog) { open (FNLOG, '>', $opt_fnlog) || die "Could not open file '$opt_fnlog': $!"; }
+if (defined $opt_fplog) { open (FPLOG, '>', $opt_fplog) || die "Could not open file '$opt_fplog': $!"; }
+
+# lambda value for TCR equation, representing the cost of an FP vs. the
+# cost of a FN.  Some example values are: 1 = tagged only, 9 = mailed back
+# to sender asking for token, 999 = blocking or deleting a message.
+#
+# We roughly aim for a value representing "moved to infrequently-read folder".
+
+my $lambda = 50;
+if ($opt_lambda) { $lambda = $opt_lambda; }
+
+use vars qw(%scores %allrules %rules);
+
+readscores();
+
+print "Reading per-message hit stat logs and scores...\n";
+my ($num_spam, $num_ham);
+my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore);
+
+readlogs();
+
+evaluate();
+
+# show memory usage before we exit
+# print "Running \"ps aux\"...\n";
+# open(PS, "ps aux|");
+# while(<PS>) {
+# print if $. == 1 || /\b$$\b/;
+# }
+# close(PS);
+
+exit 0;
+
+# Score one log line and file it in the confusion matrix.
+# arguments are $isspam, $count (unused), \@tests, $msgline;
+# misclassified lines are copied to FNLOG/FPLOG when those logs are enabled.
+sub log_line_count {
+  my ($from_spam_log, undef, $hits, $raw_line) = @_;
+
+  my $total = 0;
+  foreach my $rule (@$hits) {
+    $total += $scores{$rule};
+  }
+  my $flagged = ($total >= $opt_threshold);
+
+  if ($from_spam_log && $flagged) {    # spam, caught
+    $num_spam++;
+    $ga_yy++;
+    $yyscore += $total;
+  }
+  elsif ($from_spam_log) {             # spam, missed: false negative
+    $num_spam++;
+    $ga_yn++;
+    $ynscore += $total;
+    print FNLOG $raw_line if defined $opt_fnlog;
+  }
+  elsif ($flagged) {                   # ham, flagged: false positive
+    $num_ham++;
+    $ga_ny++;
+    $nyscore += $total;
+    print FPLOG $raw_line if defined $opt_fplog;
+  }
+  else {                               # ham, passed
+    $num_ham++;
+    $ga_nn++;
+    $nnscore += $total;
+  }
+}
+
+# Read the spam log, then the ham log, and tally every usable line with
+# log_line_count().  Resets all global counters and score sums first.
+# Dies if either log file cannot be opened.
+sub readlogs {
+  my $count = 0;
+  $num_spam = $num_ham = 0;
+  $ga_yy = $ga_ny = $ga_yn = $ga_nn = 0;
+  $yyscore = $ynscore = $nyscore = $nnscore = 0.0;
+
+  foreach my $file ($opt_spam, $opt_ham) {
+    # 3-arg open: the file name comes from the command line, so never let
+    # its leading/trailing characters be interpreted as an open mode
+    open (my $in, '<', $file) || die "Could not open file '$file': $!";
+    my $isspam = ($file eq $opt_spam);
+
+    while (defined(my $msgline = <$in>)) {
+      # 1st field is the caught flag, 4th is the comma-separated rule list
+      my ($caught, undef, undef, $rules) = split(' ', $msgline);
+
+      # only take lines starting with Y or . (blank lines yield no fields)
+      next unless defined $caught && ($caught eq 'Y' || $caught eq '.') && $rules;
+
+      # get tests, but ignore unknown tests and subrules
+      my @tests = grep { defined $scores{$_} && !$allrules{$_}->{issubrule} }
+	split(/,/, $rules);
+
+      # run handler; $count is the 0-based index of the accepted line
+      log_line_count($isspam, $count, \@tests, $msgline);
+      # increment line
+      $count++;
+    }
+    close $in;
+  }
+}
+
+sub readscores {  # runs parse-rules-for-masses, then loads ./tmp/rules.pl (presumably its output -- confirm)
+  print "Reading scores from \"$opt_cffile\"...\n";
+  system ("../build/parse-rules-for-masses", "-d", $opt_cffile, "-s", $opt_scoreset) and die "parse-rules-for-masses failed (wait status $?)\n";  # list form: no shell, so any path is passed verbatim
+  require "./tmp/rules.pl";
+  %allrules = %rules;           # ensure it stays global
+}
+
+# Print the confusion-matrix summary for $opt_threshold, followed by the
+# TCR, spam recall and spam precision.  Reads the global counters filled
+# in by readlogs(); an empty class or zero flagged mails report as 0%.
+sub evaluate {
+  # percentage that is safe when the denominator is 0 (e.g. an empty log)
+  my $pct = sub { $_[1] ? ($_[0] / $_[1]) * 100.0 : 0 };
+
+  printf ("\n# SUMMARY for threshold %3.1f:\n", $opt_threshold);
+  printf "# Correctly non-spam: %6d  %4.2f%%\n", $ga_nn, $pct->($ga_nn, $num_ham);
+  printf "# Correctly spam:     %6d  %4.2f%%\n", $ga_yy, $pct->($ga_yy, $num_spam);
+  printf "# False positives:    %6d  %4.2f%%\n", $ga_ny, $pct->($ga_ny, $num_ham);
+  printf "# False negatives:    %6d  %4.2f%%\n", $ga_yn, $pct->($ga_yn, $num_spam);
+
+  # convert to the TCR metrics used in the published lit
+  my $nspamspam = $ga_yy;     # spam flagged as spam
+  my $nspamlegit = $ga_yn;    # spam passed as legit
+  my $nlegitspam = $ga_ny;    # legit flagged as spam
+  my $nlegit = $num_ham;
+  my $nspam = $num_spam;
+
+  # weighted error rate, and the baseline rate of passing every message
+  my $cost = $lambda * $nlegit + $nspam;
+  my $werr = $cost ? ($lambda * $nlegitspam + $nspamlegit) / $cost : 0;
+  my $werr_base = $cost ? $nspam / $cost : 0;
+
+  $werr ||= 0.000001;     # avoid / by 0
+  my $tcr = $werr_base / $werr;
+
+  my $sr = $pct->($nspamspam, $nspam);
+  my $sp = $pct->($nspamspam, $nspamspam + $nlegitspam);
+  printf "# TCR(l=%s): %3.6f  SpamRecall: %3.3f%%  SpamPrec: %3.3f%%\n",
+    $lambda, $tcr, $sr, $sp;
+}
 
-exec ./logs-to-c --count $*

Modified: spamassassin/trunk/masses/logs-to-c
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/logs-to-c?view=diff&rev=481885&r1=481884&r2=481885
==============================================================================
--- spamassassin/trunk/masses/logs-to-c (original)
+++ spamassassin/trunk/masses/logs-to-c Sun Dec  3 12:49:54 2006
@@ -18,58 +18,35 @@
 # </@LICENSE>
 
 use Getopt::Long;
-use vars qw($opt_cffile $opt_count $opt_lambda $opt_threshold
-	    $opt_spam $opt_ham $opt_fplog $opt_fnlog);
+use strict;
+use vars qw($opt_cffile $opt_spam $opt_ham $opt_scoreset);
 
 $opt_cffile = "../rules";
-$opt_count = 0;
-$opt_threshold = 5;
 $opt_spam = 'spam.log';
 $opt_ham = 'ham.log';
 $opt_scoreset = 0;
 
-GetOptions("cffile=s", "count", "lambda=f", "threshold=f", "spam=s",
-	   "ham=s", "scoreset=i", "fplog=s", "fnlog=s");
+GetOptions("cffile=s", "spam=s", "ham=s", "scoreset=i");
 
-# If desired, report false positives and false negatives for analysis
-if (defined $opt_fnlog) { open (FNLOG, ">$opt_fnlog"); }
-if (defined $opt_fplog) { open (FPLOG, ">$opt_fplog"); }
-
-my $nybias = 10;
-
-# lambda value for TCR equation, representing the cost of of an FP vs. the
-# cost of a FN.  Some example values are: 1 = tagged only, 9 = mailed back
-# to sender asking for token, 999 = blocking or deleting a message.
-#
-# We roughly aim for a value representing "moved to infrequently-read folder".
-
-my $lambda = 50;
-if ($opt_lambda) { $lambda = $opt_lambda; }
-
-my $msgline;
 my $is_spam = '';		# vec aligned with @tests_hit
 my @tests_hit = ();
 my %mutable_tests = ();
 
-use vars qw(%rules %allrules);
+use vars qw(%rules %allrules %scores);
+
+my (%ignored_rule, %range_lo, %range_hi);
+my %rule_to_index;
 
 readscores();
 
 print "Reading per-message hit stat logs and scores...\n";
 my ($num_tests, $num_spam, $num_ham);
-my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore);
 
 read_ranges();
 readlogs();
 
-if ($opt_count) {
-  $nybias = $nybias*($num_spam / $num_ham);
-  evaluate();
-}
-else {
-  print "Writing logs and current scores as C code...\n";
-  writescores_c();
-}
+print "Writing logs and current scores as C code...\n";
+writescores_c();
 
 # show memory usage before we exit
 # print "Running \"ps aux\"...\n";
@@ -108,42 +85,6 @@
   return map { $short_to_long[$_] } unpack("w*", $_[0]);
 }
 
-# arguments are $isspam, $count, \@tests
-sub log_line_count {
-  my $score = 0;
-  $score += $scores{$_} for @{$_[2]};
-
-  if ($_[0]) {
-    $num_spam++;
-    if ($score >= $opt_threshold) {
-      $ga_yy++;
-      $yyscore += $score;
-    }
-    else {
-      $ga_yn++;
-      $ynscore += $score;
-      if (defined $opt_fnlog) {
-	print FNLOG $msgline;
-      }
-    }
-  }
-  else {
-    $num_ham++;
-    if ($score >= $opt_threshold) {
-      #print STDERR "FP: $id\n";
-      $ga_ny++;
-      $nyscore += $score;
-      if (defined $opt_fplog) {
-	print FPLOG $msgline;
-      }
-    }
-    else {
-      $ga_nn++;
-      $nnscore += $score;
-    }
-  }
-}
-
 # arguments are $isspam, $count, \@tests;
 sub log_line_code {
   $tests_hit[$_[1]] = freeze_tests($_[2]);
@@ -159,17 +100,11 @@
 }
 
 sub readlogs {
+  my $msgline;
+
   my $count = 0;
   $num_spam = $num_ham = 0;
 
-  if ($opt_count) {
-    $ga_yy = $ga_ny = $ga_yn = $ga_nn = 0;
-    $yyscore = $ynscore = $nyscore = $nnscore = 0.0;
-  }
-
-  # set handler for log lines
-  my $log_line = $opt_count ? \&log_line_count : \&log_line_code;
-
   foreach my $file ($opt_spam, $opt_ham) {
     open (IN, "<$file") || die "Could not open file '$file': $!";
 
@@ -188,7 +123,7 @@
 	split(/,/, $rules);
 
       # run handler
-      $log_line->($isspam, $count, \@tests);
+      log_line_code($isspam, $count, \@tests);
 
       # increment line
       $count++;
@@ -214,11 +149,11 @@
     # jm: now, score-ranges-from-freqs has tflags to work from, so
     # it will always list all mutable tests.
 
-  @index_to_rule = sort {($ignored_rule{$a} <=> $ignored_rule{$b}) ||
+  my @index_to_rule = sort {($ignored_rule{$a} <=> $ignored_rule{$b}) ||
 			  ($mutable_tests{$b} <=> $mutable_tests{$a}) ||
 			   ($a cmp $b)} (keys %scores);
   my $max_hits_per_msg = 0;
-  for ($file = 0; $file < $num_tests; $file++) {
+  for (my $file = 0; $file < $num_tests; $file++) {
     my(@hits) =
      grep {(! $ignored_rule{$_}) && $mutable_tests{$_}} (thaw_tests($tests_hit[$file]));
     if ((scalar(@hits)+1) > $max_hits_per_msg) {
@@ -415,8 +350,6 @@
     }
 
     $ignored_rule{$t} = 0;
-    $index_to_rule[$count] = $t;
-    $count++;
 
     if (!$mut) {
       $mutable_tests{$t} = 0;
@@ -428,7 +361,7 @@
       $mutable_tests{$t} = 1;
     }
     unless ($mutable_tests{$t} || $scores{$t}) {
-      warn "ignoring '$t': immutable and score == 0\n";
+#      warn "ignoring '$t': immutable and score == 0\n";
       $ignored_rule{$t} = 1;
     }
   }
@@ -456,8 +389,6 @@
         $ignored_rule{$t} = 1;
       }
     }
-    $index_to_rule[$count] = $t;
-    $count++;
   }
   foreach my $t (keys %range_lo) {
     next if ($ignored_rule{$t});
@@ -495,39 +426,6 @@
   }
 }
 
-sub evaluate {
-   printf ("\n# SUMMARY for threshold %3.1f:\n", $opt_threshold);
-   printf "# Correctly non-spam: %6d  %4.2f%%\n",
-       $ga_nn, ($ga_nn /  $num_ham) * 100.0;
-   printf "# Correctly spam:     %6d  %4.2f%%\n",
-       $ga_yy, ($ga_yy /  $num_spam) * 100.0;
-   printf "# False positives:    %6d  %4.2f%%\n",
-       $ga_ny, ($ga_ny /  $num_ham) * 100.0;
-   printf "# False negatives:    %6d  %4.2f%%\n",
-       $ga_yn, ($ga_yn /  $num_spam) * 100.0;
-
-  # convert to the TCR metrics used in the published lit
-  my $nspamspam = $ga_yy;
-  my $nspamlegit = $ga_yn;
-  my $nlegitspam = $ga_ny;
-  my $nlegitlegit = $ga_yn;
-  my $nlegit = $num_ham;
-  my $nspam = $num_spam;
-
-  my $werr = ($lambda * $nlegitspam + $nspamlegit)
-                  / ($lambda * $nlegit + $nspam);
-
-  my $werr_base = $nspam
-                  / ($lambda * $nlegit + $nspam);
-
-  $werr ||= 0.000001;     # avoid / by 0
-  my $tcr = $werr_base / $werr;
-
-  my $sr = ($nspamspam / $nspam) * 100.0;
-  my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
-  printf "# TCR(l=%s): %3.6f  SpamRecall: %3.3f%%  SpamPrec: %3.3f%%\n",
-    $lambda, $tcr, $sr, $sp;
-}
 
 __DATA__
 

Modified: spamassassin/trunk/masses/mk-baseline-results
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/mk-baseline-results?view=diff&rev=481885&r1=481884&r2=481885
==============================================================================
--- spamassassin/trunk/masses/mk-baseline-results (original)
+++ spamassassin/trunk/masses/mk-baseline-results Sun Dec  3 12:49:54 2006
@@ -16,10 +16,10 @@
 ) > /dev/null 2>&1
 
 gen_fp_fn_report () {
-  ./logs-to-c \
+  ./fp-fn-statistics \
     --spam=spam-test.log \
     --ham=ham-test.log \
-    --threshold $1 --count --scoreset=$SCORESET | \
+    --threshold $1 --scoreset=$SCORESET | \
     sed -e 's/^Reading.*//' -e '/^$/d'
 }
 

Modified: spamassassin/trunk/masses/runGA
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/runGA?view=diff&rev=481885&r1=481884&r2=481885
==============================================================================
--- spamassassin/trunk/masses/runGA (original)
+++ spamassassin/trunk/masses/runGA Sun Dec  3 12:49:54 2006
@@ -81,9 +81,9 @@
 
 # This needs to have 50_scores.cf in place first ...
 echo "[gen test results]"
-./logs-to-c --spam=spam-test.log \
+./fp-fn-statistics --spam=spam-test.log \
 	--ham=ham-test.log \
-	--count --cffile=../rules --scoreset=$SCORESET | tee $LOGDIR/test
+	--cffile=../rules --scoreset=$SCORESET | tee $LOGDIR/test
 
 echo "[STATISTICS file generation]"
 ./mk-baseline-results $SCORESET | tee $LOGDIR/statistics