You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by du...@apache.org on 2006/12/03 21:49:55 UTC
svn commit: r481885 - in /spamassassin/trunk/masses: extract-results
fp-fn-statistics logs-to-c mk-baseline-results runGA
Author: duncf
Date: Sun Dec 3 12:49:54 2006
New Revision: 481885
URL: http://svn.apache.org/viewvc?view=rev&rev=481885
Log:
Split the --count functionality out of logs-to-c and put it in
fp-fn-statistics. Clean both scripts up, make them both run with "use
strict". Correct other scripts which call either of them.
Modified:
spamassassin/trunk/masses/extract-results
spamassassin/trunk/masses/fp-fn-statistics
spamassassin/trunk/masses/logs-to-c
spamassassin/trunk/masses/mk-baseline-results
spamassassin/trunk/masses/runGA
Modified: spamassassin/trunk/masses/extract-results
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/extract-results?view=diff&rev=481885&r1=481884&r2=481885
==============================================================================
--- spamassassin/trunk/masses/extract-results (original)
+++ spamassassin/trunk/masses/extract-results Sun Dec 3 12:49:54 2006
@@ -1,7 +1,7 @@
#!/usr/bin/perl
-# This script extracts the confusion matrix (tp, tn, fp, fn) from the output
-# of the logs-to-c program.
+# This script extracts the confusion matrix (tp, tn, fp, fn) from the
+# output of the fp-fn-statistics program.
#
# This is used by the validate-model script to aggregate the results of a
# cross validation for analysis with the compare-models script.
Modified: spamassassin/trunk/masses/fp-fn-statistics
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/fp-fn-statistics?view=diff&rev=481885&r1=481884&r2=481885
==============================================================================
--- spamassassin/trunk/masses/fp-fn-statistics (original)
+++ spamassassin/trunk/masses/fp-fn-statistics Sun Dec 3 12:49:54 2006
@@ -1,3 +1,181 @@
-#!/bin/sh
+#!/usr/bin/perl -w
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+
+use Getopt::Long;
+use strict;
+
+use vars qw($opt_cffile $opt_lambda $opt_threshold $opt_scoreset
+ $opt_spam $opt_ham $opt_fplog $opt_fnlog);
+
+# Defaults; each may be overridden by the matching command-line option.
+# GetOptions is called without explicit destinations, so it stores every
+# option "foo" in the package variable $opt_foo declared above.
+$opt_cffile = "../rules";
+$opt_threshold = 5;
+$opt_spam = 'spam.log';
+$opt_ham = 'ham.log';
+$opt_scoreset = 0;
+
+GetOptions("cffile=s", "lambda=f", "threshold=f", "spam=s",
+           "ham=s", "scoreset=i", "fplog=s", "fnlog=s");
+
+# If desired, report false positives and false negatives for analysis.
+# Die straight away when a log cannot be created, rather than silently
+# printing to an unopened handle for every misclassified message later on.
+# The bareword handles FNLOG/FPLOG are kept because log_line_count()
+# prints to them by name.
+if (defined $opt_fnlog) {
+  open (FNLOG, '>', $opt_fnlog)
+    or die "Could not open file '$opt_fnlog': $!";
+}
+if (defined $opt_fplog) {
+  open (FPLOG, '>', $opt_fplog)
+    or die "Could not open file '$opt_fplog': $!";
+}
+
+# lambda value for TCR equation, representing the cost of an FP vs. the
+# cost of a FN. Some example values are: 1 = tagged only, 9 = mailed back
+# to sender asking for token, 999 = blocking or deleting a message.
+#
+# We roughly aim for a value representing "moved to infrequently-read folder".
+
+my $lambda = 50;
+# note: a --lambda of 0 is treated as "not given" and keeps the default
+if ($opt_lambda) { $lambda = $opt_lambda; }
+
+use vars qw(%scores %allrules %rules);
+
+readscores();
+
+print "Reading per-message hit stat logs and scores...\n";
+
+# message totals and confusion-matrix counters, filled in by readlogs():
+# the first letter after "ga_" is the true class (y = spam, n = ham), the
+# second is the verdict at $opt_threshold; the *score variables hold the
+# summed scores of the messages in each cell.
+my ($num_spam, $num_ham);
+my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore);
+
+readlogs();
+
+evaluate();
+
+# show memory usage before we exit
+# print "Running \"ps aux\"...\n";
+# open(PS, "ps aux|");
+# while(<PS>) {
+# print if $. == 1 || /\b$$\b/;
+# }
+# close(PS);
+
+exit 0;
+
+# Score one log line and tally it into the confusion-matrix counters.
+#
+# arguments are $isspam, $count, \@tests, $msgline;
+#   $isspam  - true when the line came from the spam log
+#   $count   - running line number (not used here)
+#   \@tests  - names of the rules that hit; each must have a %scores entry
+#   $msgline - the raw log line, echoed to FNLOG/FPLOG on a misclassification
+sub log_line_count {
+  my ($isspam, undef, $tests, $msgline) = @_;
+
+  # message score = sum of the scores of every rule that hit
+  my $score = 0;
+  foreach my $test (@{$tests}) {
+    $score += $scores{$test};
+  }
+
+  my $flagged = ($score >= $opt_threshold);
+
+  if ($isspam && $flagged) {
+    # spam, caught
+    $num_spam++;
+    $ga_yy++;
+    $yyscore += $score;
+  }
+  elsif ($isspam) {
+    # spam, missed: false negative
+    $num_spam++;
+    $ga_yn++;
+    $ynscore += $score;
+    print FNLOG $msgline if defined $opt_fnlog;
+  }
+  elsif ($flagged) {
+    # ham, flagged: false positive
+    $num_ham++;
+    $ga_ny++;
+    $nyscore += $score;
+    print FPLOG $msgline if defined $opt_fplog;
+  }
+  else {
+    # ham, passed
+    $num_ham++;
+    $ga_nn++;
+    $nnscore += $score;
+  }
+}
+
+# Read the spam log ($opt_spam) and then the ham log ($opt_ham), passing
+# every usable line to log_line_count().  Resets the message totals and
+# all confusion-matrix counters first.  Dies if either log cannot be opened.
+sub readlogs {
+  my $count = 0;
+  $num_spam = $num_ham = 0;
+
+  $ga_yy = $ga_ny = $ga_yn = $ga_nn = 0;
+  $yyscore = $ynscore = $nyscore = $nnscore = 0.0;
+
+  # Carry the spam/ham flag alongside the file name instead of deriving it
+  # from a name comparison: if both options point at the same path the
+  # second pass must still be counted as ham.
+  foreach my $source ([$opt_spam, 1], [$opt_ham, 0]) {
+    my ($file, $isspam) = @{$source};
+
+    open (my $in, '<', $file)
+      or die "Could not open file '$file': $!";
+
+    while (defined(my $msgline = <$in>)) {
+      # 1st parameter of the log line is the caught flag, 4th is the
+      # comma-separated list of rules that hit
+      my ($caught, undef, undef, $rules) = split(' ', $msgline);
+
+      # only take lines starting with Y or . (a blank line yields no
+      # fields at all, so check for that before comparing)
+      next unless defined $caught
+        && ($caught eq 'Y' || $caught eq '.') && $rules;
+
+      # get tests, but ignore unknown tests and subrules
+      my @tests = grep { defined $scores{$_} && !$allrules{$_}->{issubrule} }
+                  split(/,/, $rules);
+
+      # run handler
+      log_line_count($isspam, $count, \@tests, $msgline);
+
+      # increment line
+      $count++;
+    }
+    close $in;
+  }
+}
+
+# Run ../build/parse-rules-for-masses on $opt_cffile for score set
+# $opt_scoreset, then load ./tmp/rules.pl.
+# NOTE(review): presumably the helper writes ./tmp/rules.pl and that file
+# populates %rules and %scores -- confirm against parse-rules-for-masses.
+# Dies if the helper cannot be run or exits non-zero.
+sub readscores {
+  print "Reading scores from \"$opt_cffile\"...\n";
+
+  # List-form system() bypasses the shell, so quotes or metacharacters in
+  # the --cffile path cannot break or alter the command.
+  my @cmd = ("../build/parse-rules-for-masses",
+             "-d", $opt_cffile, "-s", $opt_scoreset);
+  system (@cmd) == 0
+    or die "'@cmd' failed (wait status $?)";
+
+  require "./tmp/rules.pl";
+  %allrules = %rules; # ensure it stays global
+}
+
+# Print the summary for the counters gathered by readlogs(): the four
+# confusion-matrix cells with percentages, then the TCR for $lambda, spam
+# recall and spam precision.  Any ratio whose denominator is zero (an empty
+# corpus, or no message scoring above the threshold) is reported as 0
+# instead of aborting with "Illegal division by zero".
+sub evaluate {
+  # $part as a percentage of $whole; 0 when $whole is zero
+  my $pct = sub {
+    my ($part, $whole) = @_;
+    return $whole ? ($part / $whole) * 100.0 : 0;
+  };
+
+  printf ("\n# SUMMARY for threshold %3.1f:\n", $opt_threshold);
+  printf "# Correctly non-spam: %6d %4.2f%%\n",
+    $ga_nn, $pct->($ga_nn, $num_ham);
+  printf "# Correctly spam: %6d %4.2f%%\n",
+    $ga_yy, $pct->($ga_yy, $num_spam);
+  printf "# False positives: %6d %4.2f%%\n",
+    $ga_ny, $pct->($ga_ny, $num_ham);
+  printf "# False negatives: %6d %4.2f%%\n",
+    $ga_yn, $pct->($ga_yn, $num_spam);
+
+  # convert to the TCR metrics used in the published lit
+  # (the unused $nlegitlegit, which was wrongly set from $ga_yn, is gone)
+  my $nspamspam = $ga_yy;    # spam classified as spam
+  my $nspamlegit = $ga_yn;   # spam classified as legit (false negative)
+  my $nlegitspam = $ga_ny;   # legit classified as spam (false positive)
+  my $nlegit = $num_ham;
+  my $nspam = $num_spam;
+
+  # weighted error rate: each false positive costs $lambda false negatives
+  my $weighted_total = $lambda * $nlegit + $nspam;
+  my $werr = $weighted_total
+    ? ($lambda * $nlegitspam + $nspamlegit) / $weighted_total : 0;
+
+  # baseline: error rate of a filter that lets every message through
+  my $werr_base = $weighted_total ? $nspam / $weighted_total : 0;
+
+  $werr ||= 0.000001; # avoid / by 0
+  my $tcr = $werr_base / $werr;
+
+  my $sr = $pct->($nspamspam, $nspam);
+  my $sp = $pct->($nspamspam, $nspamspam + $nlegitspam);
+  printf "# TCR(l=%s): %3.6f SpamRecall: %3.3f%% SpamPrec: %3.3f%%\n",
+    $lambda, $tcr, $sr, $sp;
+}
-exec ./logs-to-c --count $*
Modified: spamassassin/trunk/masses/logs-to-c
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/logs-to-c?view=diff&rev=481885&r1=481884&r2=481885
==============================================================================
--- spamassassin/trunk/masses/logs-to-c (original)
+++ spamassassin/trunk/masses/logs-to-c Sun Dec 3 12:49:54 2006
@@ -18,58 +18,35 @@
# </@LICENSE>
use Getopt::Long;
-use vars qw($opt_cffile $opt_count $opt_lambda $opt_threshold
- $opt_spam $opt_ham $opt_fplog $opt_fnlog);
+use strict;
+use vars qw($opt_cffile $opt_spam $opt_ham $opt_scoreset);
$opt_cffile = "../rules";
-$opt_count = 0;
-$opt_threshold = 5;
$opt_spam = 'spam.log';
$opt_ham = 'ham.log';
$opt_scoreset = 0;
-GetOptions("cffile=s", "count", "lambda=f", "threshold=f", "spam=s",
- "ham=s", "scoreset=i", "fplog=s", "fnlog=s");
+GetOptions("cffile=s", "spam=s", "ham=s", "scoreset=i");
-# If desired, report false positives and false negatives for analysis
-if (defined $opt_fnlog) { open (FNLOG, ">$opt_fnlog"); }
-if (defined $opt_fplog) { open (FPLOG, ">$opt_fplog"); }
-
-my $nybias = 10;
-
-# lambda value for TCR equation, representing the cost of of an FP vs. the
-# cost of a FN. Some example values are: 1 = tagged only, 9 = mailed back
-# to sender asking for token, 999 = blocking or deleting a message.
-#
-# We roughly aim for a value representing "moved to infrequently-read folder".
-
-my $lambda = 50;
-if ($opt_lambda) { $lambda = $opt_lambda; }
-
-my $msgline;
my $is_spam = ''; # vec aligned with @tests_hit
my @tests_hit = ();
my %mutable_tests = ();
-use vars qw(%rules %allrules);
+use vars qw(%rules %allrules %scores);
+
+my (%ignored_rule, %range_lo, %range_hi);
+my %rule_to_index;
readscores();
print "Reading per-message hit stat logs and scores...\n";
my ($num_tests, $num_spam, $num_ham);
-my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore);
read_ranges();
readlogs();
-if ($opt_count) {
- $nybias = $nybias*($num_spam / $num_ham);
- evaluate();
-}
-else {
- print "Writing logs and current scores as C code...\n";
- writescores_c();
-}
+print "Writing logs and current scores as C code...\n";
+writescores_c();
# show memory usage before we exit
# print "Running \"ps aux\"...\n";
@@ -108,42 +85,6 @@
return map { $short_to_long[$_] } unpack("w*", $_[0]);
}
-# arguments are $isspam, $count, \@tests
-sub log_line_count {
- my $score = 0;
- $score += $scores{$_} for @{$_[2]};
-
- if ($_[0]) {
- $num_spam++;
- if ($score >= $opt_threshold) {
- $ga_yy++;
- $yyscore += $score;
- }
- else {
- $ga_yn++;
- $ynscore += $score;
- if (defined $opt_fnlog) {
- print FNLOG $msgline;
- }
- }
- }
- else {
- $num_ham++;
- if ($score >= $opt_threshold) {
- #print STDERR "FP: $id\n";
- $ga_ny++;
- $nyscore += $score;
- if (defined $opt_fplog) {
- print FPLOG $msgline;
- }
- }
- else {
- $ga_nn++;
- $nnscore += $score;
- }
- }
-}
-
# arguments are $isspam, $count, \@tests;
sub log_line_code {
$tests_hit[$_[1]] = freeze_tests($_[2]);
@@ -159,17 +100,11 @@
}
sub readlogs {
+ my $msgline;
+
my $count = 0;
$num_spam = $num_ham = 0;
- if ($opt_count) {
- $ga_yy = $ga_ny = $ga_yn = $ga_nn = 0;
- $yyscore = $ynscore = $nyscore = $nnscore = 0.0;
- }
-
- # set handler for log lines
- my $log_line = $opt_count ? \&log_line_count : \&log_line_code;
-
foreach my $file ($opt_spam, $opt_ham) {
open (IN, "<$file") || die "Could not open file '$file': $!";
@@ -188,7 +123,7 @@
split(/,/, $rules);
# run handler
- $log_line->($isspam, $count, \@tests);
+ log_line_code($isspam, $count, \@tests);
# increment line
$count++;
@@ -214,11 +149,11 @@
# jm: now, score-ranges-from-freqs has tflags to work from, so
# it will always list all mutable tests.
- @index_to_rule = sort {($ignored_rule{$a} <=> $ignored_rule{$b}) ||
+ my @index_to_rule = sort {($ignored_rule{$a} <=> $ignored_rule{$b}) ||
($mutable_tests{$b} <=> $mutable_tests{$a}) ||
($a cmp $b)} (keys %scores);
my $max_hits_per_msg = 0;
- for ($file = 0; $file < $num_tests; $file++) {
+ for (my $file = 0; $file < $num_tests; $file++) {
my(@hits) =
grep {(! $ignored_rule{$_}) && $mutable_tests{$_}} (thaw_tests($tests_hit[$file]));
if ((scalar(@hits)+1) > $max_hits_per_msg) {
@@ -415,8 +350,6 @@
}
$ignored_rule{$t} = 0;
- $index_to_rule[$count] = $t;
- $count++;
if (!$mut) {
$mutable_tests{$t} = 0;
@@ -428,7 +361,7 @@
$mutable_tests{$t} = 1;
}
unless ($mutable_tests{$t} || $scores{$t}) {
- warn "ignoring '$t': immutable and score == 0\n";
+# warn "ignoring '$t': immutable and score == 0\n";
$ignored_rule{$t} = 1;
}
}
@@ -456,8 +389,6 @@
$ignored_rule{$t} = 1;
}
}
- $index_to_rule[$count] = $t;
- $count++;
}
foreach my $t (keys %range_lo) {
next if ($ignored_rule{$t});
@@ -495,39 +426,6 @@
}
}
-sub evaluate {
- printf ("\n# SUMMARY for threshold %3.1f:\n", $opt_threshold);
- printf "# Correctly non-spam: %6d %4.2f%%\n",
- $ga_nn, ($ga_nn / $num_ham) * 100.0;
- printf "# Correctly spam: %6d %4.2f%%\n",
- $ga_yy, ($ga_yy / $num_spam) * 100.0;
- printf "# False positives: %6d %4.2f%%\n",
- $ga_ny, ($ga_ny / $num_ham) * 100.0;
- printf "# False negatives: %6d %4.2f%%\n",
- $ga_yn, ($ga_yn / $num_spam) * 100.0;
-
- # convert to the TCR metrics used in the published lit
- my $nspamspam = $ga_yy;
- my $nspamlegit = $ga_yn;
- my $nlegitspam = $ga_ny;
- my $nlegitlegit = $ga_yn;
- my $nlegit = $num_ham;
- my $nspam = $num_spam;
-
- my $werr = ($lambda * $nlegitspam + $nspamlegit)
- / ($lambda * $nlegit + $nspam);
-
- my $werr_base = $nspam
- / ($lambda * $nlegit + $nspam);
-
- $werr ||= 0.000001; # avoid / by 0
- my $tcr = $werr_base / $werr;
-
- my $sr = ($nspamspam / $nspam) * 100.0;
- my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
- printf "# TCR(l=%s): %3.6f SpamRecall: %3.3f%% SpamPrec: %3.3f%%\n",
- $lambda, $tcr, $sr, $sp;
-}
__DATA__
Modified: spamassassin/trunk/masses/mk-baseline-results
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/mk-baseline-results?view=diff&rev=481885&r1=481884&r2=481885
==============================================================================
--- spamassassin/trunk/masses/mk-baseline-results (original)
+++ spamassassin/trunk/masses/mk-baseline-results Sun Dec 3 12:49:54 2006
@@ -16,10 +16,10 @@
) > /dev/null 2>&1
gen_fp_fn_report () {
- ./logs-to-c \
+ ./fp-fn-statistics \
--spam=spam-test.log \
--ham=ham-test.log \
- --threshold $1 --count --scoreset=$SCORESET | \
+ --threshold $1 --scoreset=$SCORESET | \
sed -e 's/^Reading.*//' -e '/^$/d'
}
Modified: spamassassin/trunk/masses/runGA
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/runGA?view=diff&rev=481885&r1=481884&r2=481885
==============================================================================
--- spamassassin/trunk/masses/runGA (original)
+++ spamassassin/trunk/masses/runGA Sun Dec 3 12:49:54 2006
@@ -81,9 +81,9 @@
# This needs to have 50_scores.cf in place first ...
echo "[gen test results]"
-./logs-to-c --spam=spam-test.log \
+./fp-fn-statistics --spam=spam-test.log \
--ham=ham-test.log \
- --count --cffile=../rules --scoreset=$SCORESET | tee $LOGDIR/test
+ --cffile=../rules --scoreset=$SCORESET | tee $LOGDIR/test
echo "[STATISTICS file generation]"
./mk-baseline-results $SCORESET | tee $LOGDIR/statistics