You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2004/04/29 06:40:47 UTC
svn commit: rev 10405 - incubator/spamassassin/trunk/masses
Author: jm
Date: Wed Apr 28 21:40:47 2004
New Revision: 10405
Modified:
incubator/spamassassin/trunk/masses/hit-frequencies
Log:
Rank replaced by information gain, defined as 'the average reduction in the entropy of C (classification) given the value of X (the rule)'.
Modified: incubator/spamassassin/trunk/masses/hit-frequencies
==============================================================================
--- incubator/spamassassin/trunk/masses/hit-frequencies (original)
+++ incubator/spamassassin/trunk/masses/hit-frequencies Wed Apr 28 21:40:47 2004
@@ -22,12 +22,12 @@
use vars qw {
$opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c
- $opt_a $opt_t $opt_s
+ $opt_a $opt_t $opt_s
};
sub usage {
die "hit-frequencies [-c rules dir] [-f] [-m RE] [-M RE] [-X RE] [-l LC]
- [-s SC] [-a] [-p] [-x] [spam log] [ham log]
+ [-s SC] [-a] [-p] [-x] [-g] [spam log] [ham log]
-c p use p as the rules directory
-f falses. count only false-negative or false-positive matches
@@ -40,6 +40,7 @@
-a display all tests
-p percentages. implies -x
-x extended output, with S/O ratio and scores
+ -g use Information Gain ranking
-s SC which scoreset to use
options -l and -L are mutually exclusive.
@@ -67,6 +68,7 @@
my $num_spam = 0;
my $num_ham = 0;
my %ranking = ();
+my %infogain = ();
my $ok_lang = '';
readscores($cffile);
@@ -96,10 +98,10 @@
if ($opt_p) {
if ($opt_f) {
printf "%7s %7s %7s %6s %6s %6s %s\n",
- "OVERALL%", "FNEG%", "FPOS%", "S/O", "RANK", "SCORE", "NAME";
+ "OVERALL%", "FNEG%", "FPOS%", "S/O", "IG", "SCORE", "NAME";
} else {
printf "%7s %7s %7s %6s %6s %6s %s\n",
- "OVERALL%", "SPAM%", "HAM%", "S/O", "RANK", "SCORE", "NAME";
+ "OVERALL%", "SPAM%", "HAM%", "S/O", "IG", "SCORE", "NAME";
}
printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n",
$hdr_all, $hdr_spam, $hdr_ham,
@@ -114,7 +116,7 @@
} elsif ($opt_x) {
printf "%7s %7s %7s %6s %6s %6s %s\n",
- "OVERALL%", "SPAM%", "HAM%", "S/O", "RANK", "SCORE", "NAME";
+ "OVERALL%", "SPAM%", "HAM%", "S/O", "IG", "SCORE", "NAME";
printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n",
$hdr_all, $hdr_spam, $hdr_ham,
soratio ($num_spam,$num_ham), 0, 0;
@@ -154,24 +156,74 @@
# now, given the S/O ratio (0.0 to 1.0) and match%s (0.0 to 100.0),
# come up with a ranking.
+ my $rank;
# old system
- #my $rank = $soratio * ($fsadj / (($fnadj || 0.0008) * 10));
+ #$rank = $soratio * ($fsadj / (($fnadj || 0.0008) * 10));
#$rank = log($rank+0.001);
# new system: allows a few more 99% hitters into the first page
- my $rank = (($soratio**3) * 2000) + ($fsadj*3);
+ # $rank = (($soratio**3) * 2000) + ($fsadj*3);
+ #
+ # $ranking{$test} = $rank;
+ # $rank_hi = $rank if ($rank > $rank_hi);
+ # $rank_lo = $rank if ($rank < $rank_lo);
+
+ {
+ # New new system: from "Learning to Filter Unsolicited Commercial E-Mail",
+ # Ion Androutsopoulos et al: determine the information gain IG(X, C) of the
+ # Boolean attributes (ie. the rules). Measures "the average reduction in
+ # the entropy of C (classification) given the value of X (the rule)". Makes
+ # a good ranking measure with a proper statistical basis. ;)
+ #
+ # Still would like to get an entropy measure in, too.
+ #
+ # sum P(X = x ^ C = c)
+ # IG(X,C) = x in [0, 1] P(X = x ^ C = c) . log2( ------------------- )
+ # c in [Ch, Cs] P(X = x) . P(C = c)
+ #
+ my $safe_nspam = $num_spam || 0.0000001;
+ my $safe_nham = $num_ham || 0.0000001;
+
+ my $num_all = ($num_spam+$num_ham);
+ my $safe_all = $num_all || 0.0000001;
+ my $f_all = $fs+$fn;
+
+ my $px0 = (($num_all - $f_all) / $safe_all); # P(X = 0)
+ my $px1 = ($f_all / $safe_all); # P(X = 1)
+ my $pccs = ($num_spam / $safe_all); # P(C = Cs)
+ my $pcch = ($num_ham / $safe_all); # P(C = Ch)
+ my $px1ccs = ($fs / $safe_nspam); # P(X = 1 ^ C = Cs)
+ my $px1cch = ($fn / $safe_nham); # P(X = 1 ^ C = Ch)
+ my $px0ccs = (($num_spam - $fs) / $safe_nspam); # P(X = 0 ^ C = Cs)
+ my $px0cch = (($num_ham - $fn) / $safe_nham); # P(X = 0 ^ C = Ch)
+ my $safe_px0_dot_pccs = ($px0 * $pccs) || 0.00000001;
+ my $safe_px0_dot_pcch = ($px0 * $pcch) || 0.00000001;
+ my $safe_px1_dot_pccs = ($px1 * $pccs) || 0.00000001;
+ my $safe_px1_dot_pcch = ($px1 * $pcch) || 0.00000001;
+
+ sub log2 { return log($_[0]) / 0.693147180559945; } # log(2) = 0.6931...
+
+ my $safe_px0ccs = ($px0ccs || 0.0000001);
+ my $safe_px0cch = ($px0cch || 0.0000001);
+ my $safe_px1ccs = ($px1ccs || 0.0000001);
+ my $safe_px1cch = ($px1cch || 0.0000001);
+ my $infogain = ( $px0ccs * log2($safe_px0ccs / $safe_px0_dot_pccs) ) +
+ ( $px0cch * log2($safe_px0cch / $safe_px0_dot_pcch) ) +
+ ( $px1ccs * log2($safe_px1ccs / $safe_px1_dot_pccs) ) +
+ ( $px1cch * log2($safe_px1cch / $safe_px1_dot_pcch) );
- $ranking{$test} = $rank;
- $rank_hi = $rank if ($rank > $rank_hi);
- $rank_lo = $rank if ($rank < $rank_lo);
+ $ranking{$test} = $infogain;
+ }
}
-# now normalise the rankings to [0, 1]
-$rank_hi -= $rank_lo;
-foreach $test (@tests) {
- $ranking{$test} = $rank_hi == 0 ? 0.001 : ($ranking{$test} - $rank_lo) / $rank_hi;
-}
+# {
+# # now normalise the rankings to [0, 1]
+# $rank_hi -= $rank_lo;
+# foreach $test (@tests) {
+# $ranking{$test} = $rank_hi == 0 ? 0.001 : ($ranking{$test} - $rank_lo) / $rank_hi;
+# }
+# }
foreach $test (sort { $ranking{$b} <=> $ranking{$a} } @tests) {
next unless (exists $rules{$test}); # only valid tests