You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by qu...@apache.org on 2004/05/05 23:34:19 UTC
svn commit: rev 10533 - incubator/spamassassin/trunk/masses

Author: quinlan
Date: Wed May  5 14:34:19 2004
New Revision: 10533

Modified:
   incubator/spamassassin/trunk/masses/hit-frequencies
Log:
new wanted/unwanted ranking system
add -i option for IG ranking system


Modified: incubator/spamassassin/trunk/masses/hit-frequencies
==============================================================================
--- incubator/spamassassin/trunk/masses/hit-frequencies	(original)
+++ incubator/spamassassin/trunk/masses/hit-frequencies	Wed May  5 14:34:19 2004
@@ -18,16 +18,16 @@
 
 use FindBin;
 use Getopt::Std;
-getopts("fm:M:X:l:L:pxhc:at:s:");
+getopts("fm:M:X:l:L:pxhc:at:s:i");
 
 use vars qw {
   $opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c
-  $opt_a $opt_t $opt_s 
+  $opt_a $opt_t $opt_s $opt_i $sorting
 };
 
 sub usage {
   die "hit-frequencies [-c rules dir] [-f] [-m RE] [-M RE] [-X RE] [-l LC]
-                [-s SC] [-a] [-p] [-x] [spam log] [ham log]
+                [-s SC] [-a] [-p] [-x] [-i] [spam log] [ham log]
 
     -c p   use p as the rules directory
     -f     falses. count only false-negative or false-positive matches
@@ -41,6 +41,7 @@
     -p     percentages. implies -x
     -x     extended output, with S/O ratio and scores
     -s SC  which scoreset to use
+    -i     use IG (information gain) for ranking
 
     options -l and -L are mutually exclusive.
 
@@ -94,12 +95,13 @@
 my $hdr_ham = $num_ham;
 
 if ($opt_p) {
+  my $sorting = $opt_i ? "IG" : "RANK";
   if ($opt_f) {
     printf "%7s %7s %7s  %6s  %6s  %6s  %s\n",
-  	"OVERALL%", "FNEG%", "FPOS%", "S/O", "IG", "SCORE", "NAME";
+  	"OVERALL%", "FNEG%", "FPOS%", "S/O", $sorting, "SCORE", "NAME";
   } else {
     printf "%7s %7s  %7s  %6s  %6s  %6s  %s\n",
-  	"OVERALL%", "SPAM%", "HAM%", "S/O", "IG", "SCORE", "NAME";
+  	"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
   }
   printf "%7d  %7d  %7d  %7.3f %6.2f  %6.2f  (all messages)\n",
   	$hdr_all, $hdr_spam, $hdr_ham,
@@ -114,7 +116,7 @@
 
 } elsif ($opt_x) {
   printf "%7s %7s  %7s  %6s  %6s %6s  %s\n",
-  	"OVERALL%", "SPAM%", "HAM%", "S/O", "IG", "SCORE", "NAME";
+  	"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
   printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  (all messages)\n",
   	$hdr_all, $hdr_spam, $hdr_ham,
         soratio ($num_spam,$num_ham), 0, 0;
@@ -130,6 +132,11 @@
 my @tests = ();
 my $rank_hi = 0;
 my $rank_lo = 9999999;
+
+# variables for wanted/unwanted RANK
+my %wanted;
+my %unwanted;
+
 foreach my $test (keys %freq_spam, keys %freq_ham) {
   next unless (exists $rules{$test});           # only valid tests
   next if (!$opt_a && $rules{$test}->{issubrule});
@@ -152,22 +159,10 @@
     my $tmp = $fsadj; $fsadj = $fnadj; $fnadj = $tmp;
   }
 
-  # now, given the S/O ratio (0.0 to 1.0) and match%s (0.0 to 100.0),
-  # come up with a ranking.
-  my $rank;
-
-  # old system
-  #$rank = $soratio * ($fsadj / (($fnadj || 0.0008) * 10));
-  #$rank = log($rank+0.001);
-
-  # new system: allows a few more 99% hitters into the first page
-  # $rank = (($soratio**3) * 2000) + ($fsadj*3);
-  # 
-  # $ranking{$test} = $rank;
-  # $rank_hi = $rank if ($rank > $rank_hi);
-  # $rank_lo = $rank if ($rank < $rank_lo);
+  if ($opt_i) {
+    # come up with a ranking
+    my $rank;
 
-  {
     # New new system: from "Learning to Filter Unsolicited Commercial E-Mail",
     # Ion Androutsopoulos et al: determine the information gain IG(X, C) of the
     # Boolean attributes (ie. the rules). Measures "the average reduction in
@@ -183,7 +178,7 @@
     my $safe_nspam = $num_spam || 0.0000001;
     my $safe_nham = $num_ham || 0.0000001;
 
-    my $num_all = ($num_spam+$num_ham);
+    my $num_all = ($num_spam + $num_ham);
     my $safe_all = $num_all || 0.0000001;
     my $f_all = $fs+$fn;
 
@@ -215,13 +210,45 @@
     $rank_hi = $rank if ($rank > $rank_hi);
     $rank_lo = $rank if ($rank < $rank_lo);
   }
+  else {
+    # basic wanted/unwanted ranking
+    $wanted{$test} = $isnice ? $fn : $fs;
+    $unwanted{$test} = $isnice ? $fs : $fn;
+  }
+}
+
+# finish basic wanted/unwanted ranking
+if (! $opt_i) {
+  my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted;
+  my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %wanted;
+
+  my $position;
+  my $last;
+
+  $position = 0;
+  $last = undef;
+  for my $test (@wanted) {
+    $position++ if defined $last && $last != $wanted{$test};
+    $ranking{$test} += $position;
+    $last = $wanted{$test}
+  }
+
+  $position = 0;
+  $last = undef;
+  for my $test (@unwanted) {
+    $position++ if defined $last && $last != $unwanted{$test};
+    $ranking{$test} += $position;
+    $last = $unwanted{$test};
+    $rank_hi = $ranking{$test} if ($ranking{$test} > $rank_hi);
+    $rank_lo = $ranking{$test} if ($ranking{$test} < $rank_lo);
+  }
 }
 
 {
   # now normalise the rankings to [0, 1]
   $rank_hi -= $rank_lo;
   foreach $test (@tests) {
-  $ranking{$test} = $rank_hi == 0 ? 0.001 : ($ranking{$test} - $rank_lo) / ($rank_hi);
+    $ranking{$test} = $rank_hi == 0 ? 0.001 : ($ranking{$test} - $rank_lo) / ($rank_hi);
   }
 }