You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by qu...@apache.org on 2004/09/27 05:17:33 UTC
svn commit: rev 47270 - spamassassin/trunk/masses
Author: quinlan
Date: Sun Sep 26 20:17:32 2004
New Revision: 47270
Modified:
spamassassin/trunk/masses/hit-frequencies
Log:
document RANK
Modified: spamassassin/trunk/masses/hit-frequencies
==============================================================================
--- spamassassin/trunk/masses/hit-frequencies (original)
+++ spamassassin/trunk/masses/hit-frequencies Sun Sep 26 20:17:32 2004
@@ -173,7 +173,7 @@
# come up with a ranking
my $rank;
- # New new system: from "Learning to Filter Unsolicited Commercial E-Mail",
+ # IG system: from "Learning to Filter Unsolicited Commercial E-Mail",
# Ion Androutsopoulos et al: determine the information gain IG(X, C) of the
# Boolean attributes (ie. the rules). Measures "the average reduction in
# the entropy of C (classification) given the value of X (the rule)". Makes
@@ -221,7 +221,17 @@
$rank_lo = $rank if ($rank < $rank_lo);
}
else {
- # basic wanted/unwanted ranking
+ # RANK: basic wanted/unwanted ranking
+ #
+ # The rank of each test is based on two ranks: (1) the number of wanted
+ # hits and (2) the number of unwanted hits. Each test is ranked
+ # positionally for both its wanted and unwanted hits (ties are
+ # allowed) and the two ranks are normalized to have the same range.
+ # Those two ranks are added together, producing a single RANK number
+ # that is then normalized to [0, 1]. The result is equivalent to:
+ #
+ # RANK(rule) = (percentile(wanted) + percentile(unwanted))/2
+ #
$wanted{$test} = $isnice ? $fn : $fs;
$unwanted{$test} = $isnice ? $fs : $fn;
# count number of ranks of each type