You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by du...@apache.org on 2006/12/09 02:45:15 UTC

svn commit: r484892 - /spamassassin/trunk/masses/hit-frequencies

Author: duncf
Date: Fri Dec  8 17:45:15 2006
New Revision: 484892

URL: http://svn.apache.org/viewvc?view=rev&rev=484892
Log:
Document hit-frequencies and minor cleanup.

Modified:
    spamassassin/trunk/masses/hit-frequencies

Modified: spamassassin/trunk/masses/hit-frequencies
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/hit-frequencies?view=diff&rev=484892&r1=484891&r2=484892
==============================================================================
--- spamassassin/trunk/masses/hit-frequencies (original)
+++ spamassassin/trunk/masses/hit-frequencies Fri Dec  8 17:45:15 2006
@@ -18,56 +18,145 @@
 # </...@LICENSE>
 
 use strict;
+use warnings;
+
 use FindBin;
-use Getopt::Std;
-getopts("fm:M:X:l:L:pxhc:at:s:ioTSdP");
+use Getopt::Long qw(:config bundling auto_help);
+use Pod::Usage;
 
 use vars qw {
-  $opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c
-  $opt_a $opt_t $opt_s $opt_i $sorting $opt_o $opt_T $opt_S $opt_X
-  $opt_d $opt_P
+  $opt_c $opt_s $opt_f $opt_a $opt_p $opt_x $opt_m $opt_t $opt_M
+  $opt_X $opt_L $opt_l $opt_i $opt_T $opt_o $opt_S $opt_P $opt_d
+  $sorting
 };
 
-# as per http://wiki.apache.org/spamassassin/RulesProjPromotion, for -P
-my $promote_so_min      = 0.95;
-my $promote_hitrate_min = 0.02;
-my $promote_fprate_max  = 1.00;
+GetOptions("c|cffile=s@" => \$opt_c,
+	   "s|scoreset=i" => \$opt_s, # ,, pacify stupid emacs cperl mode
+	   "f|falses" => \$opt_f,
+	   "a|all" => \$opt_a,
+	   "p|percentages" => \$opt_p,
+	   "x|extended" => \$opt_x,
+	   "m|matchrules=s" => \$opt_m,
+	   "t|tflags=s" => \$opt_t,
+	   "M|matchlogs=s" => \$opt_M,
+	   "X|excludelogs=s" => \$opt_X,
+	   "L|onlylanguage=s" => \$opt_L,
+	   "l|alsolanguage=s" => \$opt_l,
+	   "i|ig" => \$opt_i,
+	   "T|times" => \$opt_T,
+	   "o|overlaps" => \$opt_o,
+	   "S|scoremap" => \$opt_S,
+	   "P|promotion" => \$opt_P,
+	   "d|xml" => \$opt_d
+	  );
+
+=head1 NAME
+
+hit-frequencies - Display statistics about tests hit by a mass-check run
+
+=head1 SYNOPSIS
+
+hit-frequencies [options] [spam-log] [ham-log]
+
+ Options:
+    -c,--cffile=path	  Use path as the rules directory
+    -s,--scoreset=n	  Use scoreset n
+    -f,--falses  	  Count only false-positives/false-negatives
+    -a,--all		  Report all tests (including subrules)
+    -p,--percentages	  Report percentages instead of raw hits (implies -x)
+    -x,--extended	  "Extended" output, include RANK, S/O and SCORE
+    -m,--matchrules=re    Print rules matching the regular expression
+    -t,--tflags=re	  Print only rules with tflags matching the regular expression
+    -M,--matchlogs=re     Consider only logs matching the regular expression
+    -X,--excludelogs=re	  Exclude logs matching this regular expression
+    -L,--onlylanguage=lc  Only print language specific tests for specified lang code (try 'all')
+    -l,--alsolanguage=lc  Also print language specific tests for specified lang code (try 'all')
+    -i,--ig               Use IG (information gain) for ranking
+    -T,--times            Display rule times (implies -x, -p)
+    -o,--overlaps         Display hit overlaps against all other rules
+    -S,--scoremap         Display score-map of hits
+    -P,--promotion        Flag rules that meet the promotion criteria
+    -d,--xml              XML output (conflicts with -x, -p)
+
+=head1 DESCRIPTION
+
+B<hit-frequencies> will read the mass-check logs F<spam.log> and
+F<ham.log> or the logs given on the command line. The output will
+contain a summary of the number of ham and spam messages and detailed
+statistics for each rule. The output will include the following
+columns:
+
+=over 4
+
+=item OVERALL
+
+Number of times (or percentage with B<-p>) the rule hit on
+all messages (spam or ham).
+
+=item SPAM
+
+Number of times (or percentage with B<-p>) the rule hit on
+spam messages.
+
+=item HAM
+
+Number of times (or percentage with B<-p>) the rule hit on
+ham messages.
+
+=item FPOS
+
+=item FNEG
+
+Shown only with B<-f>, these refer to the number of times (or
+percentage) the rule hit on messages that were found to be false
+positives or false negatives.
+
+=item S/O
 
-sub usage {
-  die "hit-frequencies [-c rules dir] [-f] [-m RE] [-M RE] [-X RE] [-l LC]
-          [-s SC] [-a] [-p] [-x] [-i] [-T] [-S] [-o] [-d] [spam log] [ham log]
-
-    -c p   use p as the rules directory
-    -f     falses. count only false-negative or false-positive matches
-    -m RE  print rules matching regular expression
-    -t RE  print rules with tflags matching regular expression
-    -M RE  only consider log entries matching regular expression
-    -X RE  don't consider log entries matching regular expression
-    -l LC  also print language specific rules for lang code LC (or 'all')
-    -L LC  only print language specific rules for lang code LC (or 'all')
-    -a     display all tests
-    -p     percentages. implies -x
-    -x     extended output, with S/O ratio and scores
-    -s SC  which scoreset to use
-    -i     use IG (information gain) for ranking
-    -T     display rule times. implies -x, -p
-    -o     display hit overlaps against all other rules
-    -S     display score-map of hits
-    -P     flag which rules pass the promotion criteria
-    -d     XML output.  conflicts with -x, -p
-
-    options -l and -L are mutually exclusive.
-
-    options -M and -X are *not* mutually exclusive.
+Shown only with B<-x> or B<-p>, this is the number of spam hits
+divided by total number of hits (C<S/O> refers to spam divided by
+overall).
 
-    if either the spam or and ham logs are unspecified, the defaults
-    are \"spam.log\" and \"ham.log\" in the cwd.
+=item RANK
 
-";
+Shown only with B<-x> or B<-p>, and when B<-i> is not used, this is a
+measure that attempts to indicate how I<good> or I<useful> a test
+is. The higher it is, the better the test.
+
+=item IG
+
+Shown only with B<-i>, this is another measure that attempts to
+indicate how I<useful> a test is.
+
+=item SCORE
+
+Shown only with B<-x> or B<-p>, this is the current score assigned to
+the rule.
+
+=item NAME
+
+This is the rule's name.
+
+=back
+
+=head1 BUGS
+
+Please report bugs to http://bugzilla.spamassassin.org/
+
+=head1 SEE ALSO
+
+L<mass-check(1)>, L<perceptron(1)>
+
+=cut
+if ($opt_l && $opt_L) {
+  pod2usage("-L/--onlylanguage and -l/--alsolanguage are mutually exclusive");
+}
+
+if ($opt_d && ($opt_x || $opt_p)) {
+  pod2usage("-d/--xml conflicts with -x/--extended and -p/--percentages");
 }
 
-usage() if($opt_h || ($opt_l && $opt_L));
-usage() if($opt_d && ($opt_x || $opt_p));
+$opt_s = 0 if ( !defined $opt_s );
 
 if ($opt_p) {
   $opt_x = 1;
@@ -77,7 +166,12 @@
   $opt_x = $opt_p = 1;
 }
 
-$opt_s = 0 if ( !defined $opt_s );
+
+# as per http://wiki.apache.org/spamassassin/RulesProjPromotion, for -P
+my $promote_so_min      = 0.95;
+my $promote_hitrate_min = 0.02;
+my $promote_fprate_max  = 1.00;
+
 
 my $cffile = $opt_c || "$FindBin::Bin/../rules";
 
@@ -201,7 +295,8 @@
 
 } else {
   printf "%10s  %10s  %10s  %s\n",
-  	"OVERALL", $opt_f?"FNEG":"SPAM", $opt_f?"FPOS":"HAM", "NAME";
+  	"OVERALL", $opt_f?"FNEG":"SPAM", $opt_f?"FPOS":"HAM",
+	  "NAME";
   printf "%10d  %10d  %10d  (all messages)\n",
   	$hdr_all, $hdr_spam, $hdr_ham;
 }