You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by du...@apache.org on 2006/12/11 06:43:18 UTC

svn commit: r485513 - /spamassassin/trunk/masses/overlap

Author: duncf
Date: Sun Dec 10 21:43:14 2006
New Revision: 485513

URL: http://svn.apache.org/viewvc?view=rev&rev=485513
Log:
Clean up overlap and document

Modified:
    spamassassin/trunk/masses/overlap

Modified: spamassassin/trunk/masses/overlap
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/overlap?view=diff&rev=485513&r1=485512&r2=485513
==============================================================================
--- spamassassin/trunk/masses/overlap (original)
+++ spamassassin/trunk/masses/overlap Sun Dec 10 21:43:14 2006
@@ -1,7 +1,5 @@
 #!/usr/bin/perl -w
 
-# overlap - print overlap between test pairs
-#
 # <@LICENSE>
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
@@ -19,23 +17,41 @@
 # limitations under the License.
 # </@LICENSE>
 
-use vars qw($opt_a $opt_h $opt_t);
-use Getopt::Std;
-getopts("aht");
+use strict;
+use warnings;
 
-my $prog = $0;
-$prog =~ s@.*/@@;
+use vars qw($opt_a $opt_t);
+use Getopt::Long qw(:config auto_help bundling);
+
+GetOptions("a|all" => \$opt_a,
+	   "t|ignore" => \$opt_t);
+
+=head1 NAME
+
+overlap - Tool to help determine which tests overlap significantly
+
+=head1 SYNOPSIS
+
+overlap [options] <log file>
+
+ Options:
+   -a,--all          Show all entries (including reverses of pairs)
+   -t,--ignore       Ignore T_ tests (rules under testing)
 
-sub usage {
-    my $status = shift;
+=head1 DESCRIPTION
 
-    my $out = $status ? STDERR : STDOUT;
-    print $out <<EOF;
-usage: $prog [options] [mass-check results files]
-
- -a    show all entries (normally, reverses of pairs are not shown)
- -h    print this help
- -t    ignore T_ tests
+B<overlap> will read the mass-check results log specified and output
+pairs of tests and how frequently they occur together in absolute
+terms, and relative to their individual hit rates.
+
+The output is of the form:
+
+ COUNT   PAIR/A  PAIR/B  A,B
+
+where C<COUNT> is the number of times the tests hit on the same
+message, C<PAIR/A> is the ratio of times that both tests hit to the
+number of times test A hits, C<PAIR/B> is the ratio of pair hits to B
+hits, and the C<A,B> column shows the names of the two tests.
 
 Do not abuse this tool.  Just because a test highly correlates with
 another test does not mean you can simply remove one or merge them
@@ -44,11 +60,11 @@
 Some overlap is often good, especially if the tests have different
 characteristics.
 
-EOF
-    exit($status);
-}
+=cut
 
-usage(0) if $opt_h;
+
+my $prog = $0;
+$prog =~ s@.*/@@;
 
 if ($#ARGV < 0) {
     push(@ARGV, "-");
@@ -57,13 +73,13 @@
 my %solo;
 my %pair;
 
-foreach $file (@ARGV) {
+foreach my $file (@ARGV) {
     read_file($file);
 }
 
 print "COUNT\tPAIR/A\tPAIR/B\tA,B\n";
 
-foreach $k (sort { $pair{$b} <=> $pair{$a} } keys %pair) {
+foreach my $k (sort { $pair{$b} <=> $pair{$a} } keys %pair) {
     my ($a, $b) = split(/ /, $k);
     my $a_pct = $pair{$k} / $solo{$a};
     my $b_pct = $pair{$k} / $solo{$b};