You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2004/02/08 00:28:54 UTC

svn commit: rev 6569 - incubator/spamassassin/trunk/tools

Author: jm
Date: Sat Feb  7 15:28:52 2004
New Revision: 6569

Modified:
   incubator/spamassassin/trunk/tools/check_whitelist
Log:
added POD documentation and --clean switch to tools/check_whitelist, allowing AWL to be cleaned of single-hit old entries.   fixes bug 1883: record number of emails received in AWL; bug 2103: AWL needs a way to be cleaned out

Modified: incubator/spamassassin/trunk/tools/check_whitelist
==============================================================================
--- incubator/spamassassin/trunk/tools/check_whitelist	(original)
+++ incubator/spamassassin/trunk/tools/check_whitelist	Sat Feb  7 15:28:52 2004
@@ -1,14 +1,32 @@
 #!/usr/bin/perl
+#
+# TODO: should this be made a top-level script, called "sa-awl"?
+
+sub usage {
+  die "
+usage: check_whitelist [--clean] [--min n] [dbfile]
+";
+}
 
 use strict;
 use Fcntl;
+use Getopt::Long;
+
+use vars qw(
+		$opt_clean $opt_min $opt_help
+	);
 
-# must match line at top of lib/Mail/SpamAssassin/DBBasedAddrList.pm.
-# now off until 3.0
-# BEGIN { @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File); }
+GetOptions(
+  'clean'		=> \$opt_clean,
+  'min:i'		=> \$opt_min,
+  'help'		=> \$opt_help
+) or usage();
+$opt_help and usage();
 
+$opt_min ||= 2;
+
+BEGIN { @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File); }
 use AnyDBM_File ;
-use vars qw( %h $k $v ) ;
 
 my $db;
 if ($#ARGV == -1) {
@@ -17,17 +35,94 @@
   $db = $ARGV[0];
 }
 
-tie %h, "AnyDBM_File",$db, O_RDONLY,0600
+my %h;
+if ($opt_clean) {
+  tie %h, "AnyDBM_File",$db, O_RDWR,0600
+      or die "Cannot open r/w file $db: $!\n";
+} else {
+  tie %h, "AnyDBM_File",$db, O_RDONLY,0600
       or die "Cannot open file $db: $!\n";
+}
+
 my @k = grep(!/totscore$/,keys(%h));
 for my $key (@k)
 {
-  my $t = $h{"$key|totscore"};
-  my $v = $h{$key};
-  if(defined($t)) {
-    printf "% 8.1f %15s  --  %s\n",
-		  $t/$v, (sprintf "(%.1f/%d)",$t/$v,$v),
+  my $totscore = $h{"$key|totscore"};
+  my $count = $h{$key};
+  next unless defined($totscore);
+
+  if ($opt_clean) {
+    if ($count >= $opt_min) { next; }
+    print "cleaning: ";
+  }
+
+  printf "% 8.1f %15s  --  %s\n",
+		  $totscore/$count, (sprintf "(%.1f/%d)",$totscore,$count),
 		  $key;
+
+  if ($opt_clean) {
+    delete $h{"$key|totscore"};
+    delete $h{$key};
   }
 }
 untie %h;
+
+=head1 NAME
+
+check_whitelist - examine and manipulate SpamAssassin's auto-whitelist db
+
+=head1 SYNOPSIS
+
+B<check_whitelist> [--clean] [--min n] [dbfile]
+
+=head1 DESCRIPTION
+
+Check or clean a SpamAssassin auto-whitelist (AWL) database file.
+
+The name of the file is specified after any options, as C<dbfile>.
+The default is C<$HOME/.spamassassin/auto-whitelist>.
+
+=head1 OPTIONS
+
+=over 4
+
+=item --clean
+
+Clean out infrequently-used AWL entries.  The C<--min> switch can be
+used to select the threshold at which entries are kept or deleted.
+
+=item --min n
+
+Set the threshold used by C<--clean>: entries seen fewer than C<n> times are
+deleted, and all others are kept.  The default is C<2>, so entries that have
+only been seen once are deleted.
+
+=back
+
+=head1 OUTPUT
+
+The output looks like this:
+
+     AVG  (TOTSCORE/COUNT)  --  EMAIL|ip=IPBASE
+
+For example:
+
+     0.0         (0.0/7)  --  dawson@example.com|ip=208.192
+    21.8        (43.7/2)  --  mcdaniel_2s2000@example.com|ip=200.106
+
+C<AVG> is the average score;  C<TOTSCORE> is the total score of all mails seen
+so far;  C<COUNT> is the number of messages seen from that sender;  C<EMAIL> is
+the sender's email address, and C<IPBASE> is the B<AWL base IP address>.
+
+The B<AWL base IP address> is an approximate way to identify the IP address
+that a sender frequently sends from, while remaining hard for spammers to
+spoof.  The algorithm is as follows:
+
+  - take the last Received header that contains a public IP address -- namely
+    one which is not in private, unrouted IP space.
+
+  - chop off the last two octets, assuming that the user may be in an ISP's
+    dynamic address pool.
+
+=cut
+