You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by mm...@apache.org on 2009/11/24 17:32:51 UTC

svn commit: r883770 - in /spamassassin/trunk: lib/Mail/SpamAssassin/AutoWhitelist.pm lib/Mail/SpamAssassin/Plugin/AWL.pm lib/Mail/SpamAssassin/SQLBasedAddrList.pm sql/README.awl sql/awl_mysql.sql sql/awl_pg.sql t/data/spam/004

Author: mmartinec
Date: Tue Nov 24 16:32:51 2009
New Revision: 883770

URL: http://svn.apache.org/viewvc?rev=883770&view=rev
Log:
Bug 6203: make AWL CIDR mask configurable: auto_whitelist_ipv4_mask_len
and auto_whitelist_ipv6_mask_len;  update README.awl and sql/awl_*.sql
accordingly (increasing awl.ip field width);  'fix' the t/data/spam/004
sample mail to avoid a test failing with a /24 net mask;
avoid race condition in SQLBasedAddrList.pm when multiple processes
try to insert-or-update an awl SQL record: try INSERT first, and if
that fails go for UPDATE. 

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/AutoWhitelist.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/AWL.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/SQLBasedAddrList.pm
    spamassassin/trunk/sql/README.awl
    spamassassin/trunk/sql/awl_mysql.sql
    spamassassin/trunk/sql/awl_pg.sql
    spamassassin/trunk/t/data/spam/004

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/AutoWhitelist.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/AutoWhitelist.pm?rev=883770&r1=883769&r2=883770&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/AutoWhitelist.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/AutoWhitelist.pm Tue Nov 24 16:32:51 2009
@@ -45,7 +45,7 @@
 use warnings;
 use bytes;
 use re 'taint';
-use NetAddr::IP;
+use NetAddr::IP qw(:upper);  # ensure IPv6 string case even if default changes
 
 use Mail::SpamAssassin;
 use Mail::SpamAssassin::Logger;
@@ -64,18 +64,20 @@
   $class = ref($class) || $class;
   my ($main, $msg) = @_;
 
+  my $conf = $main->{conf};
   my $self = {
-    'main'		=> $main,
+    main          => $main,
+    factor        => $conf->{auto_whitelist_factor},
+    ipv4_mask_len => $conf->{auto_whitelist_ipv4_mask_len},
+    ipv6_mask_len => $conf->{auto_whitelist_ipv6_mask_len},
   };
 
-  $self->{factor} = $main->{conf}->{auto_whitelist_factor};
-
   my $factory;
   if ($main->{pers_addr_list_factory}) {
     $factory = $main->{pers_addr_list_factory};
   }
   else {
-    my $type = $main->{conf}->{auto_whitelist_factory};
+    my $type = $conf->{auto_whitelist_factory};
     if ($type =~ /^([_A-Za-z0-9:]+)$/) {
       $type = untaint_var($type);
       eval '
@@ -278,39 +280,68 @@
 
 ###########################################################################
 
-sub pack_addr {
-  my ($self, $addr, $origip) = @_;
-
-  $addr = lc $addr;
-  $addr =~ s/[\000\;\'\"\!\|]/_/gs;	# paranoia
+sub ip_to_awl_key {
+  my ($self, $origip) = @_;
 
+  my $result;
   local $1;
   if (!defined $origip) {
-    # could not find an IP address to use, could be localhost mail or from
-    # the user running "add-addr-to-*".
-    $origip = 'none';
+    # could not find an IP address to use
   } elsif ($origip =~ /^ (\d{1,3} \. \d{1,3}) \. \d{1,3} \. \d{1,3} $/xs) {
-    $origip = $1;
-  } elsif ($origip =~ /:/  &&
+    my $mask_len = $self->{ipv4_mask_len};
+    $mask_len = 16  if !defined $mask_len;
+    # handle the default and easy cases manually
+    if ($mask_len == 32) {
+      $result = $origip;
+    } elsif ($mask_len == 16) {
+      $result = $1;
+    } else {
+      my $origip_obj = NetAddr::IP->new($origip . '/' . $mask_len);
+      if (!defined $origip_obj) {  # invalid IPv4 address
+        dbg("auto-whitelist: bad IPv4 address $origip");
+      } else {
+        $result = $origip_obj->network->addr;
+        $result =~s/(\.0){1,3}\z//;  # truncate zero tail
+      }
+    }
+  } elsif ($origip =~ /:/ &&  # triage
            $origip =~
            /^ [0-9a-f]{0,4} (?: : [0-9a-f]{0,4} | \. [0-9]{1,3} ){2,9} $/xsi) {
     # looks like an IPv6 address
-    my $origip_obj = NetAddr::IP->new6($origip);
+    my $mask_len = $self->{ipv6_mask_len};
+    $mask_len = 48  if !defined $mask_len;
+    my $origip_obj = NetAddr::IP->new6($origip . '/' . $mask_len);
     if (!defined $origip_obj) {  # invalid IPv6 address
       dbg("auto-whitelist: bad IPv6 address $origip");
-      $origip = 'junk-' . $origip;
     } else {
-      $origip = $origip_obj->full6;  # string in a canonical form
-      $origip =~ s/(:[0-9a-f]{4}){5}\z//si;  # keep only the /48 network addr
-      $origip .= '::';  # although it wastes 2 chars, it's nice to make it
-                        # look like a syntactically correct IPv6 address
+      $result = $origip_obj->network->full6;  # string in a canonical form
+      $result =~ s/(:0000){1,7}\z/::/;        # compress zero tail
     }
   } else {
     dbg("auto-whitelist: bad IP address $origip");
-    $origip =~ s/[^0-9A-Fa-f:.]/_/gs;	# paranoia
-    $origip = 'junk-' . $origip;
   }
-  $origip = substr($origip,0,16)  if length($origip) > 16;  # awl.ip field
+  if (defined $result && length($result) > 39) {  # just in case, keep under
+    $result = substr($result,0,39);               # the awl.ip field size
+  }
+  dbg("auto-whitelist: IP masking %s -> %s", $origip,$result);
+  return $result;
+}
+
+###########################################################################
+
+sub pack_addr {
+  my ($self, $addr, $origip) = @_;
+
+  $addr = lc $addr;
+  $addr =~ s/[\000\;\'\"\!\|]/_/gs;	# paranoia
+
+  if (!defined $origip) {
+    # could not find an IP address to use, could be localhost mail
+    # or from the user running "add-addr-to-*".
+    $origip = 'none';
+  } else {
+    $origip = $self->ip_to_awl_key($origip);
+  }
   return $addr . "|ip=" . $origip;
 }
 

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/AWL.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/AWL.pm?rev=883770&r1=883769&r2=883770&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/AWL.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/AWL.pm Tue Nov 24 16:32:51 2009
@@ -148,6 +148,67 @@
 		type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
 	       });
 
+=item auto_whitelist_ipv4_mask_len n	(default: 16, range [0..32])
+
+The AWL database keeps only the specified number of most-significant bits
+of an IPv4 address in its fields, so that different individual IP addresses
+within a subnet belonging to the same owner are managed under a single
+database record. As we have no information available on the allocated
+address ranges of senders, this CIDR mask length is only an approximation.
+The default is 16 bits, corresponding to a former class B. Increase the
+number if a finer granularity is desired, e.g. to 24 (class C) or 32.
+A value 0 is allowed but is not particularly useful, as it would treat the
+whole internet as a single organization. The number need not be a multiple
+of 8, any split is allowed.
+
+=cut
+
+  push (@cmds, {
+		setting => 'auto_whitelist_ipv4_mask_len',
+		default => 16,
+		type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
+		code => sub {
+		  my ($self, $key, $value, $line) = @_;
+		  if (!defined $value || $value eq '') {
+		    return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
+		  } elsif ($value !~ /^\d+$/ || $value < 0 || $value > 32) {
+		    return $Mail::SpamAssassin::Conf::INVALID_VALUE;
+		  }
+		  $self->{auto_whitelist_ipv4_mask_len} = $value;
+		}
+	       });
+
+=item auto_whitelist_ipv6_mask_len n	(default: 48, range [0..128])
+
+The AWL database keeps only the specified number of most-significant bits
+of an IPv6 address in its fields, so that different individual IP addresses
+within a subnet belonging to the same owner are managed under a single
+database record. As we have no information available on the allocated address
+ranges of senders, this CIDR mask length is only an approximation. The default
+is 48 bits, corresponding to an address range commonly allocated to individual
+(smaller) organizations. Increase the number for a finer granularity, e.g.
+to 64 or 96 or 128, or decrease for wider ranges, e.g. 32.  A value 0 is
+allowed but is not particularly useful, as it would treat the whole internet
+as a single organization. The number need not be a multiple of 4, any split
+is allowed.
+
+=cut
+
+  push (@cmds, {
+		setting => 'auto_whitelist_ipv6_mask_len',
+		default => 48,
+		type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
+		code => sub {
+		  my ($self, $key, $value, $line) = @_;
+		  if (!defined $value || $value eq '') {
+		    return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
+		  } elsif ($value !~ /^\d+$/ || $value < 0 || $value > 128) {
+		    return $Mail::SpamAssassin::Conf::INVALID_VALUE;
+		  }
+		  $self->{auto_whitelist_ipv6_mask_len} = $value;
+		}
+	       });
+
 =item user_awl_sql_override_username
 
 Used by the SQLBasedAddrList storage implementation.

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/SQLBasedAddrList.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/SQLBasedAddrList.pm?rev=883770&r1=883769&r2=883770&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/SQLBasedAddrList.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/SQLBasedAddrList.pm Tue Nov 24 16:32:51 2009
@@ -136,7 +136,8 @@
   my $dbh = DBI->connect($dsn, $dbuser, $dbpass, {'PrintError' => 0});
 
   if(!$dbh) {
-    dbg("auto-whitelist: sql-based unable to connect to database ($dsn) : " . DBI::errstr);
+    info("auto-whitelist: sql-based unable to connect to database (%s) : %s",
+         $dsn, DBI::errstr);
     return undef;
   }
 
@@ -203,26 +204,28 @@
 
   my $sql = "SELECT count, totscore FROM $self->{tablename} " .
             "WHERE username = ? AND email = ?";
-  my @args = ($email);
-  if ($self->{_with_awl_signer}) {
-    my @signedby = !defined $signedby ? '' : split(' ', lc $signedby);
-    if (@signedby == 1) {
+  my @args = ( $email );
+  if (!$self->{_with_awl_signer}) {
+    $sql .= " AND ip = ?";
+    push(@args, $ip);
+  } else {
+    my @signedby = !defined $signedby ? () : split(' ', lc $signedby);
+    if (!@signedby) {
+      $sql .= " AND signedby = '' AND ip = ?";
+      push(@args, $ip);
+    } elsif (@signedby == 1) {
       $sql .= " AND signedby = ?";
     } elsif (@signedby > 1) {
       $sql .= " AND signedby IN (" . join(',', ('?') x @signedby) . ")";
     }
     push(@args, @signedby);
   }
-  if (!$self->{_with_awl_signer} || !defined $signedby || $signedby eq '') {
-    $sql .= " AND ip = ?";
-    push(@args, $ip);
-  }
   my $sth = $self->{dbh}->prepare($sql);
   my $rc = $sth->execute($self->{_username}, @args);
 
   if (!$rc) { # there was an error, but try to go on
-    my $err = $self->{dbh}->errstr;
-    dbg("auto-whitelist: sql-based get_addr_entry: SQL error: $err");
+    info("auto-whitelist: sql-based get_addr_entry %s: SQL error: %s",
+         join('|',@args), $sth->errstr);
     $entry->{count} = 0;
     $entry->{totscore} = 0;
   }
@@ -240,7 +243,7 @@
       $cnt++;
     }
     dbg("auto-whitelist: sql-based get_addr_entry: %s for %s",
-        $cnt ? "found $cnt entry" : 'no entries found',
+        $cnt ? "found $cnt entries" : 'no entries found',
         join('|',@args) );
   }
   $sth->finish();
@@ -279,62 +282,81 @@
   
   return $entry  unless $email ne '' && (defined $ip || defined $signedby);
 
-  if ($entry->{exists_p}) { # entry already exists, so just update
-    my(@args) = ($score, $self->{_username}, $email);
+  # try inserting first, and if that fails we'll do the update; this way
+  # we avoid to large extent a race condition between multiple processes
+
+  my $inserted = 0;
+
+  { my @fields = qw(username email ip count totscore);
+    my @signedby;
+    if ($self->{_with_awl_signer}) {
+      push(@fields, 'signedby');
+      @signedby = !defined $signedby ? () : split(' ', lc $signedby);
+      @signedby = ( '' )  if !@signedby;
+    }
+    my @args = ($self->{_username}, $email, $ip, 1, $score);
+    my $sql = sprintf("INSERT INTO %s (%s) VALUES (%s)", $self->{tablename},
+                      join(',', @fields),  join(',', ('?') x @fields));
+    my $sth = $self->{dbh}->prepare($sql);
+
+    if (!$self->{_with_awl_signer}) {
+      my $rc = $sth->execute(@args);
+      if (!$rc) {
+        dbg("auto-whitelist: sql-based add_score/insert %s: SQL error: %s",
+             join('|',@args), $sth->errstr);
+      } else {
+        dbg("auto-whitelist: sql-based add_score/insert ".
+            "score %s: %s", $score, join('|',@args));
+        $inserted = 1; $entry->{exists_p} = 1;
+      }
+    } else {
+      for my $s (@signedby) {
+        my $rc = $sth->execute(@args, $s);
+        if (!$rc) {
+          dbg("auto-whitelist: sql-based add_score/insert %s: SQL error: %s",
+              join('|',@args,$s), $sth->errstr);
+        } else {
+          dbg("auto-whitelist: sql-based add_score/insert ".
+              "score %s: %s", $score, join('|',@args,$s));
+          $inserted = 1; $entry->{exists_p} = 1;
+        }
+      }
+    }
+  }
+
+  if (!$inserted) {
+    # insert failed, assume primary key constraint, so try the update
+
     my $sql = "UPDATE $self->{tablename} ".
               "SET count = count + 1, totscore = totscore + ? ".
               "WHERE username = ? AND email = ?";
-    my @signedby;
+    my(@args) = ($score, $self->{_username}, $email);
     if ($self->{_with_awl_signer}) {
-      @signedby = !defined $signedby ? '' : split(' ', lc $signedby);
-      if (@signedby == 1) {
+      my @signedby = !defined $signedby ? () : split(' ', lc $signedby);
+      if (!@signedby) {
+        $sql .= " AND signedby = ''";
+      } elsif (@signedby == 1) {
         $sql .= " AND signedby = ?";
       } elsif (@signedby > 1) {
         $sql .= " AND signedby IN (" . join(',', ('?') x @signedby) . ")";
       }
       push(@args, @signedby);
     }
-    if (!$self->{_with_awl_signer} || !defined $signedby || $signedby eq '') {
-      $sql .= " AND ip = ?";
-      push(@args, $ip);
-    }
+    $sql .= " AND ip = ?";
+    push(@args, $ip);
+
     my $sth = $self->{dbh}->prepare($sql);
     my $rc = $sth->execute(@args);
     
     if (!$rc) {
-      my $err = $self->{dbh}->errstr;
-      dbg("auto-whitelist: sql-based add_score/update: SQL error: $err");
-    }
-    else {
-      dbg("auto-whitelist: sql-based add_score: ".
+      info("auto-whitelist: sql-based add_score/update %s: SQL error: %s",
+           join('|',@args), $sth->errstr);
+    } else {
+      dbg("auto-whitelist: sql-based add_score/update ".
           "new count: %s, new totscore: %s for %s",
-          $entry->{count}, $entry->{totscore}, join('|',@args[2..$#args]));
-    }
-    $sth->finish();
-  }
-  else { # no entry yet, so insert a new entry
-    my @fields = qw(username email ip count totscore);
-    my @args = ($self->{_username}, $email, $ip, 1, $score);
-    my @signedby;
-    if ($self->{_with_awl_signer}) {
-      push(@fields, 'signedby');
-      @signedby = !defined $signedby ? '' : split(' ', lc $signedby);
-    }
-    my $sql = sprintf("INSERT INTO %s (%s) VALUES (%s)", $self->{tablename},
-                      join(',', @fields),  join(',', ('?') x @fields));
-    my $sth = $self->{dbh}->prepare($sql);
-    for my $s (@signedby) {
-      # loop runs for at least one element, possibly an empty signing domain
-      my $rc = $sth->execute(@args, (@fields > @args ? $s : ()) );
-      if (!$rc) {
-        my $err = $self->{dbh}->errstr;
-        dbg("auto-whitelist: sql-based add_score/insert: SQL error: $err");
-      }
+          $entry->{count}, $entry->{totscore}, join('|',@args));
+      $entry->{exists_p} = 1;
     }
-    $entry->{exists_p} = 1;
-    dbg("auto-whitelist: sql-based add_score: created new entry ".
-        "with totscore %s: %s", $score, join('|',@args[2..$#args]));
-    $sth->finish();
   }
   
   return $entry;
@@ -387,8 +409,8 @@
   my $rc = $sth->execute(@args);
 
   if (!$rc) {
-    my $err = $self->{dbh}->errstr;
-    dbg("auto-whitelist: sql-based remove_entry: SQL error: $err");
+    info("auto-whitelist: sql-based remove_entry %s: SQL error: %s",
+         join('|',@args), $sth->errstr);
   }
   else {
     # We might normally have a dbg saying we removed the address

Modified: spamassassin/trunk/sql/README.awl
URL: http://svn.apache.org/viewvc/spamassassin/trunk/sql/README.awl?rev=883770&r1=883769&r2=883770&view=diff
==============================================================================
--- spamassassin/trunk/sql/README.awl (original)
+++ spamassassin/trunk/sql/README.awl Tue Nov 24 16:32:51 2009
@@ -74,7 +74,7 @@
 
   username varchar(100)	  # this is the username whose e-mail is being filtered
   email varchar(200)      # this is the address key
-  ip    varchar(16)       # this is the ip key (fits IPv4/24 or IPv6/48)
+  ip    varchar(40)       # this is the ip key (fits IPv4 IPv6)
   count int(11)           # this is the message counter
   totscore float          # this is the total calculated score
   signedby varchar(255)   # a DKIM or DomainKeys signing domain(s)
@@ -108,7 +108,7 @@
 CREATE TABLE awl (
   username varchar(100) NOT NULL default '',
   email varchar(255) NOT NULL default '',
-  ip varchar(16) NOT NULL default '',
+  ip varchar(40) NOT NULL default '',
   count int(11) NOT NULL default '0',
   totscore float NOT NULL default '0',
   signedby varchar(255) NOT NULL default '',
@@ -137,11 +137,11 @@
   auto_whitelist_distinguish_signed 1
 
 To extend a field awl.ip on an existing table to be able to fit
-an IPv6 addresses (its /48 network part) or an IPv4 address (/24):
+an IPv6 addresses (39 characters would suffice) or an IPv4 address:
 under MySQL:
-  ALTER TABLE awl MODIFY ip varchar(16);
+  ALTER TABLE awl MODIFY ip varchar(40);
 under PostgreSQL:
-  ALTER TABLE awl ALTER ip TYPE varchar(16);
+  ALTER TABLE awl ALTER ip TYPE varchar(40);
 
 
 Once you have created the database and added the table, just add the

Modified: spamassassin/trunk/sql/awl_mysql.sql
URL: http://svn.apache.org/viewvc/spamassassin/trunk/sql/awl_mysql.sql?rev=883770&r1=883769&r2=883770&view=diff
==============================================================================
--- spamassassin/trunk/sql/awl_mysql.sql (original)
+++ spamassassin/trunk/sql/awl_mysql.sql Tue Nov 24 16:32:51 2009
@@ -1,7 +1,7 @@
 CREATE TABLE awl (
   username varchar(100) NOT NULL default '',
   email varchar(255) NOT NULL default '',
-  ip varchar(16) NOT NULL default '',
+  ip varchar(40) NOT NULL default '',
   count int(11) NOT NULL default '0',
   totscore float NOT NULL default '0',
   signedby varchar(255) NOT NULL default '',

Modified: spamassassin/trunk/sql/awl_pg.sql
URL: http://svn.apache.org/viewvc/spamassassin/trunk/sql/awl_pg.sql?rev=883770&r1=883769&r2=883770&view=diff
==============================================================================
--- spamassassin/trunk/sql/awl_pg.sql (original)
+++ spamassassin/trunk/sql/awl_pg.sql Tue Nov 24 16:32:51 2009
@@ -1,7 +1,7 @@
 CREATE TABLE awl (
   username varchar(100) NOT NULL default '',
   email varchar(255) NOT NULL default '',
-  ip varchar(16) NOT NULL default '',
+  ip varchar(40) NOT NULL default '',
   count bigint NOT NULL default '0',
   totscore float NOT NULL default '0',
   signedby varchar(255) NOT NULL default '',

Modified: spamassassin/trunk/t/data/spam/004
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/data/spam/004?rev=883770&r1=883769&r2=883770&view=diff
==============================================================================
--- spamassassin/trunk/t/data/spam/004 (original)
+++ spamassassin/trunk/t/data/spam/004 Tue Nov 24 16:32:51 2009
@@ -16,7 +16,7 @@
 Received: from ns.sakakura-kk.co.jp (ns.sakakura-kk.co.jp [61.119.13.18])
     by mail.netnoteinc.com (Postfix) with ESMTP id 4FD6F1143D6 for
     <jm...@netnoteinc.com>; Thu,  6 Dec 2001 23:58:02 +0000 (Eire)
-Received: from plain (ZHONGXIN [212.17.88.134]) by ns.sakakura-kk.co.jp
+Received: from plain (ZHONGXIN [212.17.35.134]) by ns.sakakura-kk.co.jp
     with SMTP (Microsoft Exchange Internet Mail Service Version 5.5.2653.13)
     id YJQP6DKP; Fri, 7 Dec 2001 08:15:01 +0900
 Received: from 144.137.3.98 (SquirrelMail authenticated user jmmail) by