You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by mm...@apache.org on 2010/06/29 01:29:03 UTC

svn commit: r958790 - in /spamassassin/trunk/lib/Mail/SpamAssassin: Conf.pm Plugin/WLBLEval.pm

Author: mmartinec
Date: Mon Jun 28 23:29:02 2010
New Revision: 958790

URL: http://svn.apache.org/viewvc?rev=958790&view=rev
Log:
Bug 6458 - add enlist_uri_host and delist_uri_host conf directives,
allowing for arbitrarily named URI lists, each associated with
its own scoring rule

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/WLBLEval.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm?rev=958790&r1=958789&r2=958790&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm Mon Jun 28 23:29:02 2010
@@ -642,74 +642,135 @@ e.g.
   });
 
 
-=item blacklist_uri_host host-or-domain ...
+=item enlist_uri_host (listname) host ...
+
+Adds one or more host names or domain names to a named list of URI domains.
+The named list can then be consulted through a check_uri_host_listed()
+eval rule, which takes the list name as an argument. Parentheses around
+a list name are literal - a required syntax.
+
+Host names may optionally be prefixed by an exclamation mark '!', which
+produces false as a result if this entry matches. This makes it easier
+to exclude some subdomains when their superdomain is listed, for example:
+
+  enlist_uri_host (MYLIST) !sub1.example.com !sub2.example.com example.com
 
-Adds one or more host names to a list of blacklisted URI domains.
+No wildcards are supported, but subdomains do match implicitly. Lists
+are independent. Search for each named list starts by looking up the
+full hostname first, then leading fields are progressively stripped off
+(e.g.: sub.example.com, example.com, com) until a match is found or we run
+out of fields. The first matching entry (the most specific) determines if
+a lookup yielded a true (no '!' prefix) or a false ('!'-prefixed) result.
 
-No wildcards are supported, but subdomains do match implicitly. There is
-only one combined list for black- and whitelisting of host names in URIs.
-Search starts by looking up the full hostname first, then leading fields
-are progressively stripped off (e.g.: sub.example.com, example.com, com)
-until a match is found or we run out of fields. The first matching entry
-(the most specific) determines if a lookup yielded a blacklisted or a
-whitelisted result.
-
-If an URL contains an IP address in place of a host name, the
-black- (or white-) list must specify the exact same IP address.
-
-A domain cannot be both blacklisted and whitelisted at the same time, the
-last directive prevails. Use the unlist_uri_host directive to neutralize
-previous blacklist_uri_host and whitelist_uri_host settings.
+If a URL found in a message contains an IP address in place of a host name,
+the given list must specify the exact same IP address (instead of a host name)
+in order to match.
+
+Use the delist_uri_host directive to neutralize previous enlist_uri_host
+settings. Listnames 'BLACK' and 'WHITE' have their shorthand directives
+blacklist_uri_host and whitelist_uri_host and default rules, but are
+otherwise not special or reserved.
 
 =cut
 
   push (@cmds, {
-    command => 'blacklist_uri_host',
-    setting => 'wblist_uri_host',
+    command => 'enlist_uri_host',
+    setting => 'uri_host_lists',
     type => $CONF_TYPE_ADDRLIST,
     code => sub {
       my($conf, $key, $value, $line) = @_;
-      my $listref = $conf->{wblist_uri_host};
-      $conf->{wblist_uri_host} = $listref = {}  if !$listref;
-      $listref->{$_} = +1  for split(' ', lc $value);
+      local($1,$2);
+      if ($value !~ /^ \( (.*?) \) \s+ (.*) \z/sx) {
+        return $MISSING_REQUIRED_VALUE;
+      }
+      my $listname = $1;  # corresponds to arg in check_uri_host_in_wblist()
+      # note: must not factor out dereferencing, as otherwise
+      # subhashes would spring up in a copy and be lost
+      foreach my $host ( split(' ', lc $2) ) {
+        my $v = $host =~ s/^!// ? 0 : 1;
+        $conf->{uri_host_lists}{$listname}{$host} = $v;
+      }
     }
   });
 
-=item whitelist_uri_host host-or-domain ...
+=item delist_uri_host [ (listname) ] host ...
 
-Adds one or more host names to a list of whitelisted URI domains.
-See blacklist_uri_host directive for details.
+Removes one or more specified host names from a named list of URI domains.
+Removing an unlisted name is ignored (is not an error). Listname is optional:
+if specified then just the named list is affected, otherwise hosts are
+removed from all URI host lists created so far. Parentheses around a list
+name are a required syntax.
+
+Note that directives in configuration files are processed in sequence:
+the delist_uri_host directive only applies to previously listed entries and
+has no effect on enlisted entries in yet-to-be-processed directives.
+
+For convenience (similarity to the enlist_uri_host directive) hostnames
+may be prefixed by an exclamation mark, which is stripped off from each
+name and has no meaning here.
 
 =cut
 
   push (@cmds, {
-    command => 'whitelist_uri_host',
-    setting => 'wblist_uri_host',
+    command => 'delist_uri_host',
+    setting => 'uri_host_lists',
+    type => $CONF_TYPE_ADDRLIST,
+    code => sub {
+      my($conf, $key, $value, $line) = @_;
+      local($1,$2);
+      if ($value !~ /^ (?: \( (.*?) \) \s+ )? (.*) \z/sx) {
+        return $MISSING_REQUIRED_VALUE;
+      }
+      my @listnames = defined $1 ? $1 : keys %{$conf->{uri_host_lists}};
+      my @args = split(' ', lc $2);
+      foreach my $listname (@listnames) {
+        foreach my $host (@args) {
+          my $v = $host =~ s/^!// ? 0 : 1;
+          delete $conf->{uri_host_lists}{$listname}{$host};
+        }
+      }
+    }
+  });
+
+=item blacklist_uri_host host-or-domain ...
+
+Is a shorthand for a directive:  enlist_uri_host (BLACK) host ...
+
+Please see directives enlist_uri_host and delist_uri_host for details.
+
+=cut
+
+  push (@cmds, {
+    command => 'blacklist_uri_host',
+    setting => 'uri_host_lists',
     type => $CONF_TYPE_ADDRLIST,
     code => sub {
       my($conf, $key, $value, $line) = @_;
-      my $listref = $conf->{wblist_uri_host};
-      $conf->{wblist_uri_host} = $listref = {}  if !$listref;
-      $listref->{$_} = -1  for split(' ', lc $value);
+      foreach my $host ( split(' ', lc $value) ) {
+        my $v = $host =~ s/^!// ? 0 : 1;
+        $conf->{uri_host_lists}{'BLACK'}{$host} = $v;
+      }
     }
   });
 
-=item unlist_uri_host host-or-domain ...
+=item whitelist_uri_host host-or-domain ...
 
-Removes one or more specified host names from a list of black- or whitelisted
-URI domains. Removing an unlisted name is ignored (is not an error).
+Is a shorthand for a directive:  enlist_uri_host (WHITE) host ...
+
+Please see directives enlist_uri_host and delist_uri_host for details.
 
 =cut
 
   push (@cmds, {
-    command => 'unlist_uri_host',
-    setting => 'wblist_uri_host',
+    command => 'whitelist_uri_host',
+    setting => 'uri_host_lists',
     type => $CONF_TYPE_ADDRLIST,
     code => sub {
       my($conf, $key, $value, $line) = @_;
-      my $listref = $conf->{wblist_uri_host};
-      $conf->{wblist_uri_host} = $listref = {}  if !$listref;
-      delete $listref->{$_}  for split(' ', lc $value);
+      foreach my $host ( split(' ', lc $value) ) {
+        my $v = $host =~ s/^!// ? 0 : 1;
+        $conf->{uri_host_lists}{'WHITE'}{$host} = $v;
+      }
     }
   });
 
@@ -4171,6 +4232,8 @@ sub clone {
     $dest = $self;
   }
 
+  my %done;
+
   # keys that should not be copied in ->clone().
   # bug 4179: include want_rebuild_for_type, so that if a user rule
   # is defined, its method will be recompiled for future scans in
@@ -4180,22 +4243,30 @@ sub clone {
     scoreset scores want_rebuild_for_type
   );
 
+  # special cases.  first, skip anything that cannot be changed
+  # by users, and the stuff we take care of here
+  foreach my $var (@NON_COPIED_KEYS) {
+    $done{$var} = undef;
+  }
+
   # keys that should can be copied using a ->clone() method, in ->clone()
   my @CLONABLE_KEYS = qw(
     internal_networks trusted_networks msa_networks 
   );
 
-  my %done;
-
-  # special cases.  first, skip anything that cannot be changed
-  # by users, and the stuff we take care of here
   foreach my $key (@CLONABLE_KEYS) {
     $dest->{$key} = $source->{$key}->clone();
     $done{$key} = undef;
   }
 
-  foreach my $var (@NON_COPIED_KEYS) {
-    $done{$var} = undef;
+  # two-level hash(es)
+  foreach my $key ('uri_host_lists') {
+    my $v = $source->{$key};
+    my $dest_key_ref = $dest->{$key} = {};  # must start from scratch!
+    while(my($k2,$v2) = each %{$v}) {
+      %{$dest_key_ref->{$k2}} = %{$v2};
+    }
+    $done{$key} = undef;
   }
 
   # bug 4179: be smarter about cloning the rule-type structures;
@@ -4309,7 +4380,7 @@ sub sa_die { Mail::SpamAssassin::sa_die(
 sub feature_originating_ip_headers { 1 }
 sub feature_dns_local_ports_permit_avoid { 1 }
 sub feature_bayes_auto_learn_on_error { 1 }
-sub feature_uri_host_wblist { 1 }
+sub feature_uri_host_listed { 1 }
 
 ###########################################################################
 

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/WLBLEval.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/WLBLEval.pm?rev=958790&r1=958789&r2=958790&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/WLBLEval.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/WLBLEval.pm Mon Jun 28 23:29:02 2010
@@ -51,8 +51,7 @@ sub new {
   $self->register_eval_rule("check_from_in_default_whitelist");
   $self->register_eval_rule("check_forged_in_default_whitelist");
   $self->register_eval_rule("check_mailfrom_matches_rcvd");
-  $self->register_eval_rule("check_uri_host_in_blacklist");
-  $self->register_eval_rule("check_uri_host_in_whitelist");
+  $self->register_eval_rule("check_uri_host_listed");
 
   return $self;
 }
@@ -361,94 +360,94 @@ sub _check_whitelist {
 
 sub check_uri_host_in_blacklist {
   my ($self, $pms) = @_;
-  my $conf = $self->{main}{conf};
-  my($host_bl, $host_wl) =
-    $self->_check_uri_wblist($pms, $conf->{wblist_uri_host});
-  if (defined $host_bl) {
-    dbg("rules: uri host blacklisted: $host_bl");
-    $pms->test_log("URI: $host_bl");
-    return 1;
-  }
-  return 0;
+  $self->check_uri_host_listed($pms, 'BLACK');
 }
 
 sub check_uri_host_in_whitelist {
   my ($self, $pms) = @_;
-  my $conf = $self->{main}{conf};
-  my($host_bl, $host_wl) =
-    $self->_check_uri_wblist($pms, $conf->{wblist_uri_host});
-  if (defined $host_wl) {
-    dbg("rules: uri host whitelisted: $host_wl");
-    $pms->test_log("URI: $host_wl");
-    return 1;
+  $self->check_uri_host_listed($pms, 'WHITE');
+}
+
+sub check_uri_host_listed {
+  my ($self, $pms, $subname) = @_;
+  my $host_enlisted_ref = $self->_check_uri_host_listed($pms);
+  if ($host_enlisted_ref) {
+    my $matched_host = $host_enlisted_ref->{$subname};
+    if ($matched_host) {
+      dbg("rules: uri host enlisted (%s): %s", $subname, $matched_host);
+      $pms->test_log("URI: $matched_host");
+      return 1;
+    }
   }
   return 0;
 }
 
-sub _check_uri_wblist {
-  my ($self, $pms, $wb_hashref) = @_;
+sub _check_uri_host_listed {
+  my ($self, $pms) = @_;
 
-  if ($pms->{'uri_wblisted'}) {
-    # just provide a cached result
-  } elsif (!$wb_hashref || !%$wb_hashref) {
-    $pms->{'uri_wblisted'} = [ undef, undef ];
-  } else {
-    my $host_blacklisted;
-    my $host_whitelisted;
-    $wb_hashref = {}  if !$wb_hashref;
-    if (would_log("dbg","rules")) {
-      dbg("rules: check_uri_wblist: %s",
-        join(', ', map { $_.'='.$wb_hashref->{$_} } sort keys %$wb_hashref));
-    }
-    # obtain a full list of html-parsed domains
-    my $uris = $pms->get_uri_detail_list();
-    my %seen;
-    while (my($uri,$info) = each %$uris) {
-      next if $uri =~ /^mailto:/i;  # we may want to skip mailto: uris (?)
-      while (my($host,$domain) = each( %{$info->{hosts}} )) {  # typically one
-        next if $seen{$host};
-        $seen{$host} = 1;
-        local($1,$2);
-        my @query_keys;
-        if ($host =~ /^\[(.*)\]\z/) {  # looks like an address literal
-          @query_keys = ( $1 );
-        } elsif ($host =~ /^\d+\.\d+\.\d+\.\d+\z/) {  # IPv4 address
-          @query_keys = ( $host );
-        } elsif ($host ne '') {
-          my($h) = $host;
-          for (;;) {
-            push(@query_keys, $h);  # sub.example.com, example.com, com
-            last if $h !~ s{^([^.]*)\.(.*)\z}{$2}s;
-          }
-          if (@query_keys > 10) {  # sanity limit, keep the tail
-            @query_keys = @query_keys[$#query_keys-9 .. $#query_keys];
-          }
+  if ($pms->{'uri_host_enlisted'}) {
+    return $pms->{'uri_host_enlisted'};  # just provide a cached result
+  }
+
+  my $uri_lists_href = $self->{main}{conf}{uri_host_lists};
+  if (!$uri_lists_href || !%$uri_lists_href) {
+    $pms->{'uri_host_enlisted'} = {};  # no URI host lists
+    return $pms->{'uri_host_enlisted'};
+  }
+
+  my %host_enlisted;
+  my @uri_listnames = sort keys %$uri_lists_href;
+  if (would_log("dbg","rules")) {
+    foreach my $nm (@uri_listnames) {
+      dbg("rules: check_uri_host_listed: (%s) %s",
+          $nm, join(', ', map { $uri_lists_href->{$nm}{$_} ? $_ : '!'.$_ }
+                              sort keys %{$uri_lists_href->{$nm}}));
+    }
+  }
+  # obtain a complete list of html-parsed domains
+  my $uris = $pms->get_uri_detail_list();
+  my %seen;
+  while (my($uri,$info) = each %$uris) {
+    next if $uri =~ /^mailto:/i;  # we may want to skip mailto: uris (?)
+    while (my($host,$domain) = each( %{$info->{hosts}} )) {  # typically one
+      next if $seen{$host};
+      $seen{$host} = 1;
+      local($1,$2);
+      my @query_keys;
+      if ($host =~ /^\[(.*)\]\z/) {  # looks like an address literal
+        @query_keys = ( $1 );
+      } elsif ($host =~ /^\d+\.\d+\.\d+\.\d+\z/) {  # IPv4 address
+        @query_keys = ( $host );
+      } elsif ($host ne '') {
+        my($h) = $host;
+        for (;;) {
+          shift @query_keys  if @query_keys > 10;  # sanity limit, keep tail
+          push(@query_keys, $h);  # sub.example.com, example.com, com
+          last if $h !~ s{^([^.]*)\.(.*)\z}{$2}s;
         }
-        my $wb_verdict;  # positive=blacklisted; negative=whitelisted
+      }
+      foreach my $nm (@uri_listnames) {
         my $match;
-        for my $q (@query_keys) {
-          $wb_verdict = $wb_hashref->{$q};
-          if ($wb_verdict) { $match = $q; last }
+        my $verdict;
+        my $hash_nm_ref = $uri_lists_href->{$nm};
+        foreach my $q (@query_keys) {
+          $verdict = $hash_nm_ref->{$q};
+          if (defined $verdict) {
+            $match = $q eq $host ? $host : "$host ($q)";
+            $match = '!'  if !$verdict;
+            last;
+          }
         }
-        if (!$wb_verdict) {
-        # dbg("rules: check_uri_wblist %s, NO MATCH for %s, search: %s",
-        #     $uri, $host, join(', ',@query_keys));
-        } elsif ($wb_verdict > 0) {
-          $host_blacklisted = $host;
-          $host_blacklisted .= " ($match)"  if $match ne $host;
-          dbg("rules: check_uri_wblist %s, BLACK: %s, search: %s",
-              $uri, $host_blacklisted, join(', ',@query_keys));
-        } elsif ($wb_verdict < 0) {
-          $host_whitelisted = $host;
-          $host_whitelisted .= " ($match)"  if $match ne $host;
-          dbg("rules: check_uri_wblist %s, WHITE: %s, search: %s",
-              $uri, $host_whitelisted, join(', ',@query_keys));
+        if (defined $verdict) {
+          $host_enlisted{$nm} = $match  if $verdict;
+          dbg("rules: check_uri_host_listed %s, (%s): %s, search: %s",
+              $uri, $nm, $match, join(', ',@query_keys));
         }
       }
     }
-    $pms->{'uri_wblisted'} = [ $host_blacklisted, $host_whitelisted ];
   }
-  return @{ $pms->{'uri_wblisted'} };
+  $pms->{'uri_host_enlisted'} = \%host_enlisted;
+  return $pms->{'uri_host_enlisted'};
 }
 
 1;