You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2005/05/14 07:58:20 UTC

svn commit: r170124 - /spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm /spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm

Author: felicity
Date: Fri May 13 22:58:18 2005
New Revision: 170124

URL: http://svn.apache.org/viewcvs?rev=170124&view=rev
Log:
Put domain information in the hash returned by get_uri_detail_list().  This means the URIBL plugin can be simplified to look at the already-parsed domains.

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=170124&r1=170123&r2=170124&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Fri May 13 22:58:18 2005
@@ -1841,8 +1841,7 @@
 Returns an array of all unique URIs found in the message.  It takes
 a combination of the URIs found in the rendered (decoded and HTML
 stripped) body and the URIs found when parsing the HTML in the message.
-Will also set $status->{uri_domain_count} (count of unique domains)
-and $status->{uri_list} (the array as returned by this function).
+Will also set $status->{uri_list} (the array as returned by this function).
 
 The returned array will include the "raw" URI as well as
 "slightly cooked" versions.  For example, the single URI
@@ -1860,27 +1859,22 @@
   }
 
   my @uris = ();
+  $self->{redirect_num} = 0;
 
   # get URIs from HTML parsing
   while(my($uri, $info) = each %{ $self->get_uri_detail_list() }) {
     if ($info->{cleaned}) {
-      push(@uris, @{$info->{cleaned}});
-    }
-  }
+      foreach (@{$info->{cleaned}}) {
+        push(@uris, $_);
 
-  # get domain list
-  $self->{redirect_num} = 0;
-  my %domains;
-  for (@uris) {
-    # count redirection attempts and log it
-    if (my @http = m{\b(https?:/{0,2})}gi) {
-      $self->{redirect_num} = $#http if ($#http > $self->{redirect_num});
+        # count redirection attempts and log it
+        if (my @http = m{\b(https?:/{0,2})}gi) {
+          $self->{redirect_num} = $#http if ($#http > $self->{redirect_num});
+        }
+      }
     }
-    my $domain = Mail::SpamAssassin::Util::uri_to_domain($_);
-    $domains{$domain} = 1 if $domain;
   }
 
-  $self->{uri_domain_count} = keys %domains;
   $self->{uri_list} = \@uris;
 
   return @uris;
@@ -1893,7 +1887,8 @@
 combination of the URIs found in the rendered (decoded and HTML stripped)
 body and the URIs found when parsing the HTML in the message.  Will also
 set $status->{uri_detail_list} (the hash reference as returned by this
-function).
+function).  This function will also set $status->{uri_domain_count} (count of
+unique domains).
 
 The hash format looks something like this:
 
@@ -1901,6 +1896,7 @@
     types => { a => 1, img => 1, parsed => 1 },
     cleaned => [ canonified_uri ],
     anchor_text => [ "click here", "no click here" ],
+    domains => { domain1 => 1, domain2 => 1 },
   }
 
 C<raw_uri> is whatever the URI was in the message itself
@@ -1916,6 +1912,8 @@
 C<anchor_text> is an array of the anchor text (text between <a> and
 </a>), if any, which linked to the URI.
 
+C<domains> is a hash of the domains found in the canonified URIs.
+
 =cut
 
 sub get_uri_detail_list {
@@ -1926,6 +1924,8 @@
     return $self->{uri_detail_list};
   }
 
+  $self->{uri_domain_count} = 0;
+
   # do this so we're sure metadata->html is setup
   my @parsed = $self->_get_parsed_uri_list();
 
@@ -1941,29 +1941,56 @@
     my @tmp = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, $uri);
     $info->{cleaned} = \@tmp;
 
+    foreach (@tmp) {
+      my $domain = Mail::SpamAssassin::Util::uri_to_domain($_);
+      if ($domain && !$info->{domains}->{$domain}) {
+        $info->{domains}->{$domain} = 1;
+        $self->{uri_domain_count}++;
+      }
+    }
+
     if (would_log('dbg', 'uri')) {
       dbg("uri: html uri found, $uri");
       foreach my $nuri (@tmp) {
         dbg("uri: cleaned html uri, $nuri");
       }
+      if ($info->{domains}) {
+        foreach my $domain (keys %{$info->{domains}}) {
+          dbg("uri: html domain, $domain");
+        }
+      }
     }
   }
 
   # canonify the text parsed URIs
   foreach my $uri ( @parsed ) {
     $detail->{$uri}->{types}->{parsed} = 1;
+    my $info = $detail->{$uri};
 
     my @uris = ();
     
-    if (!exists $detail->{$uri}->{cleaned}) {
+    if (!exists $info->{cleaned}) {
       @uris = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, $uri);
-      $detail->{$uri}->{cleaned} = \@uris;
+      $info->{cleaned} = \@uris;
+
+      foreach (@uris) {
+        my $domain = Mail::SpamAssassin::Util::uri_to_domain($_);
+        if ($domain && !$info->{domains}->{$domain}) {
+          $info->{domains}->{$domain} = 1;
+          $self->{uri_domain_count}++;
+        }
+      }
     }
 
     if (would_log('dbg', 'uri')) {
       dbg("uri: parsed uri found, $uri");
       foreach my $nuri (@uris) {
         dbg("uri: parsed html uri, $nuri");
+      }
+      if ($info->{domains}) {
+        foreach my $domain (keys %{$info->{domains}}) {
+          dbg("uri: parsed domain, $domain");
+        }
       }
     }
   }

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm?rev=170124&r1=170123&r2=170124&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm Fri May 13 22:58:18 2005
@@ -188,6 +188,9 @@
 
   # get all domains in message
 
+  # don't keep dereferencing this
+  my $skip_domains = $scanner->{main}->{conf}->{uridnsbl_skip_domains};
+
   # list of arrays to use in order
   my @uri_ordered = ();
 
@@ -202,6 +205,12 @@
   # 4: parsed
   # 5: a_empty
   while (my($uri, $info) = each %{$uris}) {
+    # we want to skip mailto: uris
+    next if ($uri =~ /^mailto:/);
+
+    # no domains were found via this uri, so skip
+    next unless ($info->{domains});
+
     my $entry = 3;
 
     if ($info->{types}->{a}) {
@@ -225,42 +234,39 @@
       $entry = 4;
     }
 
-    push(@{$uri_ordered[$entry]}, @{$info->{cleaned}});
+    # take the usable domains and add to the ordered list
+    foreach ( keys %{ $info->{domains} } ) {
+      if (exists $skip_domains->{$_}) {
+        dbg("uridnsbl: domain $_ in skip list");
+        next;
+      }
+      $uri_ordered[$entry]->{$_} = 1;
+    }
   }
 
-  # at this point, @uri_ordered is an ordered array of uri arrays
+  # at this point, @uri_ordered is an ordered array of uri hashes
 
   my %domlist = ();
   while (keys %domlist < $scanner->{main}->{conf}->{uridnsbl_max_domains} && @uri_ordered) {
     my $array = shift @uri_ordered;
     next unless $array;
 
-    my %domains = ();
-
-    # run through and find the domains in this grouping
-    foreach (@{$array}) {
-      my $domain = $self->usable_uri_domain($scanner->{main}->{conf}->{uridnsbl_skip_domains}, $_);
-      next unless $domain;
-      next if $domlist{$domain};
-      $domains{$domain} = 1;
-    }
-
-    # at this point %domains has the list of new domains found in this
-    # grouping
+    # run through and find the new domains in this grouping
+    my @domains = grep(!$domlist{$_}, keys %{$array});
+    next unless @domains;
 
     # the new domains are all useful, just add them in
-    if (keys(%domlist) + keys(%domains) <= $scanner->{main}->{conf}->{uridnsbl_max_domains}) {
-      foreach (keys %domains) {
+    if (keys(%domlist) + @domains <= $scanner->{main}->{conf}->{uridnsbl_max_domains}) {
+      foreach (@domains) {
         $domlist{$_} = 1;
       }
     }
     else {
       # trim down to a limited number - pick randomly
       my $i;
-      my @longlist = keys %domains;
-      while (@longlist && keys %domlist < $scanner->{main}->{conf}->{uridnsbl_max_domains}) {
-        my $r = int rand (scalar @longlist);
-        $domlist{splice (@longlist, $r, 1)} = 1;
+      while (@domains && keys %domlist < $scanner->{main}->{conf}->{uridnsbl_max_domains}) {
+        my $r = int rand (scalar @domains);
+        $domlist{splice (@domains, $r, 1)} = 1;
       }
     }
   }
@@ -272,23 +278,6 @@
   }
 
   return 1;
-}
-
-sub usable_uri_domain {
-  my($self, $skip_domains, $uri) = @_;
-
-  return if ($uri =~ /^mailto:/i);
-  my $dom = Mail::SpamAssassin::Util::uri_to_domain($uri);
-  if ($dom) {
-    if (exists $skip_domains->{$dom}) {
-      dbg("uridnsbl: domain $dom in skip list");
-    }
-    else {
-      return $dom;
-    }
-  }
-
-  return;
 }
 
 sub set_config {