You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2005/05/14 07:58:20 UTC
svn commit: r170124 -
/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
Author: felicity
Date: Fri May 13 22:58:18 2005
New Revision: 170124
URL: http://svn.apache.org/viewcvs?rev=170124&view=rev
Log:
Put domain information in the hash returned by get_uri_detail_list(). This means the URIBL plugin can be simplified to look at the already-parsed domains.
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=170124&r1=170123&r2=170124&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Fri May 13 22:58:18 2005
@@ -1841,8 +1841,7 @@
Returns an array of all unique URIs found in the message. It takes
a combination of the URIs found in the rendered (decoded and HTML
stripped) body and the URIs found when parsing the HTML in the message.
-Will also set $status->{uri_domain_count} (count of unique domains)
-and $status->{uri_list} (the array as returned by this function).
+Will also set $status->{uri_list} (the array as returned by this function).
The returned array will include the "raw" URI as well as
"slightly cooked" versions. For example, the single URI
@@ -1860,27 +1859,22 @@
}
my @uris = ();
+ $self->{redirect_num} = 0;
# get URIs from HTML parsing
while(my($uri, $info) = each %{ $self->get_uri_detail_list() }) {
if ($info->{cleaned}) {
- push(@uris, @{$info->{cleaned}});
- }
- }
+ foreach (@{$info->{cleaned}}) {
+ push(@uris, $_);
- # get domain list
- $self->{redirect_num} = 0;
- my %domains;
- for (@uris) {
- # count redirection attempts and log it
- if (my @http = m{\b(https?:/{0,2})}gi) {
- $self->{redirect_num} = $#http if ($#http > $self->{redirect_num});
+ # count redirection attempts and log it
+ if (my @http = m{\b(https?:/{0,2})}gi) {
+ $self->{redirect_num} = $#http if ($#http > $self->{redirect_num});
+ }
+ }
}
- my $domain = Mail::SpamAssassin::Util::uri_to_domain($_);
- $domains{$domain} = 1 if $domain;
}
- $self->{uri_domain_count} = keys %domains;
$self->{uri_list} = \@uris;
return @uris;
@@ -1893,7 +1887,8 @@
combination of the URIs found in the rendered (decoded and HTML stripped)
body and the URIs found when parsing the HTML in the message. Will also
set $status->{uri_detail_list} (the hash reference as returned by this
-function).
+function). This function will also set $status->{uri_domain_count} (count of
+unique domains).
The hash format looks something like this:
@@ -1901,6 +1896,7 @@
types => { a => 1, img => 1, parsed => 1 },
cleaned => [ canonified_uri ],
anchor_text => [ "click here", "no click here" ],
+ domains => { domain1 => 1, domain2 => 1 },
}
C<raw_uri> is whatever the URI was in the message itself
@@ -1916,6 +1912,8 @@
C<anchor_text> is an array of the anchor text (text between <a> and
</a>), if any, which linked to the URI.
+C<domains> is a hash of the domains found in the canonified URIs.
+
=cut
sub get_uri_detail_list {
@@ -1926,6 +1924,8 @@
return $self->{uri_detail_list};
}
+ $self->{uri_domain_count} = 0;
+
# do this so we're sure metadata->html is setup
my @parsed = $self->_get_parsed_uri_list();
@@ -1941,29 +1941,56 @@
my @tmp = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, $uri);
$info->{cleaned} = \@tmp;
+ foreach (@tmp) {
+ my $domain = Mail::SpamAssassin::Util::uri_to_domain($_);
+ if ($domain && !$info->{domains}->{$domain}) {
+ $info->{domains}->{$domain} = 1;
+ $self->{uri_domain_count}++;
+ }
+ }
+
if (would_log('dbg', 'uri')) {
dbg("uri: html uri found, $uri");
foreach my $nuri (@tmp) {
dbg("uri: cleaned html uri, $nuri");
}
+ if ($info->{domains}) {
+ foreach my $domain (keys %{$info->{domains}}) {
+ dbg("uri: html domain, $domain");
+ }
+ }
}
}
# canonify the text parsed URIs
foreach my $uri ( @parsed ) {
$detail->{$uri}->{types}->{parsed} = 1;
+ my $info = $detail->{$uri};
my @uris = ();
- if (!exists $detail->{$uri}->{cleaned}) {
+ if (!exists $info->{cleaned}) {
@uris = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, $uri);
- $detail->{$uri}->{cleaned} = \@uris;
+ $info->{cleaned} = \@uris;
+
+ foreach (@uris) {
+ my $domain = Mail::SpamAssassin::Util::uri_to_domain($_);
+ if ($domain && !$info->{domains}->{$domain}) {
+ $info->{domains}->{$domain} = 1;
+ $self->{uri_domain_count}++;
+ }
+ }
}
if (would_log('dbg', 'uri')) {
dbg("uri: parsed uri found, $uri");
foreach my $nuri (@uris) {
dbg("uri: parsed html uri, $nuri");
+ }
+ if ($info->{domains}) {
+ foreach my $domain (keys %{$info->{domains}}) {
+ dbg("uri: parsed domain, $domain");
+ }
}
}
}
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm?rev=170124&r1=170123&r2=170124&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm Fri May 13 22:58:18 2005
@@ -188,6 +188,9 @@
# get all domains in message
+ # don't keep dereferencing this
+ my $skip_domains = $scanner->{main}->{conf}->{uridnsbl_skip_domains};
+
# list of arrays to use in order
my @uri_ordered = ();
@@ -202,6 +205,12 @@
# 4: parsed
# 5: a_empty
while (my($uri, $info) = each %{$uris}) {
+ # we want to skip mailto: uris
+ next if ($uri =~ /^mailto:/);
+
+ # no domains were found via this uri, so skip
+ next unless ($info->{domains});
+
my $entry = 3;
if ($info->{types}->{a}) {
@@ -225,42 +234,39 @@
$entry = 4;
}
- push(@{$uri_ordered[$entry]}, @{$info->{cleaned}});
+ # take the usable domains and add to the ordered list
+ foreach ( keys %{ $info->{domains} } ) {
+ if (exists $skip_domains->{$_}) {
+ dbg("uridnsbl: domain $_ in skip list");
+ next;
+ }
+ $uri_ordered[$entry]->{$_} = 1;
+ }
}
- # at this point, @uri_ordered is an ordered array of uri arrays
+ # at this point, @uri_ordered is an ordered array of uri hashes
my %domlist = ();
while (keys %domlist < $scanner->{main}->{conf}->{uridnsbl_max_domains} && @uri_ordered) {
my $array = shift @uri_ordered;
next unless $array;
- my %domains = ();
-
- # run through and find the domains in this grouping
- foreach (@{$array}) {
- my $domain = $self->usable_uri_domain($scanner->{main}->{conf}->{uridnsbl_skip_domains}, $_);
- next unless $domain;
- next if $domlist{$domain};
- $domains{$domain} = 1;
- }
-
- # at this point %domains has the list of new domains found in this
- # grouping
+ # run through and find the new domains in this grouping
+ my @domains = grep(!$domlist{$_}, keys %{$array});
+ next unless @domains;
# the new domains are all useful, just add them in
- if (keys(%domlist) + keys(%domains) <= $scanner->{main}->{conf}->{uridnsbl_max_domains}) {
- foreach (keys %domains) {
+ if (keys(%domlist) + @domains <= $scanner->{main}->{conf}->{uridnsbl_max_domains}) {
+ foreach (@domains) {
$domlist{$_} = 1;
}
}
else {
# trim down to a limited number - pick randomly
my $i;
- my @longlist = keys %domains;
- while (@longlist && keys %domlist < $scanner->{main}->{conf}->{uridnsbl_max_domains}) {
- my $r = int rand (scalar @longlist);
- $domlist{splice (@longlist, $r, 1)} = 1;
+ while (@domains && keys %domlist < $scanner->{main}->{conf}->{uridnsbl_max_domains}) {
+ my $r = int rand (scalar @domains);
+ $domlist{splice (@domains, $r, 1)} = 1;
}
}
}
@@ -272,23 +278,6 @@
}
return 1;
-}
-
-sub usable_uri_domain {
- my($self, $skip_domains, $uri) = @_;
-
- return if ($uri =~ /^mailto:/i);
- my $dom = Mail::SpamAssassin::Util::uri_to_domain($uri);
- if ($dom) {
- if (exists $skip_domains->{$dom}) {
- dbg("uridnsbl: domain $dom in skip list");
- }
- else {
- return $dom;
- }
- }
-
- return;
}
sub set_config {