You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2005/03/12 03:41:28 UTC
svn commit: r157209 - in spamassassin/trunk/lib/Mail/SpamAssassin:
EvalTests.pm HTML.pm PerMsgStatus.pm Plugin/URIDNSBL.pm
Author: felicity
Date: Fri Mar 11 18:41:26 2005
New Revision: 157209
URL: http://svn.apache.org/viewcvs?view=rev&rev=157209
Log:
Change the HTML-parsed URI code again to be a bit more concise, restore the 3.0 API, etc. Change the URIBL ranking and an eval test to use the new layout.
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm?view=diff&r1=157208&r2=157209
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm Fri Mar 11 18:41:26 2005
@@ -3366,10 +3366,9 @@
sub check_https_ip_mismatch {
my ($self) = @_;
- while (my($k,$v) = each %{$self->{html}->{uri_anchor_index}}) {
+ while (my($k,$v) = each %{$self->{html}->{uri_detail}}) {
next if ($k !~ m%^https?:/*(?:[^\@/]+\@)?\d+\.\d+\.\d+\.\d+%i);
- foreach (@{$v}) {
- $_ = $self->{html}->{anchor}->[$_];
+ foreach (@{$v->{anchor_text}}) {
next if (m%^https:/*(?:[^\@/]+\@)?\d+\.\d+\.\d+\.\d+%i);
return 1 if (m%https:%i);
}
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm?view=diff&r1=157208&r2=157209
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm Fri Mar 11 18:41:26 2005
@@ -131,40 +131,29 @@
delete $self->{text_style};
- # deal with the previous a tag. if the part in between
- # <a href=...> and </a> is not blank (ie: there was something there we
- # consider visible), add the uri to the list. otherwise, drop the uri and
- # mark that we found an "empty uri".
- # Note: this is also done in html_tests
- if ($self->{anchor_last}) {
- if (length $self->{anchor}->[$self->{anchor_index}]) {
- $self->push_uri('a', $self->{anchor_last});
- }
- else {
- $self->push_uri('a_empty', $self->{anchor_last});
- }
- push(@{$self->{anchor_uri_index}->{$self->{anchor_last}}}, $self->{anchor_index});
- }
+ my @uri = ();
-# my @uri;
+ # add the canonified version of each uri to the detail list
if (defined $self->{uri}) {
- while(my($type, $array) = each %{ $self->{uri} }) {
-# push(@uri, @{$array});
- my @tmp = Mail::SpamAssassin::Util::uri_list_canonify(@{$array});
- $self->{uri_cooked}->{$type} = \@tmp;
+ while(my($uri, $info) = each %{ $self->{uri} }) {
+ my @tmp = Mail::SpamAssassin::Util::uri_list_canonify($uri);
+ $info->{cleaned} = \@tmp;
# list out the URLs for debugging ...
if (Mail::SpamAssassin::dbg_check('uri')) {
foreach my $nuri (@tmp) {
- dbg("uri: uri found, type $type: $nuri");
+ dbg("uri: uri found, $nuri");
}
}
}
+
+ @uri = keys %{$self->{uri}};
}
-# $self->put_results(uri => \@uri);
- $self->put_results(uri_raw => $self->{uri});
- $self->put_results(uri_canon => $self->{uri_cooked});
- $self->put_results(uri_anchor_index => $self->{anchor_uri_index});
+ # these keep backward compatibility, albeit a little wasteful
+ $self->put_results(uri => \@uri);
+ $self->put_results(anchor => $self->{anchor});
+
+ $self->put_results(uri_detail => $self->{uri});
# final results scalars
$self->put_results(image_area => $self->{image_area});
@@ -178,7 +167,6 @@
}
# final result arrays
- $self->put_results(anchor => $self->{anchor});
$self->put_results(comment => $self->{comment});
$self->put_results(script => $self->{script});
$self->put_results(title => $self->{title});
@@ -255,7 +243,6 @@
$self->{image_area} = 0;
$self->{max_shouting} = 0;
- $self->{anchor_index} = -1;
$self->{title_index} = -1;
$self->{max_size} = 3; # start at default size
$self->{min_size} = 3; # start at default size
@@ -374,14 +361,14 @@
# puts the uri onto the internal array
# note: uri may be blank (<a href=""></a> obfuscation, etc.)
sub push_uri {
- my ($self, $location, $uri) = @_;
+ my ($self, $type, $uri) = @_;
# URIs don't have leading/trailing whitespace ...
$uri =~ s/^\s+//;
$uri =~ s/\s+$//;
my $target = target_uri($self->{base_href} || "", $uri);
- push @{ $self->{uri}->{$location} }, $target;
+ $self->{uri}->{$uri}->{types}->{$type} = 1;
}
sub html_uri {
@@ -393,7 +380,7 @@
$self->push_uri($tag, $attr->{background});
}
}
- elsif ($tag =~ /^(?:area|link)$/) {
+ elsif ($tag =~ /^(?:a|area|link)$/) {
if (defined $attr->{href}) {
$self->push_uri($tag, $attr->{href});
}
@@ -630,11 +617,10 @@
}
}
if ($tag eq "img" && exists $self->{inside}{a} && $self->{inside}{a} > 0) {
- $self->{anchor}->[$self->{anchor_index}] .= "<img>\n";
- if (exists $self->{anchor_last}) {
- if ($self->{anchor_last} =~ /\.(?:pl|cgi|php|asp|jsp|cfm)\b/i) {
- $self->put_results(anchor_image_bug => 1);
- }
+ $self->{uri}->{$self->{anchor_last}}->{anchor_text}->[-1] .= "<img>\n";
+ $self->{anchor}->[-1] .= "<img>\n";
+ if ($self->{anchor_last} =~ /\.(?:pl|cgi|php|asp|jsp|cfm)\b/i) {
+ $self->put_results(anchor_image_bug => 1);
}
}
@@ -680,23 +666,9 @@
# special text delimiters - <a> and <title>
if ($tag eq "a") {
- # deal with the previous a tag. if the part in between
- # <a href=...> and </a> is not blank (ie: there was something there we
- # consider visible), add the uri to the list. otherwise, drop the uri and
- # mark that we found an "empty uri".
- # Note: this is also done in html_end
- if ($self->{anchor_last}) {
- if (length $self->{anchor}->[$self->{anchor_index}]) {
- $self->push_uri('a', $self->{anchor_last});
- }
- else {
- $self->push_uri('a_empty', $self->{anchor_last});
- }
- push(@{$self->{anchor_uri_index}->{$self->{anchor_last}}}, $self->{anchor_index});
- }
$self->{anchor_last} = (exists $attr->{href} ? $attr->{href} : "");
- $self->{anchor_index}++;
- $self->{anchor}->[$self->{anchor_index}] = "";
+ push(@{$self->{uri}->{$self->{anchor_last}}->{anchor_text}}, '');
+ push(@{$self->{anchor}}, '');
}
if ($tag eq "title") {
$self->{title_index}++;
@@ -781,7 +753,8 @@
# text that is part of body and also stored separately
if (exists $self->{inside}{a} && $self->{inside}{a} > 0) {
# this doesn't worry about nested anchors
- $self->{anchor}->[$self->{anchor_index}] .= $text;
+ $self->{uri}->{$self->{anchor_last}}->{anchor_text}->[-1] .= $text;
+ $self->{anchor}->[-1] .= $text;
}
if (exists $self->{inside}{title} && $self->{inside}{title} > 0) {
$self->{title}->[$self->{title_index}] .= $text;
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?view=diff&r1=157208&r2=157209
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Fri Mar 11 18:41:26 2005
@@ -1815,9 +1815,9 @@
# get URIs from HTML parsing
# use the metadata version as $self->{html} is probably not set yet
- if (defined $self->{msg}->{metadata}->{html}->{uri_canon}) {
- while(my($type, $array) = each %{ $self->{msg}->{metadata}->{html}->{uri_canon} }) {
- push(@uris, @{$array});
+ if (defined $self->{msg}->{metadata}->{html}->{uri_detail}) {
+ while(my($uri, $info) = each %{ $self->{msg}->{metadata}->{html}->{uri_detail} }) {
+ push(@uris, @{$info->{cleaned}});
}
}
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm?view=diff&r1=157208&r2=157209
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm Fri Mar 11 18:41:26 2005
@@ -201,7 +201,7 @@
# TODO! we need a method that provides more metadata about where
# the URI was found so we can ignore hammy decoys.
- # use the visible anchor uris first
+ # list of arrays to use in order
my @uri_ordered = ();
# use the parsed uris from the rendered message text
@@ -211,34 +211,41 @@
my @parsed = $scanner->get_parsed_uri_list();
# Generate the full list of html-parsed domains.
- my $html = $scanner->{msg}->{metadata}->{html}->{uri_canon} || { };
+ my $html = $scanner->{msg}->{metadata}->{html}->{uri_detail} || { };
- # list specific tags to use in order
- foreach ( 'a', 'form', 'img' ) {
- if (exists $html->{$_}) {
- push(@uri_ordered, $html->{$_});
- delete $html->{$_};
- }
+ # go from uri => info to uri_ordered
+ # 0: a
+ # 1: form
+ # 2: img
+ # 3: !a_empty
+ # 4: parsed
+ # 5: a_empty
+ if (@parsed) {
+ $uri_ordered[4] = \@parsed;
}
- # use the rest of the uris, except empty anchor uris
- if (keys %{$html}) {
- my @list = ();
- while(my($type, $array) = each %{$html}) {
- next if ($type eq 'a_empty');
- push(@list, @{$array});
- delete $html->{$type};
- }
- push(@uri_ordered, \@list) if (@list);
- }
+ while (my($uri, $info) = each %{$html}) {
+ my $entry = 3;
+
+ if ($info->{types}->{a}) {
+ $entry = 5;
- # now, use any of the URIs we parsed out of the message
- push(@uri_ordered, \@parsed) if (@parsed);
+ # determine a vs a_empty
+ foreach my $at (@{$info->{anchor_text}}) {
+ if (length $at) {
+ $entry = 0;
+ last;
+ }
+ }
+ }
+ elsif ($info->{types}->{form}) {
+ $entry = 1;
+ }
+ elsif ($info->{types}->{img}) {
+ $entry = 2;
+ }
- # finally, use any uris from empty anchor tags
- if (exists $html->{a_empty}) {
- push(@uri_ordered, $html->{a_empty});
- delete $html->{a_empty};
+ push(@{$uri_ordered[$entry]}, @{$info->{cleaned}});
}
# at this point, @uri_ordered is an ordered array of uri arrays
@@ -246,6 +253,8 @@
my %domlist = ();
while (keys %domlist < $scanner->{main}->{conf}->{uridnsbl_max_domains} && @uri_ordered) {
my $array = shift @uri_ordered;
+ next unless $array;
+
my %domains = ();
# run through and find the domains in this grouping