You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2004/04/20 01:33:54 UTC
svn commit: rev 10111 - incubator/spamassassin/trunk/lib/Mail/SpamAssassin
Author: felicity
Date: Mon Apr 19 16:33:53 2004
New Revision: 10111
Modified:
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
Log:
bug 3268: get_uri_list() was using decoded_body to find URIs, then looking at the HTML parser results for more URIs. However, decoded_body still has the HTML in it, so we were looking at the HTML twice, sometimes grabbing newline-separated URIs. So stop passing in a text array, use the rendered text for RE matching, and let the HTML parsing do the right thing for the markup. Also, revert the redirector code.
Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Mon Apr 19 16:33:53 2004
@@ -154,7 +154,8 @@
$self->{msg}->get_pristine_body());
# use $bodytext here because $decoded is too stripped
- my @uris = $self->get_uri_list($bodytext);
+ # TVD: leave it up to get_uri_list to do the right thing ...
+ my @uris = $self->get_uri_list();
foreach my $priority (sort { $a <=> $b } keys %{$self->{conf}->{priorities}}) {
# no need to run if there are no priorities at this level. This can
@@ -1466,14 +1467,20 @@
# This really belongs in metadata
sub get_uri_list {
- my ($self, $textary) = @_;
+ my ($self) = @_;
# use cached answer if available
if (defined $self->{uri_list}) {
return @{$self->{uri_list}};
}
- $textary ||= $self->get_decoded_body_text_array();
+ # TVD: we used to use decoded_body which is fine, except then we'll
+ # try parsing URLs out of HTML, which is what the HTML code is going
+ # to do (note: we know the HTML parsing occurs, because we call for the
+ # rendered text which does HTML parsing...) trying to get URLs out of
+ # HTML w/out parsing causes issues, so let's not do it.
+ my $textary = $self->get_decoded_stripped_body_text_array();
+
my ($rulename, $pat, @uris);
local ($_);
@@ -1545,10 +1552,11 @@
push(@nuris, $nuri);
}
- # deal with redirectors, push the redirect uri onto the uri array
- # so this loop deals with that one independently
- while ($nuri =~ s{^https?://.+?(https?://.+)$}{$1}s) {
- push(@uris, $_);
+ # deal with http redirectors. strip off one level of redirector
+ # and add back to the array. the foreach loop will go over those
+ # and deal appropriately.
+ if ($nuri =~ m{^https?://.+?(https?://.+)$}) {
+ push(@uris, $1);
}
}