You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2004/04/20 01:33:54 UTC
svn commit: rev 10111 - incubator/spamassassin/trunk/lib/Mail/SpamAssassin

Author: felicity
Date: Mon Apr 19 16:33:53 2004
New Revision: 10111

Modified:
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
Log:
Bug 3268: get_uri_list() was using decoded_body to find URIs, and then looking at the HTML parser results for more URIs.  However, decoded_body still has the HTML in it, so we were looking at the HTML twice, sometimes grabbing newline-separated URIs.  So stop passing in a text array, use the rendered text for RE matching, and let the HTML parsing do the right thing there.  Also, revert the redirector code.

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm	Mon Apr 19 16:33:53 2004
@@ -154,7 +154,8 @@
 			 $self->{msg}->get_pristine_body());
 
     # use $bodytext here because $decoded is too stripped
-    my @uris = $self->get_uri_list($bodytext);
+    # TVD: leave it up to get_uri_list to do the right thing ...
+    my @uris = $self->get_uri_list();
 
     foreach my $priority (sort { $a <=> $b } keys %{$self->{conf}->{priorities}}) {
       # no need to run if there are no priorities at this level.  This can
@@ -1466,14 +1467,20 @@
 
 # This really belongs in metadata
 sub get_uri_list {
-  my ($self, $textary) = @_;
+  my ($self) = @_;
 
   # use cached answer if available
   if (defined $self->{uri_list}) {
     return @{$self->{uri_list}};
   }
 
-  $textary ||= $self->get_decoded_body_text_array();
+  # TVD: we used to use decoded_body which is fine, except then we'll
+  # try parsing URLs out of HTML, which is what the HTML code is going
+  # to do (note: we know the HTML parsing occurs, because we call for the
+  # rendered text which does HTML parsing...)  trying to get URLs out of
+  # HTML w/out parsing causes issues, so let's not do it.
+  my $textary = $self->get_decoded_stripped_body_text_array();
+
   my ($rulename, $pat, @uris);
   local ($_);
 
@@ -1545,10 +1552,11 @@
       push(@nuris, $nuri);
     }
 
-    # deal with redirectors, push the redirect uri onto the uri array
-    # so this loop deals with that one independently
-    while ($nuri =~ s{^https?://.+?(https?://.+)$}{$1}s) {
-      push(@uris, $_);
+    # deal with http redirectors.  strip off one level of redirector
+    # and add back to the array.  the foreach loop will go over those
+    # and deal appropriately.
+    if ($nuri =~ m{^https?://.+?(https?://.+)$}) {
+      push(@uris, $1);
     }
   }