You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2005/05/11 21:23:11 UTC

svn commit: r169679 - /spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm

Author: felicity
Date: Wed May 11 12:23:10 2005
New Revision: 169679

URL: http://svn.apache.org/viewcvs?rev=169679&view=rev
Log:
more uri canonification work.  move html canonification into get_uri_detail_list() now that it exists, leave pms->html in extract_metadata.  this lets get_uri_list() not call plugins when it doesn't need to...

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=169679&r1=169678&r2=169679&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Wed May 11 12:23:10 2005
@@ -145,20 +145,18 @@
     $self->{conf}->set_score_set ($set|2);
   }
 
-  # Do this here so that {metadata}->{html} gets set, which we then reference
-  # in extract_message_metadata()
-  my $decoded = $self->get_decoded_stripped_body_text_array();
-
-  $self->extract_message_metadata();
-
   {
     # Here, we launch all the DNS RBL queries and let them run while we
     # inspect the message
     $self->run_rbl_eval_tests ($self->{conf}->{rbl_evals});
     my $needs_dnsbl_harvest_p = 1; # harvest needs to be run
 
+    my $decoded = $self->get_decoded_stripped_body_text_array();
     my $bodytext = $self->get_decoded_body_text_array();
     my $fulltext = $self->{msg}->get_pristine();
+
+    $self->extract_message_metadata();
+
     my @uris = $self->get_uri_list();
 
     foreach my $priority (sort { $a <=> $b } keys %{$self->{conf}->{priorities}}) {
@@ -1293,10 +1291,6 @@
 sub extract_message_metadata {
   my ($self) = @_;
   
-  # Use $self->{html} as a flag indicating whether or not we've already
-  # extracted the metadata.
-  return if ($self->{html});
-
   $self->{msg}->extract_message_metadata($self->{main});
 
   foreach my $item (qw(
@@ -1312,28 +1306,12 @@
   $self->{tag_data}->{RELAYSUNTRUSTED} = $self->{relays_untrusted_str};
   $self->{tag_data}->{LANGUAGES} = $self->{msg}->get_metadata("X-Languages");
 
-  # NOTE: we may have to call get_decoded_stripped_body_text_array so
-  # that {metadata}->{html} gets set...
-  if (!$self->{msg}->{metadata}->{html}) {
+  # This should happen before we get called, but just in case.
+  if (!defined $self->{msg}->{metadata}->{html}) {
     $self->get_decoded_stripped_body_text_array();
   }
   $self->{html} = $self->{msg}->{metadata}->{html};
 
-  # canonify the HTML parsed URIs
-  my $redirector_patterns = $self->{conf}->{redirector_patterns};
-  if (defined $self->{html}->{uri_detail}) {
-    while(my($uri, $info) = each %{ $self->{html}->{uri_detail} }) {
-      my @tmp = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, $uri);
-      $info->{cleaned} = \@tmp;
-      if (would_log('dbg', 'uri')) {
-        dbg("uri: html uri found, $uri");
-        foreach my $nuri (@tmp) {
-          dbg("uri: cleaned html uri, $nuri");
-        }
-      }
-    }
-  }
-
   # allow plugins to add more metadata, read the stuff that's there, etc.
   $self->{main}->call_plugins ("parsed_metadata", { permsgstatus => $self });
 }
@@ -1916,29 +1894,46 @@
     return $self->{uri_detail_list};
   }
 
-  # IMPORTANT: to get the html parsed into metadata, we need to
-  # call extract_message_metadata().  It will call
-  # get_decoded_stripped_body_text_array(), if necessary which does
-  # the metadata->html stuff ...
-
-  # We need the Metadata extracted to get the canonified HTML parsed URIs
-  $self->extract_message_metadata();
+  # do this so we're sure metadata->html is setup
+  my @parsed = $self->_get_parsed_uri_list();
 
   # get URIs from HTML parsing
-  my $detail = $self->{html}->{uri_detail} || { };
+  # use the metadata version since $self->{html} may not be setup
+  my $detail = $self->{msg}->{metadata}->{html}->{uri_detail} || { };
 
-  foreach my $uri ( $self->_get_parsed_uri_list() ) {
-    my @uris = Mail::SpamAssassin::Util::uri_list_canonify($self->{conf}->{redirector_patterns}, $uri);
+  # don't keep dereferencing ...
+  my $redirector_patterns = $self->{conf}->{redirector_patterns};
+
+  # canonify the HTML parsed URIs
+  while(my($uri, $info) = each %{ $detail }) {
+    my @tmp = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, $uri);
+    $info->{cleaned} = \@tmp;
 
-    # list out the URLs for debugging ...
     if (would_log('dbg', 'uri')) {
-      foreach my $nuri (@uris) {
-        dbg("uri: parsed uri found: $nuri");
+      dbg("uri: html uri found, $uri");
+      foreach my $nuri (@tmp) {
+        dbg("uri: cleaned html uri, $nuri");
       }
     }
+  }
 
+  # canonify the text parsed URIs
+  foreach my $uri ( @parsed ) {
     $detail->{$uri}->{types}->{parsed} = 1;
-    push(@{$detail->{$uri}->{cleaned}}, @uris);
+
+    my @uris = ();
+    
+    if (!exists $detail->{$uri}->{cleaned}) {
+      @uris = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, $uri);
+      $detail->{$uri}->{cleaned} = \@uris;
+    }
+
+    if (would_log('dbg', 'uri')) {
+      dbg("uri: parsed uri found, $uri");
+      foreach my $nuri (@uris) {
+        dbg("uri: parsed html uri, $nuri");
+      }
+    }
   }
 
   # setup the cache