You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2005/05/10 20:36:03 UTC

svn commit: r169506 - in /spamassassin/trunk: lib/Mail/SpamAssassin/PerMsgStatus.pm lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm t/uri_html.t

Author: felicity
Date: Tue May 10 11:36:02 2005
New Revision: 169506

URL: http://svn.apache.org/viewcvs?rev=169506&view=rev
Log:
Move the URI canonification around some more.  get_uri_list() now just puts together the canonified parsed and HTML URI lists; HTML canonification happens in extract_message_metadata(), which now does its work only once (it returns early if the metadata has already been extracted) and can therefore safely be called multiple times, wherever canonified HTML URIs are needed.  Modified t/uri_html.t to not need a temp file: it now just runs get_uri_list() internally instead of calling a full SA, and avoids the whole debug-output-reading bit.

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
    spamassassin/trunk/t/uri_html.t

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=169506&r1=169505&r2=169506&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Tue May 10 11:36:02 2005
@@ -145,6 +145,10 @@
     $self->{conf}->set_score_set ($set|2);
   }
 
+  # Do this here so that {metadata}->{html} gets set, which we then reference
+  # in extract_message_metadata()
+  my $decoded = $self->get_decoded_stripped_body_text_array();
+
   $self->extract_message_metadata();
 
   {
@@ -153,15 +157,6 @@
     $self->run_rbl_eval_tests ($self->{conf}->{rbl_evals});
     my $needs_dnsbl_harvest_p = 1; # harvest needs to be run
 
-    my $decoded = $self->get_decoded_stripped_body_text_array();
-
-    # this has been put on the metadata object.  we could use it
-    # directly, but $self->{msg}->{metadata}->{html} goes through a lot
-    # of referencing ...
-    # NOTE: this has to come after get_decoded_stripped_body_text_array() as it's
-    # the one that sets {metadata}->{html} ...
-    $self->{html} = $self->{msg}->{metadata}->{html};
-
     my $bodytext = $self->get_decoded_body_text_array();
     my $fulltext = $self->{msg}->get_pristine();
     my @uris = $self->get_uri_list();
@@ -1297,6 +1292,10 @@
 
 sub extract_message_metadata {
   my ($self) = @_;
+  
+  # Use $self->{html} as a flag indicating whether or not we've already
+  # extracted the metadata.
+  return if ($self->{html});
 
   $self->{msg}->extract_message_metadata($self->{main});
 
@@ -1313,6 +1312,26 @@
   $self->{tag_data}->{RELAYSUNTRUSTED} = $self->{relays_untrusted_str};
   $self->{tag_data}->{LANGUAGES} = $self->{msg}->get_metadata("X-Languages");
 
+  # NOTE: this has to come after get_decoded_stripped_body_text_array(), as
+  # that is the one that sets {metadata}->{html} ...  so that method should be
+  # called before extract_message_metadata() ...
+  $self->{html} = $self->{msg}->{metadata}->{html};
+
+  # canonify the HTML parsed URIs
+  my $redirector_patterns = $self->{conf}->{redirector_patterns};
+  if (defined $self->{html}->{uri_detail}) {
+    while(my($uri, $info) = each %{ $self->{html}->{uri_detail} }) {
+      my @tmp = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, $uri);
+      $info->{cleaned} = \@tmp;
+      if (would_log('dbg', 'uri')) {
+        dbg("uri: html uri found, $uri");
+        foreach my $nuri (@tmp) {
+          dbg("uri: cleaned html uri, $nuri");
+        }
+      }
+    }
+  }
+
   # allow plugins to add more metadata, read the stuff that's there, etc.
   $self->{main}->call_plugins ("parsed_metadata", { permsgstatus => $self });
 }
@@ -1864,32 +1883,19 @@
   # get_parsed_uri_list() which calls get_decoded_stripped_body_text_array(),
   # which does the metadata stuff ...  DO THIS BEFORE LOOKING FOR METADATA!!!
   my @uris = $self->get_parsed_uri_list();
-  my $redirector_patterns = $self->{conf}->{redirector_patterns};
-  @uris = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, @uris);
+
+  # We need the Metadata extracted to get the canonified HTML parsed URIs
+  $self->extract_message_metadata();
 
   # get URIs from HTML parsing
-  # use the metadata version as $self->{html} is probably not set yet
-  if (defined $self->{msg}->{metadata}->{html}->{uri_detail}) {
-    while(my($uri, $info) = each %{ $self->{msg}->{metadata}->{html}->{uri_detail} }) {
-      my @tmp = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, $uri);
-      $info->{cleaned} = \@tmp;
-      push(@uris, @tmp);
-      if (would_log('dbg', 'uri')) {
-        dbg("uri: html uri found, $uri");
-        foreach my $nuri (@tmp) {
-          dbg("uri: cleaned html uri, $nuri");
-        }
+  if (defined $self->{html}->{uri_detail}) {
+    while(my($uri, $info) = each %{ $self->{html}->{uri_detail} }) {
+      if ($info->{cleaned}) {
+	push(@uris, @{$info->{cleaned}});
       }
     }
   }
 
-  # list out the URLs for debugging ...
-  if (would_log('dbg', 'uri')) {
-    foreach my $nuri (@uris) {
-      dbg("uri: parsed uri found: $nuri");
-    }
-  }
-
   # get domain list
   $self->{redirect_num} = 0;
   my %domains;
@@ -1970,6 +1976,16 @@
         push @uris, $uri;
       }
     }
+
+    @uris = Mail::SpamAssassin::Util::uri_list_canonify($self->{conf}->{redirector_patterns}, @uris);
+
+    # list out the URLs for debugging ...
+    if (would_log('dbg', 'uri')) {
+      foreach my $nuri (@uris) {
+        dbg("uri: parsed uri found: $nuri");
+      }
+    }
+
     # setup the cache and return
     $self->{parsed_uri_list} = \@uris;
 

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm?rev=169506&r1=169505&r2=169506&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm Tue May 10 11:36:02 2005
@@ -186,20 +186,18 @@
   $self->setup ($scanstate);
 
   # get all domains in message
-  # TODO! we need a method that provides more metadata about where
-  # the URI was found so we can ignore hammy decoys.
 
   # list of arrays to use in order
   my @uri_ordered = ();
 
   # use the parsed uris from the rendered message text
-  # IMPORTANT: to get the html parsed into metadata, we need to call
-  # get_parsed_uri_list() which calls get_decoded_stripped_body_text_array(),
-  # which does the metadata stuff ...  DO THIS BEFORE SETTING $html !!!
-  my @parsed = $scanner->get_uri_list();
+  my @parsed = $scanner->get_parsed_uri_list();
+
+  # We need the Metadata extracted to get the canonified HTML parsed URIs
+  $scanner->extract_message_metadata();
 
   # Generate the full list of html-parsed domains.
-  my $html = $scanner->{msg}->{metadata}->{html}->{uri_detail} || { };
+  my $html = $scanner->{html}->{uri_detail} || { };
 
   # go from uri => info to uri_ordered
   # 0: a

Modified: spamassassin/trunk/t/uri_html.t
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/t/uri_html.t?rev=169506&r1=169505&r2=169506&view=diff
==============================================================================
--- spamassassin/trunk/t/uri_html.t (original)
+++ spamassassin/trunk/t/uri_html.t Tue May 10 11:36:02 2005
@@ -21,7 +21,6 @@
 use SATest; sa_t_init("uri_html");
 use Test;
 use Mail::SpamAssassin;
-use IO::File;
 use vars qw(%patterns %anti_patterns);
 
 # settings
@@ -33,35 +32,28 @@
 $sa->init(0); # parse rules
 
 # load tests and write mail
-my $mail = 'log/uri_html.eml';
 %patterns = ();
 %anti_patterns = ();
-write_mail();
+my $message = write_mail();
 
-# test message
-my $fh = IO::File->new_tmpfile();
-open(STDERR, ">&=".fileno($fh)) || die "Cannot reopen STDERR";
-ok(sarun("-t --debug=uri < log/uri_html.eml"));
-seek($fh, 0, 0);
-my $error = do {
-    local $/;
-    <$fh>;
-};
-$error =~ s/^.*dbg: uri: parsed uri found: //mg;
+my $mail = $sa->parse($message);
+my $msg = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
+
+my $uris = join("\n", $msg->get_uri_list(), "");
 
 # run patterns and anti-patterns
 my $failures = 0;
 for my $pattern (keys %patterns) {
-  if ($error !~ /${pattern}/m) {
+  if ($uris !~ /${pattern}/m) {
     print "did not find $pattern\n";
-#    print "found $error\n";
     $failures++;
-  } else {
-#    print "success $pattern in $error\n";
   }
 }
+ok(!$failures);
+$failures = 0;
+
 for my $anti_pattern (keys %anti_patterns) {
-  if ($error =~ /${anti_pattern}/m) {
+  if ($uris =~ /${anti_pattern}/m) {
     print "did find $anti_pattern\n";
     $failures++;
   }
@@ -70,8 +62,7 @@
 
 # function to write test email
 sub write_mail {
-  if (open(MAIL, ">$mail")) {
-    print MAIL <<'EOF';
+    my $msg = <<'EOF';
 Message-ID: <cl...@example.com>
 Date: Mon, 07 Oct 2002 09:00:00 +0000
 From: Sender <se...@example.com>
@@ -98,31 +89,29 @@
 </head>
 <body>
 EOF
-    while (<DATA>) {
-      chomp;
-      next if /^#/;
-      if (/^(.*?)\t+(.*?)\s*$/) {
-	my $string = $1;
-	my @patterns = split(' ', $2);
-	if ($string && @patterns) {
-	  print MAIL "<a href=$string>click here</a>\n";
-	  for my $pattern (@patterns) {
-	    if ($pattern =~ /^\!(.*)/) {
-	      $anti_patterns{$1} = 1;
-	    }
-	    else {
-	      $patterns{$pattern} = 1;
-	    }
+
+  while (<DATA>) {
+    chomp;
+    next if /^#/;
+    if (/^(.*?)\t+(.*?)\s*$/) {
+      my $string = $1;
+      my @patterns = split(' ', $2);
+      if ($string && @patterns) {
+	$msg .= qq@<a href="$string">click here</a>\n@;
+	for my $pattern (@patterns) {
+	  if ($pattern =~ /^\!(.*)/) {
+	    $anti_patterns{$1} = 1;
+	  }
+	  else {
+	    $patterns{$pattern} = 1;
 	  }
 	}
       }
     }
-    print MAIL "</body>\n</html>\n\n----IDYGGVGT_LIYGR--\n";
-    close(MAIL);
-  }
-  else {
-    die "can't open output file: $!";
   }
+  $msg .= "</body>\n</html>\n\n----IDYGGVGT_LIYGR--\n";
+
+  return $msg;
 }
 
 # <line>    : <string><tabs><matches>
@@ -238,4 +227,3 @@
 
 mailto://cah3neun@thaihe4d.com		mailto://cah3neun@thaihe4d.com
 mailto://jicu8vah@another@jicu8vah	jicu8vah@another@jicu8vah
-