You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2005/02/24 04:26:03 UTC
svn commit: r155151 - in spamassassin/trunk/lib/Mail/SpamAssassin: HTML.pm PerMsgStatus.pm Plugin/URIDNSBL.pm

Author: felicity
Date: Wed Feb 23 19:25:58 2005
New Revision: 155151

URL: http://svn.apache.org/viewcvs?view=rev&rev=155151
Log:
bug 3976: rework how uris are parsed out of the message.  the urirbl plugin now orders the domains to query, based on where in the message the uri was found.  separate out anchor uris which have text between open and close and those that don't.  removed uri html metadata since there is now a different/better way to get it.

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm?view=diff&r1=155150&r2=155151
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm Wed Feb 23 19:25:58 2005
@@ -32,6 +32,9 @@
 @EXPORT = qw(get_results name_to_rgb);
 @EXPORT_OK = qw();
 
+# Make the main dbg() accessible in our package w/o an extra function
+*dbg=\&Mail::SpamAssassin::dbg;
+
 # elements defined by the HTML 4.01 and XHTML 1.0 DTDs (do not change them!)
 # does not include XML
 my %elements = map {; $_ => 1 }
@@ -128,6 +131,39 @@
 
   delete $self->{text_style};
 
+  # deal with the previous a tag.  if the part in between
+  # <a href=...> and </a> is not blank (ie: there was something there we
+  # consider visible), add the uri to the list as type 'a'.  otherwise, keep
+  # the uri but file it under type 'a_empty' to mark it as an "empty uri".
+  # Note: this is also done in html_tests
+  if ($self->{anchor_last}) {
+    if (length $self->{anchor}->[$self->{anchor_index}]) {
+      $self->push_uri('a', $self->{anchor_last});
+    }
+    else {
+      $self->push_uri('a_empty', $self->{anchor_last});
+    }
+  }
+
+#  my @uri;
+  if (defined $self->{uri}) {
+    while(my($type, $array) = each %{ $self->{uri} }) {
+#      push(@uri, @{$array});
+      my @tmp = Mail::SpamAssassin::Util::uri_list_canonify(@{$array});
+      $self->{uri_cooked}->{$type} = \@tmp;
+      # list out the URLs for debugging ...
+      if (Mail::SpamAssassin::dbg_check('uri')) {
+        foreach my $nuri (@tmp) {
+          dbg("uri: uri found, type $type: $nuri");
+        }
+      }
+    }
+  }
+
+#  $self->put_results(uri => \@uri);
+  $self->put_results(uri_raw => $self->{uri});
+  $self->put_results(uri_canon => $self->{uri_cooked});
+
   # final results scalars
   $self->put_results(image_area => $self->{image_area});
   $self->put_results(max_shouting => $self->{max_shouting});
@@ -144,7 +180,6 @@
   $self->put_results(comment => $self->{comment});
   $self->put_results(script => $self->{script});
   $self->put_results(title => $self->{title});
-  $self->put_results(uri => $self->{uri});
 
   # final result hashes
   $self->put_results(inside => $self->{inside});
@@ -337,14 +372,14 @@
 # puts the uri onto the internal array
 # note: uri may be blank (<a href=""></a> obfuscation, etc.)
 sub push_uri {
-  my ($self, $uri) = @_;
+  my ($self, $location, $uri) = @_;
 
   # URIs don't have leading/trailing whitespace ...
   $uri =~ s/^\s+//;
   $uri =~ s/\s+$//;
 
   my $target = target_uri($self->{base_href} || "", $uri);
-  push @{ $self->{uri} }, $target;
+  push @{ $self->{uri}->{$location} }, $target;
 }
 
 sub html_uri {
@@ -353,22 +388,22 @@
   # ordered by frequency of tag groups
   if ($tag =~ /^(?:body|table|tr|td)$/) {
     if (defined $attr->{background}) {
-      $self->push_uri($attr->{background});
+      $self->push_uri($tag, $attr->{background});
     }
   }
-  elsif ($tag =~ /^(?:a|area|link)$/) {
+  elsif ($tag =~ /^(?:area|link)$/) {
     if (defined $attr->{href}) {
-      $self->push_uri($attr->{href});
+      $self->push_uri($tag, $attr->{href});
     }
   }
   elsif ($tag =~ /^(?:img|frame|iframe|embed|script|bgsound)$/) {
     if (defined $attr->{src}) {
-      $self->push_uri($attr->{src});
+      $self->push_uri($tag, $attr->{src});
     }
   }
   elsif ($tag eq "form") {
     if (defined $attr->{action}) {
-      $self->push_uri($attr->{action});
+      $self->push_uri($tag, $attr->{action});
     }
   }
   elsif ($tag eq "base") {
@@ -376,7 +411,7 @@
       # use <BASE HREF="URI"> to turn relative links into absolute links
 
       # even if it is a base URI, handle like a normal URI as well
-      push @{ $self->{uri} }, $uri;
+      $self->push_uri($tag, $uri);
 
       # a base URI will be ignored by browsers unless it is an absolute
       # URI of a standard protocol
@@ -638,6 +673,19 @@
 
   # special text delimiters - <a> and <title>
   if ($tag eq "a") {
+    # deal with the previous a tag.  if the part in between
+    # <a href=...> and </a> is not blank (ie: there was something there we
+    # consider visible), add the uri to the list as type 'a'.  otherwise, keep
+    # the uri but file it under type 'a_empty' to mark it as an "empty uri".
+    # Note: this is also done in html_end
+    if ($self->{anchor_last}) {
+      if (length $self->{anchor}->[$self->{anchor_index}]) {
+        $self->push_uri('a', $self->{anchor_last});
+      }
+      else {
+        $self->push_uri('a_empty', $self->{anchor_last});
+      }
+    }
     $self->{anchor_last} = (exists $attr->{href} ? $attr->{href} : "");
     $self->{anchor_index}++;
     $self->{anchor}->[$self->{anchor_index}] = "";

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?view=diff&r1=155150&r2=155151
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Wed Feb 23 19:25:58 2005
@@ -1783,68 +1783,21 @@
     return @{$self->{uri_list}};
   }
 
-  # TVD: we used to use decoded_body which is fine, except then we'll
-  # try parsing URLs out of HTML, which is what the HTML code is going
-  # to do (note: we know the HTML parsing occurs, because we call for the
-  # rendered text which does HTML parsing...)  trying to get URLs out of
-  # HTML w/out parsing causes issues, so let's not do it.
-  # also, if we allow $textary to be passed in, we need to invalidate
-  # the cache first. fyi.
-  my $textary = $self->get_decoded_stripped_body_text_array();
-
-  $self->{redirect_num} = 0;
-
-  my ($rulename, $pat, @uris);
-  local ($_);
-
-  my $text;
-
-  for (@$textary) {
-    # NOTE: do not modify $_ in this loop
-    while (/($uriRe)/igo) {
-      my $uri = $1;
-
-      $uri =~ s/^<(.*)>$/$1/;
-      $uri =~ s/[\]\)>#]$//;
-
-      if ($uri !~ /^${schemeRE}:/io) {
-        # If it's a hostname that was just sitting out in the
-        # open, without a protocol, and not inside of an HTML tag,
-        # the we should add the proper protocol in front, rather
-        # than using the base URI.
-        if ($uri =~ /^www\d*\./i) {
-          # some spammers are using unschemed URIs to escape filters
-          push (@uris, $uri);
-          $uri = "http://$uri";
-        }
-        elsif ($uri =~ /^ftp\./i) {
-          push (@uris, $uri);
-          $uri = "ftp://$uri";
-        }
-      }
-
-      # warn("uri: got URI: $uri\n");
-      push @uris, $uri;
-    }
-    while (/($Addr_spec_re)/go) {
-      my $uri = $1;
-
-      $uri = "mailto:$uri";
-
-      #warn("uri: got URI: $uri\n");
-      push @uris, $uri;
-    }
-  }
+  # IMPORTANT: to get the html parsed into metadata, we need to call
+  # get_parsed_uri_list() which calls get_decoded_stripped_body_text_array(),
+  # which does the metadata stuff ...  DO THIS BEFORE LOOKING FOR METADATA!!!
+  my @uris = $self->get_parsed_uri_list();
 
   # get URIs from HTML parsing
-  # use the metadata version as $self->{html} may not be set yet
-  if (defined $self->{msg}->{metadata}->{html}->{uri}) {
-    push @uris, @{ $self->{msg}->{metadata}->{html}->{uri} };
+  # use the metadata version as $self->{html} is probably not set yet
+  if (defined $self->{msg}->{metadata}->{html}->{uri_canon}) {
+    while(my($type, $array) = each %{ $self->{msg}->{metadata}->{html}->{uri_canon} }) {
+      push(@uris, @{$array});
+    }
   }
 
-  @uris = Mail::SpamAssassin::Util::uri_list_canonify(@uris);
-
   # get domain list
+  $self->{redirect_num} = 0;
   my %domains;
   for (@uris) {
     # count redirection attempts and log it
@@ -1858,14 +1811,79 @@
   $self->{uri_domain_count} = keys %domains;
   $self->{uri_list} = \@uris;
 
-  # list out the URLs for debugging ...
-  if (Mail::SpamAssassin::dbg_check('uri')) {
-    foreach my $nuri (@uris) {
-      dbg("uri: uri found: $nuri");
+  return @uris;
+}
+
+sub get_parsed_uri_list {
+  my($self) = @_;
+
+  # use cached answer if available
+  unless (defined $self->{parsed_uri_list}) {
+    # TVD: we used to use decoded_body which is fine, except then we'll
+    # try parsing URLs out of HTML, which is what the HTML code is going
+    # to do (note: we know the HTML parsing occurs, because we call for the
+    # rendered text which does HTML parsing...)  trying to get URLs out of
+    # HTML w/out parsing causes issues, so let's not do it.
+    # also, if we allow $textary to be passed in, we need to invalidate
+    # the cache first. fyi.
+    my $textary = $self->get_decoded_stripped_body_text_array();
+
+    my ($rulename, $pat, @uris);
+    local ($_);
+
+    my $text;
+
+    for (@$textary) {
+      # NOTE: do not modify $_ in this loop
+      while (/($uriRe)/igo) {
+        my $uri = $1;
+
+        $uri =~ s/^<(.*)>$/$1/;
+        $uri =~ s/[\]\)>#]$//;
+
+        if ($uri !~ /^${schemeRE}:/io) {
+          # If it's a hostname that was just sitting out in the
+          # open, without a protocol, and not inside of an HTML tag,
+          # the we should add the proper protocol in front, rather
+          # than using the base URI.
+          if ($uri =~ /^www\d*\./i) {
+            # some spammers are using unschemed URIs to escape filters
+            push (@uris, $uri);
+            $uri = "http://$uri";
+          }
+          elsif ($uri =~ /^ftp\./i) {
+            push (@uris, $uri);
+            $uri = "ftp://$uri";
+          }
+        }
+
+        # warn("uri: got URI: $uri\n");
+        push @uris, $uri;
+      }
+      while (/($Addr_spec_re)/go) {
+        my $uri = $1;
+
+        $uri = "mailto:$uri";
+
+        #warn("uri: got URI: $uri\n");
+        push @uris, $uri;
+      }
+    }
+
+    @uris = Mail::SpamAssassin::Util::uri_list_canonify(@uris);
+
+    # setup the cache and return
+    $self->{parsed_uri_list} = \@uris;
+
+    # list out the URLs for debugging ...
+    if (Mail::SpamAssassin::dbg_check('uri')) {
+      foreach my $nuri (@uris) {
+        dbg("uri: parsed uri found: $nuri");
+      }
     }
   }
 
-  return @uris;
+  return @{$self->{parsed_uri_list}};
 }
 
 sub do_body_uri_tests {

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm?view=diff&r1=155150&r2=155151
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm Wed Feb 23 19:25:58 2005
@@ -200,36 +200,105 @@
   # get all domains in message
   # TODO! we need a method that provides more metadata about where
   # the URI was found so we can ignore hammy decoys.
-  my %domlist = ( );
-  foreach my $uri ($scanner->get_uri_list()) {
-    my $dom = Mail::SpamAssassin::Util::uri_to_domain($uri);
-    if ($dom) {
-      if (exists $scanner->{main}->{conf}->{uridnsbl_skip_domains}->{$dom}) {
-        dbg("uridnsbl: found domain $dom in skip list");
-      }
-      else {
-        $domlist{$dom} = 1;
-      }
+
+  # use the visible anchor uris first
+  my @uri_ordered = ();
+
+  # use the parsed uris from the rendered message text
+  # IMPORTANT: to get the html parsed into metadata, we need to call
+  # get_parsed_uri_list() which calls get_decoded_stripped_body_text_array(),
+  # which does the metadata stuff ...  DO THIS BEFORE SETTING $html !!!
+  my @parsed = $scanner->get_parsed_uri_list();
+
+  # Generate the full list of html-parsed domains.
+  my $html = $scanner->{msg}->{metadata}->{html}->{uri_canon} || { };
+
+  # list specific tags to use in order
+  foreach ( 'a', 'form', 'img' ) {
+    if (exists $html->{$_}) {
+      push(@uri_ordered, $html->{$_});
+      delete $html->{$_};
+    }
+  }
+
+  # use the rest of the uris, except empty anchor uris
+  if (keys %{$html}) {
+    my @list = ();
+    while(my($type, $array) = each %{$html}) {
+      next if ($type eq 'a_empty');
+      push(@list, @{$array});
+      delete $html->{$type};
     }
+    push(@uri_ordered, \@list) if (@list);
   }
 
-  # trim down to a limited number - pick randomly
-  my $i;
-  my @longlist = keys %domlist;
-  my @shortlist = ();
-  for ($i = $scanner->{main}->{conf}->{uridnsbl_max_domains}; $i > 0; $i--) {
-    my $r = int rand (scalar @longlist);
-    push (@shortlist, splice (@longlist, $r, 1));
-    last if (scalar @longlist <= 0);
+  # now, use any of the URIs we parsed out of the message
+  push(@uri_ordered, \@parsed) if (@parsed);
+
+  # finally, use any uris from empty anchor tags
+  if (exists $html->{a_empty}) {
+    push(@uri_ordered, $html->{a_empty});
+    delete $html->{a_empty};
+  }
+
+  # at this point, @uri_ordered is an ordered array of uri arrays
+
+  my %domlist = ();
+  while (keys %domlist < $scanner->{main}->{conf}->{uridnsbl_max_domains} && @uri_ordered) {
+    my $array = shift @uri_ordered;
+    my %domains = ();
+
+    # run through and find the domains in this grouping
+    foreach (@{$array}) {
+      my $domain = $self->usable_uri_domain($scanner->{main}->{conf}->{uridnsbl_skip_domains}, $_);
+      next unless $domain;
+      next if $domlist{$domain};
+      $domains{$domain} = 1;
+    }
+
+    # at this point %domains has the list of new domains found in this
+    # grouping
+
+    # the new domains are all useful, just add them in
+    if (keys(%domlist) + keys(%domains) <= $scanner->{main}->{conf}->{uridnsbl_max_domains}) {
+      foreach (keys %domains) {
+        $domlist{$_} = 1;
+      }
+    }
+    else {
+      # trim down to a limited number - pick randomly
+      my $i;
+      my @longlist = keys %domains;
+      while (@longlist && keys %domlist < $scanner->{main}->{conf}->{uridnsbl_max_domains}) {
+        my $r = int rand (scalar @longlist);
+        $domlist{splice (@longlist, $r, 1)} = 1;
+      }
+    }
   }
 
   # and query
-  dbg("uridnsbl: domains to query: ".join(' ',@shortlist));
-  foreach my $dom (@shortlist) {
+  dbg("uridnsbl: domains to query: ".join(' ',keys %domlist));
+  foreach my $dom (keys %domlist) {
     $self->query_domain ($scanstate, $dom);
   }
 
   return 1;
+}
+
+sub usable_uri_domain {
+  my($self, $skip_domains, $uri) = @_;
+
+  my $dom = Mail::SpamAssassin::Util::uri_to_domain($uri);
+  if ($dom) {
+    if (exists $skip_domains->{$dom}) {
+      dbg("uridnsbl: domain $dom in skip list");
+    }
+    else {
+      return $dom;
+    }
+  }
+
+  return;
 }
 
 sub set_config {