You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2005/03/12 03:41:28 UTC
svn commit: r157209 - in spamassassin/trunk/lib/Mail/SpamAssassin: EvalTests.pm HTML.pm PerMsgStatus.pm Plugin/URIDNSBL.pm

Author: felicity
Date: Fri Mar 11 18:41:26 2005
New Revision: 157209

URL: http://svn.apache.org/viewcvs?view=rev&rev=157209
Log:
Change the HTML-parsed URI code again to be a bit more concise, restore the 3.0 API, etc.  Change the URIBL ranking and an eval test to use the new layout.

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm?view=diff&r1=157208&r2=157209
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm Fri Mar 11 18:41:26 2005
@@ -3366,10 +3366,9 @@
 sub check_https_ip_mismatch {
   my ($self) = @_;
 
-  while (my($k,$v) = each %{$self->{html}->{uri_anchor_index}}) {
+  while (my($k,$v) = each %{$self->{html}->{uri_detail}}) {
     next if ($k !~ m%^https?:/*(?:[^\@/]+\@)?\d+\.\d+\.\d+\.\d+%i);
-    foreach (@{$v}) {
-      $_ = $self->{html}->{anchor}->[$_];
+    foreach (@{$v->{anchor_text}}) {
       next if (m%^https:/*(?:[^\@/]+\@)?\d+\.\d+\.\d+\.\d+%i);
       return 1 if (m%https:%i);
     }

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm?view=diff&r1=157208&r2=157209
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm Fri Mar 11 18:41:26 2005
@@ -131,40 +131,29 @@
 
   delete $self->{text_style};
 
-  # deal with the previous a tag.  if the part in between
-  # <a href=...> and </a> is not blank (ie: there was something there we
-  # consider visible), add the uri to the list.  otherwise, drop the uri and
-  # mark that we found an "empty uri".
-  # Note: this is also done in html_tests
-  if ($self->{anchor_last}) {
-    if (length $self->{anchor}->[$self->{anchor_index}]) {
-      $self->push_uri('a', $self->{anchor_last});
-    }
-    else {
-      $self->push_uri('a_empty', $self->{anchor_last});
-    }
-    push(@{$self->{anchor_uri_index}->{$self->{anchor_last}}}, $self->{anchor_index});
-  }
+  my @uri = ();
 
-#  my @uri;
+  # add the canonified version of each uri to the detail list
   if (defined $self->{uri}) {
-    while(my($type, $array) = each %{ $self->{uri} }) {
-#      push(@uri, @{$array});
-      my @tmp = Mail::SpamAssassin::Util::uri_list_canonify(@{$array});
-      $self->{uri_cooked}->{$type} = \@tmp;
+    while(my($uri, $info) = each %{ $self->{uri} }) {
+      my @tmp = Mail::SpamAssassin::Util::uri_list_canonify($uri);
+      $info->{cleaned} = \@tmp;
       # list out the URLs for debugging ...
       if (Mail::SpamAssassin::dbg_check('uri')) {
         foreach my $nuri (@tmp) {
-          dbg("uri: uri found, type $type: $nuri");
+          dbg("uri: uri found, $nuri");
         }
       }
     }
+
+    @uri = keys %{$self->{uri}};
   }
 
-#  $self->put_results(uri => \@uri);
-  $self->put_results(uri_raw => $self->{uri});
-  $self->put_results(uri_canon => $self->{uri_cooked});
-  $self->put_results(uri_anchor_index => $self->{anchor_uri_index});
+  # these keep backward compatibility, albeit a little wasteful
+  $self->put_results(uri => \@uri);
+  $self->put_results(anchor => $self->{anchor});
+
+  $self->put_results(uri_detail => $self->{uri});
 
   # final results scalars
   $self->put_results(image_area => $self->{image_area});
@@ -178,7 +167,6 @@
   }
 
   # final result arrays
-  $self->put_results(anchor => $self->{anchor});
   $self->put_results(comment => $self->{comment});
   $self->put_results(script => $self->{script});
   $self->put_results(title => $self->{title});
@@ -255,7 +243,6 @@
 
   $self->{image_area} = 0;
   $self->{max_shouting} = 0;
-  $self->{anchor_index} = -1;
   $self->{title_index} = -1;
   $self->{max_size} = 3;	# start at default size
   $self->{min_size} = 3;	# start at default size
@@ -374,14 +361,14 @@
 # puts the uri onto the internal array
 # note: uri may be blank (<a href=""></a> obfuscation, etc.)
 sub push_uri {
-  my ($self, $location, $uri) = @_;
+  my ($self, $type, $uri) = @_;
 
   # URIs don't have leading/trailing whitespace ...
   $uri =~ s/^\s+//;
   $uri =~ s/\s+$//;
 
   my $target = target_uri($self->{base_href} || "", $uri);
-  push @{ $self->{uri}->{$location} }, $target;
+  $self->{uri}->{$uri}->{types}->{$type} = 1;
 }
 
 sub html_uri {
@@ -393,7 +380,7 @@
       $self->push_uri($tag, $attr->{background});
     }
   }
-  elsif ($tag =~ /^(?:area|link)$/) {
+  elsif ($tag =~ /^(?:a|area|link)$/) {
     if (defined $attr->{href}) {
       $self->push_uri($tag, $attr->{href});
     }
@@ -630,11 +617,10 @@
     }
   }
   if ($tag eq "img" && exists $self->{inside}{a} && $self->{inside}{a} > 0) {
-    $self->{anchor}->[$self->{anchor_index}] .= "<img>\n";
-    if (exists $self->{anchor_last}) {
-      if ($self->{anchor_last} =~ /\.(?:pl|cgi|php|asp|jsp|cfm)\b/i) {
-	$self->put_results(anchor_image_bug => 1);
-      }
+    $self->{uri}->{$self->{anchor_last}}->{anchor_text}->[-1] .= "<img>\n";
+    $self->{anchor}->[-1] .= "<img>\n";
+    if ($self->{anchor_last} =~ /\.(?:pl|cgi|php|asp|jsp|cfm)\b/i) {
+      $self->put_results(anchor_image_bug => 1);
     }
   }
 
@@ -680,23 +666,9 @@
 
   # special text delimiters - <a> and <title>
   if ($tag eq "a") {
-    # deal with the previous a tag.  if the part in between
-    # <a href=...> and </a> is not blank (ie: there was something there we
-    # consider visible), add the uri to the list.  otherwise, drop the uri and
-    # mark that we found an "empty uri".
-    # Note: this is also done in html_end
-    if ($self->{anchor_last}) {
-      if (length $self->{anchor}->[$self->{anchor_index}]) {
-        $self->push_uri('a', $self->{anchor_last});
-      }
-      else {
-        $self->push_uri('a_empty', $self->{anchor_last});
-      }
-      push(@{$self->{anchor_uri_index}->{$self->{anchor_last}}}, $self->{anchor_index});
-    }
     $self->{anchor_last} = (exists $attr->{href} ? $attr->{href} : "");
-    $self->{anchor_index}++;
-    $self->{anchor}->[$self->{anchor_index}] = "";
+    push(@{$self->{uri}->{$self->{anchor_last}}->{anchor_text}}, '');
+    push(@{$self->{anchor}}, '');
   }
   if ($tag eq "title") {
     $self->{title_index}++;
@@ -781,7 +753,8 @@
   # text that is part of body and also stored separately
   if (exists $self->{inside}{a} && $self->{inside}{a} > 0) {
     # this doesn't worry about nested anchors
-    $self->{anchor}->[$self->{anchor_index}] .= $text;
+    $self->{uri}->{$self->{anchor_last}}->{anchor_text}->[-1] .= $text;
+    $self->{anchor}->[-1] .= $text;
   }
   if (exists $self->{inside}{title} && $self->{inside}{title} > 0) {
     $self->{title}->[$self->{title_index}] .= $text;

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?view=diff&r1=157208&r2=157209
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Fri Mar 11 18:41:26 2005
@@ -1815,9 +1815,9 @@
 
   # get URIs from HTML parsing
   # use the metadata version as $self->{html} is probably not set yet
-  if (defined $self->{msg}->{metadata}->{html}->{uri_canon}) {
-    while(my($type, $array) = each %{ $self->{msg}->{metadata}->{html}->{uri_canon} }) {
-      push(@uris, @{$array});
+  if (defined $self->{msg}->{metadata}->{html}->{uri_detail}) {
+    while(my($uri, $info) = each %{ $self->{msg}->{metadata}->{html}->{uri_detail} }) {
+      push(@uris, @{$info->{cleaned}});
     }
   }
 

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm?view=diff&r1=157208&r2=157209
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm Fri Mar 11 18:41:26 2005
@@ -201,7 +201,7 @@
   # TODO! we need a method that provides more metadata about where
   # the URI was found so we can ignore hammy decoys.
 
-  # use the visible anchor uris first
+  # list of arrays to use in order
   my @uri_ordered = ();
 
   # use the parsed uris from the rendered message text
@@ -211,34 +211,41 @@
   my @parsed = $scanner->get_parsed_uri_list();
 
   # Generate the full list of html-parsed domains.
-  my $html = $scanner->{msg}->{metadata}->{html}->{uri_canon} || { };
+  my $html = $scanner->{msg}->{metadata}->{html}->{uri_detail} || { };
 
-  # list specific tags to use in order
-  foreach ( 'a', 'form', 'img' ) {
-    if (exists $html->{$_}) {
-      push(@uri_ordered, $html->{$_});
-      delete $html->{$_};
-    }
+  # go from uri => info to uri_ordered
+  # 0: a
+  # 1: form
+  # 2: img
+  # 3: !a_empty
+  # 4: parsed
+  # 5: a_empty
+  if (@parsed) {
+    $uri_ordered[4] = \@parsed;
   }
 
-  # use the rest of the uris, except empty anchor uris
-  if (keys %{$html}) {
-    my @list = ();
-    while(my($type, $array) = each %{$html}) {
-      next if ($type eq 'a_empty');
-      push(@list, @{$array});
-      delete $html->{$type};
-    }
-    push(@uri_ordered, \@list) if (@list);
-  }
+  while (my($uri, $info) = each %{$html}) {
+    my $entry = 3;
+
+    if ($info->{types}->{a}) {
+      $entry = 5;
 
-  # now, use any of the URIs we parsed out of the message
-  push(@uri_ordered, \@parsed) if (@parsed);
+      # determine a vs a_empty
+      foreach my $at (@{$info->{anchor_text}}) {
+        if (length $at) {
+	  $entry = 0;
+	  last;
+	}
+      }
+    }
+    elsif ($info->{types}->{form}) {
+      $entry = 1;
+    }
+    elsif ($info->{types}->{img}) {
+      $entry = 2;
+    }
 
-  # finally, use any uris from empty anchor tags
-  if (exists $html->{a_empty}) {
-    push(@uri_ordered, $html->{a_empty});
-    delete $html->{a_empty};
+    push(@{$uri_ordered[$entry]}, @{$info->{cleaned}});
   }
 
   # at this point, @uri_ordered is an ordered array of uri arrays
@@ -246,6 +253,8 @@
   my %domlist = ();
   while (keys %domlist < $scanner->{main}->{conf}->{uridnsbl_max_domains} && @uri_ordered) {
     my $array = shift @uri_ordered;
+    next unless $array;
+
     my %domains = ();
 
     # run through and find the domains in this grouping