You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by qu...@apache.org on 2004/02/15 20:53:40 UTC

svn commit: rev 6658 - incubator/spamassassin/trunk/lib/Mail/SpamAssassin

Author: quinlan
Date: Sun Feb 15 11:53:39 2004
New Revision: 6658

Modified:
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
Log:
add major overhaul of URI parsing to HTML parser
also remove HTML order code and unclickable anchor test code


Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm	Sun Feb 15 11:53:39 2004
@@ -159,8 +159,6 @@
   $self->{html}{"inside_$tag"} += $num;
   $self->{html}{"inside_$tag"} = 0 if $self->{html}{"inside_$tag"} < 0;
 
-  push @{$self->{html}{order}}, ($num > 0 ? "" : "/") . $tag;
-
   if ($tag =~ /^(?:body|table|tr|th|td)$/) {
     $self->html_bgcolor($tag, $attr, $num);
   }
@@ -175,11 +173,6 @@
 
     $self->{html_last_tag} = $tag;
   }
-  if ($num == -1) {
-    if ($tag eq "a") {
-      $self->{html}{anchor_unclickable}++ if $self->{html}{anchor_empty};
-    }
-  }
 
   if ($tag =~ /^(?:b|i|u|strong|em|big|center|h\d)$/) {
     $self->{html}{shouting} += $num;
@@ -205,33 +198,131 @@
   }
 }
 
+# Split a URI reference into its RFC 2396 components (absent ones are undef) and return them as a hash.
+sub parse_uri {
+  my ($reference) = @_;
+  my @found = $reference =~ m|^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
+  my %component = map { ($_ => shift @found) } qw(scheme authority path query fragment);
+  return %component;
+}
+
+# Resolve a URI reference found in an HTML attribute and record it.
+#
+# $uri is the raw attribute value (undef is ignored).  When the document has
+# set $self->{html}{base_href}, a relative reference is resolved against it
+# following RFC 2396 section 5.2 (the "step N" comments refer to that list);
+# absolute references and references seen without a base are stored as-is.
+# The outcome is appended to @{$self->{html}{uri}}.
+sub push_uri {
+  my ($self, $uri) = @_;
+
+  return unless defined $uri;
+
+  # step 1: split the reference into its components
+  my %uri = parse_uri($uri);
+
+  # step 2: nothing but (at most) a fragment means "this document"; skip it
+  if (!$uri{path} && !$uri{scheme} && !$uri{authority} && !$uri{query}) {
+    return;
+  }
+
+  # no base to resolve against, or step 3: a reference that carries its own
+  # scheme (mailto:, javascript:, http:, ...) is already absolute and must
+  # not inherit the base's authority or path
+  my $base = $self->{html}{base_href};
+  if (!$base || $uri{scheme}) {
+    push @{$self->{html}{uri}}, $uri;
+    return;
+  }
+
+  my %base = parse_uri($base);
+  $uri{scheme} = $base{scheme};
+
+  # step 4: a reference with its own authority keeps its path untouched
+  # step 5: so does one whose path is already absolute
+  if (!$uri{authority}) {
+    $uri{authority} = $base{authority};
+
+    if ($uri{path} !~ m@^/@) {
+      # step 6a: base path up to and including its last slash
+      my $buffer = $base{path};
+      $buffer =~ s@(?<=/)[^/]*$@@;
+      # an authority must be followed by "/" or nothing, so a base such as
+      # "http://host" yields "http://host/file" rather than "http://hostfile"
+      $buffer = "/" if $buffer eq "" && $uri{authority};
+      # step 6b: append the relative path
+      $buffer .= $uri{path};
+      # step 6c: drop complete "./" segments
+      $buffer =~ s@^\./@@g;
+      $buffer =~ s@(?<=/)\./@@g;
+      # step 6d: drop a trailing "." segment
+      $buffer =~ s@^\.$@@g;
+      $buffer =~ s@(?<=/)\.$@@g;
+      # steps 6e and 6f: repeatedly collapse the leftmost "<segment>/..",
+      # where <segment> is one whole path segment other than ".."
+      1 while $buffer =~ s@(?:^|(?<=/))(?!\.\./)[^/]+/\.\.(?:/|$)@@;
+      # step 6g: any ".." segments still leading the path are left alone
+      $uri{path} = $buffer;
+    }
+  }
+
+  # step 7: recombine the components into the final string
+  my $result = "";
+  if ($uri{scheme}) {
+    $result .= $uri{scheme} . ":";
+  }
+  elsif (defined $uri{authority}) {
+    # this block is not part of the RFC
+    # TODO: figure out what MUAs actually do with unschemed URIs
+    # maybe look at URI::Heuristic
+    if ($uri{authority} =~ /^www\d*\./i) {
+      # some spammers are using unschemed URIs to escape filters
+      $result .= "http:";
+    }
+    elsif ($uri{authority} =~ /^ftp\d*\./i) {
+      $result .= "ftp:";
+    }
+  }
+  if ($uri{authority}) {
+    $result .= "//" . $uri{authority};
+  }
+  $result .= $uri{path};
+  if ($uri{query}) {
+    $result .= "?" . $uri{query};
+  }
+  if ($uri{fragment}) {
+    $result .= "#" . $uri{fragment};
+  }
+  push @{$self->{html}{uri}}, $result;
+}
+
 sub html_uri {
   my ($self, $tag, $attr, $num) = @_;
   my $uri;
 
   # ordered by frequency of tag groups
   if ($tag =~ /^(?:body|table|tr|td)$/) {
-    push @{$self->{html_text}}, "URI:$uri " if $uri = $attr->{background};
+    $self->push_uri($attr->{background});
   }
   elsif ($tag =~ /^(?:a|area|link)$/) {
-    push @{$self->{html_text}}, "URI:$uri " if $uri = $attr->{href};
+    $self->push_uri($attr->{href});
   }
   elsif ($tag =~ /^(?:img|frame|iframe|embed|script)$/) {
-    push @{$self->{html_text}}, "URI:$uri " if $uri = $attr->{src};
+    $self->push_uri($attr->{src});
   }
   elsif ($tag eq "form") {
-    push @{$self->{html_text}}, "URI:$uri " if $uri = $attr->{action};
+    $self->push_uri($attr->{action});
   }
   elsif ($tag eq "base") {
     if ($uri = $attr->{href}) {
       # use <BASE HREF="URI"> to turn relative links into absolute links
 
       # even if it is a base URI, handle like a normal URI as well
-      push @{$self->{html_text}}, "URI:$uri ";
+      $self->push_uri($uri);
 
       # a base URI will be ignored by browsers unless it is an absolute
       # URI of a standard protocol
-      if ($uri =~ m@^(?:ftp|https?)://@i) {
+      if ($uri =~ m@^(?:https?|ftp)://@i) {
 	# remove trailing filename, if any; base URIs can have the
 	# form of "http://foo.com/index.html"
 	$uri =~ s@^([a-z]+://[^/]+/.*?)[^/\.]+\.[^/\.]{2,4}$@$1@i;
@@ -643,12 +734,7 @@
   {
     $self->{html}{charsets} .= exists $self->{html}{charsets} ? " $1" : $1;
   }
-  if ($tag eq "img") {
-    # might as well always clear this here
-    $self->{html}{anchor_empty} = 0;
-  }
 
-  $self->{html}{anchor_empty} = 1 if ($tag eq "a" && exists $attr->{href});
   $self->{html}{anchor_text} ||= "" if ($tag eq "a");
 }
 
@@ -666,7 +752,6 @@
 
   if (exists $self->{html}{"inside_a"} && $self->{html}{"inside_a"} > 0) {
     $self->{html}{anchor_text} .= " $text";
-    $self->{html}{anchor_empty} = 0;
   }
 
   if (exists $self->{html}{"inside_script"} && $self->{html}{"inside_script"} > 0)

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm	Sun Feb 15 11:53:39 2004
@@ -301,20 +301,17 @@
       # some tests done after rendering
       my $r = $self->{html_results}; # temporary reference for brevity
       my $space = 0;
-      $r->{non_uri_len} = 0;
+      $r->{html_length} = 0;
       for my $line (@lines) {
         $line = pack ('C0A*', $line);
         $space += ($line =~ tr/ \t\n\r\x0b\xa0/ \t\n\r\x0b\xa0/);
-        $r->{non_uri_len} += length($line);
-        for my $uri ($line =~ m/\b(URI:\S+)/g) {
-          $r->{non_uri_len} -= length($uri);
-        }
+        $r->{html_length} += length($line);
       }
-      $r->{non_space_len} = $r->{non_uri_len} - $space;
-      $r->{ratio} = ($raw - $r->{non_uri_len}) / $raw;
-      if (exists $r->{total_comment_length} && $r->{non_uri_len} > 0) {
+      $r->{non_space_len} = $r->{html_length} - $space;
+      $r->{ratio} = ($raw - $r->{html_length}) / $raw;
+      if (exists $r->{total_comment_length} && $r->{html_length} > 0) {
         $r->{total_comment_ratio} = 
-	    $r->{total_comment_length} / $r->{non_uri_len};
+	    $r->{total_comment_length} / $r->{html_length};
       }
       if (exists $r->{elements} && exists $r->{tags}) {
 	$r->{bad_tag_ratio} = ($r->{tags} - $r->{elements}) / $r->{tags};

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm	Sun Feb 15 11:53:39 2004
@@ -1452,7 +1452,6 @@
   my ($rulename, $pat, @uris);
   local ($_);
 
-  my $base_uri = $self->{html}{base_href} || "http://";
   my $text;
 
   for (@$textary) {
@@ -1462,11 +1461,7 @@
 
       $uri =~ s/^<(.*)>$/$1/;
       $uri =~ s/[\]\)>#]$//;
-      $uri =~ s/^URI://i;
 
-      # Does the uri start with "http://", "mailto:", "javascript:" or
-      # such?  If not, we probably need to put the base URI in front
-      # of it.
       if ($uri !~ /^${schemeRE}:/io) {
         # If it's a hostname that was just sitting out in the
         # open, without a protocol, and not inside of an HTML tag,
@@ -1481,9 +1476,6 @@
           push (@uris, $uri);
           $uri = "ftp://$uri";
         }
-        else {
-          $uri = "${base_uri}$uri";
-        }
       }
 
       # warn("Got URI: $uri\n");
@@ -1492,12 +1484,16 @@
     while (/($Addr_spec_re)/go) {
       my $uri = $1;
 
-      $uri =~ s/^URI://i;
       $uri = "mailto:$uri";
 
       #warn("Got URI: $uri\n");
       push @uris, $uri;
     }
+  }
+
+  # get URIs from HTML parsing
+  if (defined $self->{html}{uri}) {
+    push @uris, @{ $self->{html}{uri} };
   }
 
   # Make sure we catch bad encoding tricks ...