You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by qu...@apache.org on 2004/10/19 23:39:34 UTC

svn commit: rev 55103 - in spamassassin/trunk: lib/Mail/SpamAssassin lib/Mail/SpamAssassin/Message rules

Author: quinlan
Date: Tue Oct 19 14:39:34 2004
New Revision: 55103

Modified:
   spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
   spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
   spamassassin/trunk/lib/Mail/SpamAssassin/Message/Node.pm
   spamassassin/trunk/rules/20_html_tests.cf
Log:
bug 3907: clean up HTML module


Modified: spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm	(original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm	Tue Oct 19 14:39:34 2004
@@ -2999,9 +2999,9 @@
   $rawtag =~ /^([a-zA-Z0-9]+)$/; my $tag = $1;
   $rawexpr =~ /^([\<\>\=\!\-\+ 0-9]+)$/; my $expr = $1;
 
-  return 0 unless exists $self->{html}{"inside_$tag"};
+  return 0 unless exists $self->{html}{inside}{$tag};
 
-  $self->{html}{"inside_$tag"} =~ /^([\<\>\=\!\-\+ 0-9]+)$/;
+  $self->{html}{inside}{$tag} =~ /^([\<\>\=\!\-\+ 0-9]+)$/;
   my $val = $1;
   return eval "\$val $expr";
 }
@@ -3009,7 +3009,7 @@
 sub html_image_only {
   my ($self, undef, $min, $max) = @_;
 
-  return (exists $self->{html}{"inside_img"} &&
+  return (exists $self->{html}{inside}{img} &&
 	  exists $self->{html}{length} &&
 	  $self->{html}{length} > $min &&
 	  $self->{html}{length} <= $max);
@@ -3048,7 +3048,7 @@
 
 sub html_tag_exists {
   my ($self, undef, $tag) = @_;
-  return exists $self->{html}{"inside_$tag"};
+  return exists $self->{html}{inside}{$tag};
 }
 
 sub html_test {
@@ -3166,19 +3166,19 @@
       # in one part will be the same in other parts.
       #
       if ($type eq 'text/html') {
-        foreach my $w (grep(/\w/,split(/\s+/,$rnd))) {
+        foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
 	  #dbg("eval: HTML: $w");
           $html{$w}++;
         }
 
 	# If there are no words, mark if there's at least 1 image ...
-	if (keys %html == 0 && exists $self->{html}{"inside_img"}) {
+	if (keys %html == 0 && exists $self->{html}{inside}{img}) {
 	  # Use "\n" as the mark since it can't ever occur normally
 	  $html{"\n"}=1;
 	}
       }
       else {
-        foreach my $w (grep(/\w/,split(/\s+/,$rnd))) {
+        foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
 	  #dbg("eval: TEXT: $w");
           $text{$w}++;
         }

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm	(original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm	Tue Oct 19 14:39:34 2004
@@ -23,18 +23,13 @@
 use bytes;
 
 package Mail::SpamAssassin::HTML;
-
-require Exporter;
-my @ISA = qw(Exporter);
-my @EXPORT = qw($re_start $re_loose $re_strict get_results);
-my @EXPORT_OK = qw();
-
 use HTML::Parser 3.24 ();
-use vars qw($re_start $re_loose $re_strict $re_other);
+use vars qw($re_loose $re_strict $re_other @ISA @EXPORT @EXPORT_OK);
 
-# elements that trigger HTML rendering in text/plain in some mail clients
-# (repeats ones listed in $re_strict)
-$re_start = 'body|head|html|img|pre|table|title';
+require Exporter;
+@ISA = qw(HTML::Parser Exporter);
+@EXPORT = qw($re_loose $re_strict get_results);
+@EXPORT_OK = qw();
 
 # elements defined by the HTML 4.01 and XHTML 1.0 DTDs (do not change them!)
 $re_loose = 'applet|basefont|center|dir|font|frame|frameset|iframe|isindex|menu|noframes|s|strike|u';
@@ -68,25 +63,31 @@
 		 background => [qw(body marquee)],
 		 );
 
-my %tested_colors;
-
 sub new {
-  my $this = shift;
-  my $class = ref($this) || $this;
-  my $self = {};
-  bless($self, $class);
-
-  $self->html_start();
-
-  return $self;
+  my ($class) = @_;
+  my $self = $class->SUPER::new(
+		api_version => 3,
+		handlers => [
+			start_document => ["html_start", "self"],
+			start => ["html_tag", "self,tagname,attr,'+1'"],
+			end_document => ["html_end", "self"],
+			end => ["html_tag", "self,tagname,attr,'-1'"],
+			text => ["html_text", "self,dtext"],
+			comment => ["html_comment", "self,text"],
+			declaration => ["html_declaration", "self,text"],
+		],
+		marked_sections => 1);
+  $self;
 }
 
 sub html_start {
   my ($self) = @_;
 
-  $self->{basefont} = 3;
+  # trigger HTML_MESSAGE
+  $self->put_results(html => 1);
 
-  undef $self->{text_style};
+  # initial display attributes
+  $self->{basefont} = 3;
   my %default = (tag => "default",
 		 fgcolor => "#000000",
 		 bgcolor => "#ffffff",
@@ -97,43 +98,104 @@
 sub html_end {
   my ($self) = @_;
 
-  $self->display_text();
+  delete $self->{text_style};
+
+  # final results scalars
+  $self->put_results(image_area => $self->{image_area});
+  $self->put_results(max_shouting => $self->{max_shouting});
+  $self->put_results(length => $self->{length});
+  $self->put_results(min_size => $self->{min_size});
+  $self->put_results(max_size => $self->{max_size});
+
+  # final result arrays
+  $self->put_results(comment => $self->{comment});
+  $self->put_results(title => $self->{title});
+  $self->put_results(anchor => $self->{anchor});
+
+  # final result hashes
+  $self->put_results(inside => $self->{inside});
+  $self->put_results(uri => $self->{uri});
+
+  # end-of-document result values that don't require looking at the text
+  if (exists $self->{backhair}) {
+    $self->put_results(backhair_count => scalar keys %{ $self->{backhair} });
+  }
+  if (exists $self->{elements} && exists $self->{tags}) {
+    $self->put_results(bad_tag_ratio =>
+		       ($self->{tags} - $self->{elements}) / $self->{tags});
+  }
+  if (exists $self->{elements_seen} && exists $self->{tags_seen}) {
+    $self->put_results(non_element_ratio =>
+		       ($self->{tags_seen} - $self->{elements_seen}) /
+		       $self->{tags_seen});
+  }
+  if (exists $self->{tags} && exists $self->{obfuscation}) {
+    $self->put_results(obfuscation_ratio =>
+		       $self->{obfuscation} / $self->{tags});
+  }
+  if (exists $self->{attr_bad} && exists $self->{attr_all}) {
+    $self->put_results(attr_bad => $self->{attr_bad} / $self->{attr_all});
+  }
+  if (exists $self->{attr_unique_bad} && exists $self->{attr_unique_all}) {
+    $self->put_results(attr_unique_bad =>
+		       $self->{attr_unique_bad} / $self->{attr_unique_all});
+  }
+}
+
+sub put_results {
+  my $self = shift;
+  my %results = @_;
+
+  while (my ($k, $v) = each %results) {
+    $self->{results}{$k} = $v;
+  }  
 }
 
 sub get_results {
   my ($self) = @_;
 
-  return $self->{html};
+  return $self->{results};
+}
+
+sub get_rendered_text {
+  my $self = shift;
+  my %options = @_;
+
+  return join('', @{ $self->{text} }) unless keys %options;
+
+  my $mask;
+  while (my ($k, $v) = each %options) {
+    next if !defined $self->{"text_$k"};
+    if (!defined $mask) {
+      $mask |= $v ? $self->{"text_$k"} : ~ $self->{"text_$k"};
+    }
+    else {
+      $mask &= $v ? $self->{"text_$k"} : ~ $self->{"text_$k"};
+    }
+  }
+
+  my $text = '';
+  my $i = 0;
+  for (@{ $self->{text} }) { $text .= $_ if vec($mask, $i++, 1); }
+  return $text;
 }
 
-sub html_render {
+sub parse {
   my ($self, $text) = @_;
 
-  # clean this up later
-  for my $key (keys %{ $self->{html} }) {
-    delete $self->{html}{$key};
-  }
-
-  $self->{html}{ratio} = 0;
-  $self->{html}{image_area} = 0;
-  $self->{html}{shouting} = 0;
-  $self->{html}{max_shouting} = 0;
-  $self->{html}{anchor_index} = -1;
-  $self->{html}{title_index} = -1;
-  $self->{html}{max_size} = 3;	# start at default size
-  $self->{html}{min_size} = 3;	# start at default size
-
-  $self->{html_text} = [];
-  $self->{html_visible_text} = [];
-  $self->{html_invisible_text} = [];
-  $self->{last_text} = "";
-  $self->{last_visible_text} = "";
-  $self->{last_invisible_text} = "";
-  $self->{html_last_tag} = 0;
-  $self->{html}{closed_html} = 0;
-  $self->{html}{closed_body} = 0;
+  $self->{image_area} = 0;
+  $self->{shouting} = 0;
+  $self->{max_shouting} = 0;
+  $self->{anchor_index} = -1;
+  $self->{title_index} = -1;
+  $self->{max_size} = 3;	# start at default size
+  $self->{min_size} = 3;	# start at default size
+  $self->{closed_html} = 0;
+  $self->{closed_body} = 0;
+  $self->{text} = [];		# rendered text
+  $self->{text_invisible} = '';	# vec of invisibility state in $self->{text}
 
-  $self->{html}{length} += $1 if (length($text) =~ m/^(\d+)$/);	# untaint
+  $self->{length} += $1 if (length($text) =~ m/^(\d+)$/);	# untaint
 
   # NOTE: We *only* need to fix the rendering when we verify that it
   # differs from what people see in their MUA.  Testing is best done with
@@ -159,32 +221,17 @@
     $text =~ s/<\/(?:\s.*?)?>//gs;
   }
 
-  # HTML::Parser 3.31, at least, converts &nbsp; into a question mark "?" for some reason.
-  # Let's convert them to spaces.
+  # HTML::Parser converts &nbsp; into a question mark ("?") for some
+  # reason, so convert them to spaces.  Confirmed in 3.31, at least.
   $text =~ s/&nbsp;/ /g;
 
-  my $hp = HTML::Parser->new(
-		api_version => 3,
-		handlers => [
-		  start_document => [sub { $self->html_start(@_) }],
-		  start => [sub { $self->html_tag(@_) }, "tagname,attr,'+1'"],
-		  end_document => [sub { $self->html_end(@_) }],
-		  end => [sub { $self->html_tag(@_) }, "tagname,attr,'-1'"],
-		  text => [sub { $self->html_text(@_) }, "dtext"],
-		  comment => [sub { $self->html_comment(@_) }, "text"],
-		  declaration => [sub { $self->html_declaration(@_) }, "text"],
-		],
-		marked_sections => 1);
-
   # ALWAYS pack it into byte-representation, even if we're using 'use bytes',
   # since the HTML::Parser object may use Unicode internally.
   # (bug 1417, maybe)
-  $hp->parse(pack ('C0A*', $text));
-  $hp->eof;
+  $self->SUPER::parse(pack('C0A*', $text));
+  $self->SUPER::eof;
 
-  delete $self->{html_last_tag};
-
-  return $self->{html_text};
+  return $self->{text};
 }
 
 sub html_tag {
@@ -194,22 +241,22 @@
 
   # general tracking
   if ($is_element) {
-    $self->{html}{elements}++;
-    $self->{html}{elements_seen}++ if !exists $self->{html}{"inside_$tag"};
+    $self->{elements}++;
+    $self->{elements_seen}++ if !exists $self->{inside}{$tag};
   }
-  $self->{html}{tags}++;
-  $self->{html}{tags_seen}++ if !exists $self->{html}{"inside_$tag"};
-  $self->{html}{"inside_$tag"} += $num;
-  $self->{html}{"inside_$tag"} = 0 if $self->{html}{"inside_$tag"} < 0;
+  $self->{tags}++;
+  $self->{tags_seen}++ if !exists $self->{inside}{$tag};
+  $self->{inside}{$tag} += $num;
+  $self->{inside}{$tag} = 0 if $self->{inside}{$tag} < 0;
 
   # check attributes
   for my $name (keys %$attr) {
     if ($name !~ /^(?:$re_attr|$re_attr_extra)$/io) {
-      $self->{html}{attr_bad}++;
-      $self->{html}{attr_unique_bad}++ if !exists $self->{"attr_seen_$name"};
+      $self->{attr_bad}++;
+      $self->{attr_unique_bad}++ if !exists $self->{"attr_seen_$name"};
     }
-    $self->{html}{attr_all}++;
-    $self->{html}{attr_unique_all}++ if !exists $self->{"attr_seen_$name"};
+    $self->{attr_all}++;
+    $self->{attr_unique_all}++ if !exists $self->{"attr_seen_$name"};
     $self->{"attr_seen_$name"} = 1;
   }
 
@@ -231,173 +278,32 @@
     }
     # end tags
     elsif ($num == -1) {
-      $self->{html}{closed_html} = 1 if $tag eq "html";
-      $self->{html}{closed_body} = 1 if $tag eq "body";
+      $self->{closed_html} = 1 if $tag eq "html";
+      $self->{closed_body} = 1 if $tag eq "body";
     }
     # shouting
     if ($tag =~ /^(?:b|i|u|strong|em|big|center|h\d)$/) {
-      $self->{html}{shouting} += $num;
-      if ($self->{html}{shouting} > $self->{html}{max_shouting}) {
-	$self->{html}{max_shouting} = $self->{html}{shouting};
+      $self->{shouting} += $num;
+      if ($self->{shouting} > $self->{max_shouting}) {
+	$self->{max_shouting} = $self->{shouting};
       }
     }
-
-    $self->{html_last_tag} = (($num < 0) ? "/" : "") . $tag;
   }
 }
 
 sub html_format {
   my ($self, $tag, $attr, $num) = @_;
 
-  # ordered by frequency of tag groups
+  # ordered by frequency of tag groups, note: whitespace is always "visible"
   if ($tag eq "br" || $tag eq "div") {
-    $self->display_text();
-    push @{$self->{html_visible_text}}, "\n";
-    push @{$self->{html_invisible_text}}, "\n";
-    push @{$self->{html_text}}, "\n";
+    $self->display_text("\n", whitespace => 1);
   }
   elsif ($tag =~ /^(?:li|t[hd]|d[td])$/) {
-    $self->display_text();
-    push @{$self->{html_visible_text}}, " ";
-    push @{$self->{html_invisible_text}}, " ";
-    push @{$self->{html_text}}, " ";
+    $self->display_text(" ", whitespace => 1);
   }
   elsif ($tag =~ /^(?:p|hr|blockquote|pre)$/) {
-    $self->display_text();
-    push @{$self->{html_visible_text}}, "\n\n";
-    push @{$self->{html_invisible_text}}, "\n\n";
-    push @{$self->{html_text}}, "\n\n";
-  }
-}
-
-use constant URI_STRICT => 0;
-
-# resolving relative URIs as defined in RFC 2396 (steps from section 5.2)
-# using draft http://www.gbiv.com/protocols/uri/rev-2002/rfc2396bis.html
-sub parse_uri {
-  my ($u) = @_;
-  my %u;
-  ($u{scheme}, $u{authority}, $u{path}, $u{query}, $u{fragment}) =
-    $u =~ m|^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
-  return %u;
-}
-
-sub remove_dot_segments {
-  my ($input) = @_;
-  my $output = "";
-
-  $input =~ s@^(?:\.\.?/)@/@;
-
-  while ($input) {
-    if ($input =~ s@^/\.(?:$|/)@/@) {
-    }
-    elsif ($input =~ s@^/\.\.(?:$|/)@/@) {
-      $output =~ s@/?[^/]*$@@;
-    }
-    elsif ($input =~ s@(/?[^/]*)@@) {
-      $output .= $1;
-    }
-  }
-  return $output;
-}
-
-sub merge_uri {
-  my ($base_authority, $base_path, $r_path) = @_;
-
-  if (defined $base_authority && !$base_path) {
-    return "/" . $r_path;
-  }
-  else {
-    if ($base_path =~ m|/|) {
-      $base_path =~ s|(?<=/)[^/]*$||;
-    }
-    else {
-      $base_path = "";
-    }
-    return $base_path . $r_path;
-  }
-}
-
-sub target_uri {
-  my ($base, $r) = @_;
-
-  my %r = parse_uri($r);	# parsed relative URI
-  my %base = parse_uri($base);	# parsed base URI
-  my %t;			# generated temporary URI
-
-  if ((not URI_STRICT) and
-      (defined $r{scheme} && defined $base{scheme}) and
-      ($r{scheme} eq $base{scheme}))
-  {
-    undef $r{scheme};
-  }
-
-  if (defined $r{scheme}) {
-    $t{scheme} = $r{scheme};
-    $t{authority} = $r{authority};
-    $t{path} = remove_dot_segments($r{path});
-    $t{query} = $r{query};
-  }
-  else {
-    if (defined $r{authority}) {
-      $t{authority} = $r{authority};
-      $t{path} = remove_dot_segments($r{path});
-      $t{query} = $r{query};
-    }
-    else {
-      if ($r{path} eq "") {
-	$t{path} = $base{path};
-	if (defined $r{query}) {
-	  $t{query} = $r{query};
-	}
-	else {
-	  $t{query} = $base{query};
-	}
-      }
-      else {
-	if ($r{path} =~ m|^/|) {
-	  $t{path} = remove_dot_segments($r{path});
-	}
-	else {
-	  $t{path} = merge_uri($base{authority}, $base{path}, $r{path});
-	  $t{path} = remove_dot_segments($t{path});
-	}
-	$t{query} = $r{query};
-      }
-      $t{authority} = $base{authority};
-    }
-    $t{scheme} = $base{scheme};
-  }
-  $t{fragment} = $r{fragment};
-
-  # recompose URI
-  my $result = "";
-  if ($t{scheme}) {
-    $result .= $t{scheme} . ":";
-  }
-  elsif (defined $t{authority}) {
-    # this block is not part of the RFC
-    # TODO: figure out what MUAs actually do with unschemed URIs
-    # maybe look at URI::Heuristic
-    if ($t{authority} =~ /^www\d*\./i) {
-      # some spammers are using unschemed URIs to escape filters
-      $result .= "http:";
-    }
-    elsif ($t{authority} =~ /^ftp\d*\./i) {
-      $result .= "ftp:";
-    }
-  }
-  if ($t{authority}) {
-    $result .= "//" . $t{authority};
-  }
-  $result .= $t{path};
-  if ($t{query}) {
-    $result .= "?" . $t{query};
-  }
-  if ($t{fragment}) {
-    $result .= "#" . $t{fragment};
+    $self->display_text("\n\n", whitespace => 1);
   }
-  return $result;
 }
 
 sub push_uri {
@@ -409,8 +315,8 @@
   $uri =~ s/^\s+//;
   $uri =~ s/\s+$//;
 
-  my $target = target_uri($self->{html}{base_href} || "", $uri);
-  push @{$self->{html}{uri}}, $target if $target;
+  my $target = target_uri($self->{base_href} || "", $uri);
+  push @{ $self->{uri} }, $target if $target;
 }
 
 sub html_uri {
@@ -434,7 +340,7 @@
       # use <BASE HREF="URI"> to turn relative links into absolute links
 
       # even if it is a base URI, handle like a normal URI as well
-      push @{$self->{html}{uri}}, $uri;
+      push @{ $self->{uri} }, $uri;
 
       # a base URI will be ignored by browsers unless it is an absolute
       # URI of a standard protocol
@@ -445,188 +351,12 @@
 
 	# Make sure it ends in a slash
 	$uri .= "/" unless $uri =~ m@/$@;
-	$self->{html}{base_href} = $uri;
+	$self->{base_href} = $uri;
       }
     }
   }
 }
 
-my %html_color = (
-  # HTML 4 defined 16 colors
-  aqua => 0x00ffff,
-  black => 0x000000,
-  blue => 0x0000ff,
-  fuchsia => 0xff00ff,
-  gray => 0x808080,
-  green => 0x008000,
-  lime => 0x00ff00,
-  maroon => 0x800000,
-  navy => 0x000080,
-  olive => 0x808000,
-  purple => 0x800080,
-  red => 0xff0000,
-  silver => 0xc0c0c0,
-  teal => 0x008080,
-  white => 0xffffff,
-  yellow => 0xffff00,
-  # colors specified in CSS3 color module
-  aliceblue => 0xf0f8ff,
-  antiquewhite => 0xfaebd7,
-  aqua => 0x00ffff,
-  aquamarine => 0x7fffd4,
-  azure => 0xf0ffff,
-  beige => 0xf5f5dc,
-  bisque => 0xffe4c4,
-  black => 0x000000,
-  blanchedalmond => 0xffebcd,
-  blue => 0x0000ff,
-  blueviolet => 0x8a2be2,
-  brown => 0xa52a2a,
-  burlywood => 0xdeb887,
-  cadetblue => 0x5f9ea0,
-  chartreuse => 0x7fff00,
-  chocolate => 0xd2691e,
-  coral => 0xff7f50,
-  cornflowerblue => 0x6495ed,
-  cornsilk => 0xfff8dc,
-  crimson => 0xdc143c,
-  cyan => 0x00ffff,
-  darkblue => 0x00008b,
-  darkcyan => 0x008b8b,
-  darkgoldenrod => 0xb8860b,
-  darkgray => 0xa9a9a9,
-  darkgreen => 0x006400,
-  darkgrey => 0xa9a9a9,
-  darkkhaki => 0xbdb76b,
-  darkmagenta => 0x8b008b,
-  darkolivegreen => 0x556b2f,
-  darkorange => 0xff8c00,
-  darkorchid => 0x9932cc,
-  darkred => 0x8b0000,
-  darksalmon => 0xe9967a,
-  darkseagreen => 0x8fbc8f,
-  darkslateblue => 0x483d8b,
-  darkslategray => 0x2f4f4f,
-  darkslategrey => 0x2f4f4f,
-  darkturquoise => 0x00ced1,
-  darkviolet => 0x9400d3,
-  deeppink => 0xff1493,
-  deepskyblue => 0x00bfff,
-  dimgray => 0x696969,
-  dimgrey => 0x696969,
-  dodgerblue => 0x1e90ff,
-  firebrick => 0xb22222,
-  floralwhite => 0xfffaf0,
-  forestgreen => 0x228b22,
-  fuchsia => 0xff00ff,
-  gainsboro => 0xdcdcdc,
-  ghostwhite => 0xf8f8ff,
-  gold => 0xffd700,
-  goldenrod => 0xdaa520,
-  gray => 0x808080,
-  green => 0x008000,
-  greenyellow => 0xadff2f,
-  grey => 0x808080,
-  honeydew => 0xf0fff0,
-  hotpink => 0xff69b4,
-  indianred => 0xcd5c5c,
-  indigo => 0x4b0082,
-  ivory => 0xfffff0,
-  khaki => 0xf0e68c,
-  lavender => 0xe6e6fa,
-  lavenderblush => 0xfff0f5,
-  lawngreen => 0x7cfc00,
-  lemonchiffon => 0xfffacd,
-  lightblue => 0xadd8e6,
-  lightcoral => 0xf08080,
-  lightcyan => 0xe0ffff,
-  lightgoldenrodyellow => 0xfafad2,
-  lightgray => 0xd3d3d3,
-  lightgreen => 0x90ee90,
-  lightgrey => 0xd3d3d3,
-  lightpink => 0xffb6c1,
-  lightsalmon => 0xffa07a,
-  lightseagreen => 0x20b2aa,
-  lightskyblue => 0x87cefa,
-  lightslategray => 0x778899,
-  lightslategrey => 0x778899,
-  lightsteelblue => 0xb0c4de,
-  lightyellow => 0xffffe0,
-  lime => 0x00ff00,
-  limegreen => 0x32cd32,
-  linen => 0xfaf0e6,
-  magenta => 0xff00ff,
-  maroon => 0x800000,
-  mediumaquamarine => 0x66cdaa,
-  mediumblue => 0x0000cd,
-  mediumorchid => 0xba55d3,
-  mediumpurple => 0x9370db,
-  mediumseagreen => 0x3cb371,
-  mediumslateblue => 0x7b68ee,
-  mediumspringgreen => 0x00fa9a,
-  mediumturquoise => 0x48d1cc,
-  mediumvioletred => 0xc71585,
-  midnightblue => 0x191970,
-  mintcream => 0xf5fffa,
-  mistyrose => 0xffe4e1,
-  moccasin => 0xffe4b5,
-  navajowhite => 0xffdead,
-  navy => 0x000080,
-  oldlace => 0xfdf5e6,
-  olive => 0x808000,
-  olivedrab => 0x6b8e23,
-  orange => 0xffa500,
-  orangered => 0xff4500,
-  orchid => 0xda70d6,
-  palegoldenrod => 0xeee8aa,
-  palegreen => 0x98fb98,
-  paleturquoise => 0xafeeee,
-  palevioletred => 0xdb7093,
-  papayawhip => 0xffefd5,
-  peachpuff => 0xffdab9,
-  peru => 0xcd853f,
-  pink => 0xffc0cb,
-  plum => 0xdda0dd,
-  powderblue => 0xb0e0e6,
-  purple => 0x800080,
-  red => 0xff0000,
-  rosybrown => 0xbc8f8f,
-  royalblue => 0x4169e1,
-  saddlebrown => 0x8b4513,
-  salmon => 0xfa8072,
-  sandybrown => 0xf4a460,
-  seagreen => 0x2e8b57,
-  seashell => 0xfff5ee,
-  sienna => 0xa0522d,
-  silver => 0xc0c0c0,
-  skyblue => 0x87ceeb,
-  slateblue => 0x6a5acd,
-  slategray => 0x708090,
-  slategrey => 0x708090,
-  snow => 0xfffafa,
-  springgreen => 0x00ff7f,
-  steelblue => 0x4682b4,
-  tan => 0xd2b48c,
-  teal => 0x008080,
-  thistle => 0xd8bfd8,
-  tomato => 0xff6347,
-  turquoise => 0x40e0d0,
-  violet => 0xee82ee,
-  wheat => 0xf5deb3,
-  white => 0xffffff,
-  whitesmoke => 0xf5f5f5,
-  yellow => 0xffff00,
-  yellowgreen => 0x9acd32,
-);
-
-sub name_to_rgb {
-  my $color = lc $_[0];
-  if (my $hex = $html_color{$color}) {
-      return sprintf("#%06x", $hex);
-  }
-  return $color;
-}
-
 # this might not be quite right, may need to pay attention to table nesting
 sub close_table_tag {
   my ($self, $tag) = @_;
@@ -636,7 +366,7 @@
 
   my $top;
   while (@{ $self->{text_style} } && ($top = $self->{text_style}[-1]->{tag})) {
-    if (($tag eq "td" && $top =~ /^(?:font|td)$/) ||
+    if (($tag eq "td" && ($top eq "font" || $top eq "td")) ||
 	($tag eq "tr" && $top =~ /^(?:font|td|tr)$/))
     {
       pop @{ $self->{text_style} };
@@ -713,9 +443,9 @@
     # tag attributes
     for my $name (keys %$attr) {
       next unless (grep { $_ eq $tag } @{ $ok_attribute{$name} });
-      if ($name =~ /^(?:text|color)$/) {
+      if ($name eq "text" || $name eq "color") {
 	# two different names for text color
-	$new{fgcolor} = name_to_rgb(lc($attr->{$name}));
+	$new{fgcolor} = _name_to_rgb($attr->{$name});
       }
       elsif ($name eq "size" && $attr->{size} =~ /^\s*([+-]\d+)/) {
 	# relative font size
@@ -724,7 +454,7 @@
       else {
 	if ($name eq "bgcolor") {
 	  # overwrite with hex value, $new{bgcolor} is set below
-	  $attr->{bgcolor} = name_to_rgb(lc($attr->{bgcolor}));
+	  $attr->{bgcolor} = _name_to_rgb($attr->{bgcolor});
 	}
 	if ($name eq "size" && $attr->{size} !~ /^\s*([+-])(\d+)/) {
 	  # attribute is malformed
@@ -734,11 +464,11 @@
 	  $new{$name} = $attr->{$name};
 	}
       }
-      if ($new{size} > $self->{html}{max_size}) {
-	$self->{html}{max_size} = $new{size};
+      if ($new{size} > $self->{max_size}) {
+	$self->{max_size} = $new{size};
       }
-      elsif ($new{size} < $self->{html}{min_size}) {
-	$self->{html}{min_size} = $new{size};
+      elsif ($new{size} < $self->{min_size}) {
+	$self->{min_size} = $new{size};
       }
     }
     push @{ $self->{text_style} }, \%new;
@@ -757,12 +487,11 @@
 
   my $fg = $self->{text_style}[-1]->{fgcolor};
   my $bg = $self->{text_style}[-1]->{bgcolor};
-  my $visible_for_bayes = 1;
 
   # invisibility
   if (substr($fg,-6) eq substr($bg,-6)) {
-    $self->{html}{font_invisible} = 1;
-    $visible_for_bayes = 0;
+    $self->put_results(font_invisible => 1);
+    return 1;
   }
   # near-invisibility
   elsif ($fg =~ /^\#?([0-9a-f]{2})([0-9a-f]{2})([0-9a-f]{2})$/) {
@@ -784,63 +513,47 @@
       # increases (near-invisible text is at about 0.95% of spam and
       # 1.25% of HTML spam right now), but please test any changes first
       if ($distance < 12) {
-	$self->{html}{"font_low_contrast"} = 1;
-        $visible_for_bayes = 0;
+	$self->put_results(font_low_contrast => 1);
+	return 1;
       }
     }
   }
 
-  return $visible_for_bayes;
+  return 0;
 }
 
 sub html_tests {
   my ($self, $tag, $attr, $num) = @_;
-  local ($_);
 
-  if ($tag eq "table" && exists $attr->{border} && $attr->{border} =~ /(\d+)/)
-  {
-    $self->{html}{thick_border} = 1 if $1 > 1;
-  }
-  # if ($tag eq "script") {
-  # $self->{html}{javascript} = 1;
-  # }
   if ($tag =~ /^(?:a|body|div|input|form|td|layer|area|img)$/i) {
-    for (keys %$attr) {
-      if (/\b(?:$events)\b/io)
-      {
-	$self->{html}{html_event} = 1;
-      }
-      if (/\bon(?:contextmenu|load|resize|submit|unload)\b/i &&
-	  $attr->{$_})
+    for my $key (keys %$attr) {
+      if ($key =~ /\bon(?:contextmenu|load|resize|submit|unload)\b/i &&
+	  $attr->{$key})
       {
-	$self->{html}{html_event_unsafe} = 1;
-        # if ($attr->{$_} =~ /\.open\s*\(/) { $self->{html}{window_open} = 1; }
+	$self->put_results(html_event_unsafe => 1);
       }
     }
   }
   if ($tag eq "font" && exists $attr->{size}) {
     my $size = $attr->{size};
-    $self->{html}{tiny_font} = 1 if (($size =~ /^\s*(\d+)/ && $1 < 1) ||
-				     ($size =~ /\-(\d+)/ && $1 >= 3));
-    $self->{html}{big_font} = 1 if (($size =~ /^\s*(\d+)/ && $1 > 3) ||
-				    ($size =~ /\+(\d+)/ && $1 >= 1));
+    $self->put_results(tiny_font => 1) if (($size =~ /^\s*(\d+)/ && $1 < 1) ||
+					   ($size =~ /\-(\d+)/ && $1 >= 3));
+    $self->put_results(big_font => 1) if (($size =~ /^\s*(\d+)/ && $1 > 3) ||
+					  ($size =~ /\+(\d+)/ && $1 >= 1));
   }
   if ($tag eq "font" && exists $attr->{face}) {
     if ($attr->{face} =~ /[A-Z]{3}/ && $attr->{face} !~ /M[ST][A-Z]|ITC/) {
-      $self->{html}{font_face_caps} = 1;
+      $self->put_results(font_face_caps => 1);
     }
     if ($attr->{face} !~ /^[a-z][a-z -]*[a-z](?:,\s*[a-z][a-z -]*[a-z])*$/i) {
-      $self->{html}{font_face_bad} = 1;
+      $self->put_results(font_face_bad => 1);
     }
   }
-  if (exists($attr->{style})) {
+  if (exists $attr->{style}) {
     if ($attr->{style} =~ /font(?:-size)?:\s*(\d+(?:\.\d*)?|\.\d+)(p[tx])/i) {
-      $self->examine_text_style ($1, $2);
+      $self->examine_text_style($1, $2);
     }
   }
-  if ($tag eq "img") {
-    push @{ $self->{html}{img_src} }, $attr->{src} if exists $attr->{src};
-  }
   if ($tag eq "img" && exists $attr->{width} && exists $attr->{height}) {
     my $width = 0;
     my $height = 0;
@@ -860,32 +573,30 @@
     $height = 200 if $height <= 0;
     if ($width > 0 && $height > 0) {
       $area = $width * $height;
-      $self->{html}{image_area} += $area;
+      $self->{image_area} += $area;
     }
     # this is intended to match any width and height if they're specified
     if (exists $attr->{src} &&
 	$attr->{src} =~ /\.(?:pl|cgi|php|asp|jsp|cfm)\b/i)
     {
-      $self->{html}{web_bugs} = 1;
+      $self->put_results(web_bugs => 1);
     }
   }
   if ($tag eq "form" && exists $attr->{action}) {
-    $self->{html}{form_action_mailto} = 1 if $attr->{action} =~ /mailto:/i
+    $self->put_results(form_action_mailto => 1) if $attr->{action} =~ /mailto:/i
   }
-  if ($tag =~ /^(?:object|embed)$/) {
-    $self->{html}{embeds} = 1;
+  if ($tag eq "object" || $tag eq "embed") {
+    $self->put_results(embeds => 1);
   }
 
   # special text delimiters - <a> and <title>
   if ($tag eq "a") {
-    $self->{html}{anchor_index}++;
-    $self->{html}{anchor}->[$self->{html}{anchor_index}] = "";
+    $self->{anchor_index}++;
+    $self->{anchor}->[$self->{anchor_index}] = "";
   }
   if ($tag eq "title") {
-    $self->{html}{title_index}++;
-    $self->{html}{title}->[$self->{html}{title_index}] = "";
-
-    # $self->{html}{title_extra}++ if $self->{html}{title_index} > 0;
+    $self->{title_index}++;
+    $self->{title}->[$self->{title_index}] = "";
   }
 
   if ($tag eq "meta" &&
@@ -894,29 +605,47 @@
       $attr->{'http-equiv'} =~ /Content-Type/i &&
       $attr->{content} =~ /\bcharset\s*=\s*["']?([^"']+)/i)
   {
-    $self->{html}{charsets} .= exists $self->{html}{charsets} ? " $1" : $1;
+    $self->{charsets} .= exists $self->{charsets} ? " $1" : $1;
   }
 }
 
 sub examine_text_style {
   my ($self, $size, $type) = @_;
   $type = lc $type;
-  $self->{html}{tiny_font} = 1 if ($type eq "pt" && $size < 4);
-  $self->{html}{tiny_font} = 1 if ($type eq "pt" && $size < 4);
-  $self->{html}{big_font} = 1 if ($type eq "pt" && $size > 14);
-  $self->{html}{big_font} = 1 if ($type eq "px" && $size > 18);
+  $self->put_results(tiny_font => 1) if ($type eq "pt" && $size < 4);
+  $self->put_results(tiny_font => 1) if ($type eq "px" && $size < 4);
+  $self->put_results(big_font => 1) if ($type eq "pt" && $size > 14);
+  $self->put_results(big_font => 1) if ($type eq "px" && $size > 18);
 }
 
 sub display_text {
-  my ($self) = @_;
-
-  for my $type ('text', 'visible_text', 'invisible_text') {
-    my $text = $self->{"last_$type"};
+  my $self = shift;
+  my $text = shift;
+  my %display = @_;
+
+  if ($display{whitespace}) {
+    # trim trailing whitespace from previous element if it was not whitespace
+    if (@{ $self->{text} } &&
+	(!defined $self->{text_whitespace} ||
+	 !vec($self->{text_whitespace}, $#{$self->{text}}, 1)))
+    {
+      $self->{text}->[-1] =~ s/ $//;
+    }
+  }
+  else {
     $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g;
-    $text =~ s/^ //;
-    $text =~ s/ $//;
-    push @{$self->{"html_$type"}}, $text;
-    $self->{"last_$type"} = "";
+    # trim leading whitespace if previous element was whitespace
+    if (@{ $self->{text} } &&
+	defined $self->{text_whitespace} &&
+	vec($self->{text_whitespace}, $#{$self->{text}}, 1))
+    {
+      $text =~ s/^ //;
+    }
+  }
+  push @{ $self->{text} }, $text;
+  while (my ($k, $v) = each %display) {
+    $self->{"text_$k"} ||= '';
+    vec($self->{"text_$k"}, $#{$self->{text}}, 1) = $v;
   }
 }
 
@@ -926,104 +655,85 @@
   # note: this comes back from HTML::Parser as UTF-8-tainted.  Enforce byte
   # mode by repacking the string in byte mode, to avoid 'Malformed UTF-8
   # character (unexpected non-continuation byte)' warnings
-  $text = pack ("C0A*", $text);
+  $text = pack("C0A*", $text);
 
   # text that is not part of body
-  if (exists $self->{html}{inside_script} && $self->{html}{inside_script} > 0)
+  if (exists $self->{inside}{script} && $self->{inside}{script} > 0)
   {
     if ($text =~ /\bon(?:blur|contextmenu|focus|load|resize|submit|unload)\b/i)
     {
-      $self->{html}{html_event_unsafe} = 1;
+      $self->put_results(html_event_unsafe => 1);
     }
-    if ($text =~ /\b(?:$events)\b/io) { $self->{html}{html_event} = 1; }
-    # if ($text =~ /\.open\s*\(/) { $self->{html}{window_open} = 1; }
     return;
   }
-  if (exists $self->{html}{inside_style} && $self->{html}{inside_style} > 0) {
+  if (exists $self->{inside}{style} && $self->{inside}{style} > 0) {
     if ($text =~ /font(?:-size)?:\s*(\d+(?:\.\d*)?|\.\d+)(p[tx])/i) {
-      $self->examine_text_style ($1, $2);
+      $self->examine_text_style($1, $2);
     }
     return;
   }
 
   # text that is part of body and also stored separately
-  if (exists $self->{html}{inside_a} && $self->{html}{inside_a} > 0) {
-    $self->{html}{anchor}->[$self->{html}{anchor_index}] .= $text;
+  if (exists $self->{inside}{a} && $self->{inside}{a} > 0) {
+    $self->{anchor}->[$self->{anchor_index}] .= $text;
   }
-  if (exists $self->{html}{inside_title} && $self->{html}{inside_title} > 0) {
-    $self->{html}{title}->[$self->{html}{title_index}] .= $text;
+  if (exists $self->{inside}{title} && $self->{inside}{title} > 0) {
+    $self->{title}->[$self->{title_index}] .= $text;
   }
 
-  my $visible_for_bayes = 1;
+  my $invisible_for_bayes = 0;
   if ($text =~ /[^ \t\n\r\f\x0b\xa0]/) {
-    $visible_for_bayes = $self->html_font_invisible($text);
-    $self->{html}{text_after_body} = 1 if $self->{html}{closed_body};
-    $self->{html}{text_after_html} = 1 if $self->{html}{closed_html};
+    $invisible_for_bayes = $self->html_font_invisible($text);
+    $self->put_results(text_after_body => 1) if $self->{closed_body};
+    $self->put_results(text_after_html => 1) if $self->{closed_html};
   }
 
-  if ($self->{last_text}) {
+  if (exists $self->{text}->[-1]) {
     # ideas discarded since they would be easy to evade:
     # 1. using \w or [A-Za-z] instead of \S or non-punctuation
     # 2. exempting certain tags
     if ($text =~ /^[^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]/s &&
-	$self->{last_text} =~ /[^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]\z/s)
+	$self->{text}->[-1] =~ /[^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]\z/s)
     {
-      $self->{html}{obfuscation}++;
+      $self->{obfuscation}++;
     }
-    if ($self->{last_text} =~
+    if ($self->{text}->[-1] =~
 	/\b([^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]{1,7})\z/s)
     {
       my $start = length($1);
       if ($text =~ /^([^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]{1,7})\b/s) {
-	my $backhair = $start . "_" . length($1);
-	$self->{html}{backhair}->{$backhair}++;
-	$self->{html}{backhair_count} = keys %{ $self->{html}{backhair} };
+	$self->{backhair}->{$start . "_" . length($1)}++;
       }
     }
   }
 
-  if ($visible_for_bayes) {
-    $self->{last_visible_text} .= $text;
+  if ($invisible_for_bayes) {
+    $self->display_text($text, invisible => 1);
   }
   else {
-    $self->{last_invisible_text} .= $text;
+    $self->display_text($text);
   }
-  $self->{last_text} .= $text;
 }
 
 # note: $text includes <!-- and -->
 sub html_comment {
   my ($self, $text) = @_;
 
-  push @{ $self->{html}{comment} }, $text;
+  push @{ $self->{comment} }, $text;
 
-  if ($self->{html_last_tag} eq "div" &&
-      $text =~ /Converted from text\/plain format/)
-  {
-    $self->{html}{div_converted} = 1;
-  }
-  if (exists $self->{html}{inside_script} && $self->{html}{inside_script} > 0)
+  if (exists $self->{inside}{script} && $self->{inside}{script} > 0)
   {
-    if ($text =~ /\b(?:$events)\b/io)
-    {
-      $self->{html}{html_event} = 1;
-    }
     if ($text =~ /\bon(?:blur|contextmenu|focus|load|resize|submit|unload)\b/i)
     {
-      $self->{html}{html_event_unsafe} = 1;
+      $self->put_results(html_event_unsafe => 1);
     }
-    # if ($text =~ /\.open\s*\(/) { $self->{html}{window_open} = 1; }
     return;
   }
-
-  if (exists $self->{html}{inside_style} && $self->{html}{inside_style} > 0) {
+  if (exists $self->{inside}{style} && $self->{inside}{style} > 0) {
     if ($text =~ /font(?:-size)?:\s*(\d+(?:\.\d*)?|\.\d+)(p[tx])/i) {
-      $self->examine_text_style ($1, $2);
+      $self->examine_text_style($1, $2);
     }
-  }
-
-  if (exists $self->{html}{shouting} && $self->{html}{shouting} > 1) {
-    $self->{html}{comment_shouting} = 1;
+    return;
   }
 }
 
@@ -1033,10 +743,318 @@
   if ($text =~ /^<!doctype/i) {
     my $tag = "!doctype";
 
-    $self->{html}{elements}++;
-    $self->{html}{tags}++;
-    $self->{html}{"inside_$tag"} = 0;
+    $self->{elements}++;
+    $self->{tags}++;
+    $self->{inside}{$tag} = 0;
   }
+}
+
+###########################################################################
+
+my %html_color = (
+  # HTML 4 defined 16 colors
+  aqua => 0x00ffff,
+  black => 0x000000,
+  blue => 0x0000ff,
+  fuchsia => 0xff00ff,
+  gray => 0x808080,
+  green => 0x008000,
+  lime => 0x00ff00,
+  maroon => 0x800000,
+  navy => 0x000080,
+  olive => 0x808000,
+  purple => 0x800080,
+  red => 0xff0000,
+  silver => 0xc0c0c0,
+  teal => 0x008080,
+  white => 0xffffff,
+  yellow => 0xffff00,
+  # colors specified in CSS3 color module
+  aliceblue => 0xf0f8ff,
+  antiquewhite => 0xfaebd7,
+  aqua => 0x00ffff,
+  aquamarine => 0x7fffd4,
+  azure => 0xf0ffff,
+  beige => 0xf5f5dc,
+  bisque => 0xffe4c4,
+  black => 0x000000,
+  blanchedalmond => 0xffebcd,
+  blue => 0x0000ff,
+  blueviolet => 0x8a2be2,
+  brown => 0xa52a2a,
+  burlywood => 0xdeb887,
+  cadetblue => 0x5f9ea0,
+  chartreuse => 0x7fff00,
+  chocolate => 0xd2691e,
+  coral => 0xff7f50,
+  cornflowerblue => 0x6495ed,
+  cornsilk => 0xfff8dc,
+  crimson => 0xdc143c,
+  cyan => 0x00ffff,
+  darkblue => 0x00008b,
+  darkcyan => 0x008b8b,
+  darkgoldenrod => 0xb8860b,
+  darkgray => 0xa9a9a9,
+  darkgreen => 0x006400,
+  darkgrey => 0xa9a9a9,
+  darkkhaki => 0xbdb76b,
+  darkmagenta => 0x8b008b,
+  darkolivegreen => 0x556b2f,
+  darkorange => 0xff8c00,
+  darkorchid => 0x9932cc,
+  darkred => 0x8b0000,
+  darksalmon => 0xe9967a,
+  darkseagreen => 0x8fbc8f,
+  darkslateblue => 0x483d8b,
+  darkslategray => 0x2f4f4f,
+  darkslategrey => 0x2f4f4f,
+  darkturquoise => 0x00ced1,
+  darkviolet => 0x9400d3,
+  deeppink => 0xff1493,
+  deepskyblue => 0x00bfff,
+  dimgray => 0x696969,
+  dimgrey => 0x696969,
+  dodgerblue => 0x1e90ff,
+  firebrick => 0xb22222,
+  floralwhite => 0xfffaf0,
+  forestgreen => 0x228b22,
+  fuchsia => 0xff00ff,
+  gainsboro => 0xdcdcdc,
+  ghostwhite => 0xf8f8ff,
+  gold => 0xffd700,
+  goldenrod => 0xdaa520,
+  gray => 0x808080,
+  green => 0x008000,
+  greenyellow => 0xadff2f,
+  grey => 0x808080,
+  honeydew => 0xf0fff0,
+  hotpink => 0xff69b4,
+  indianred => 0xcd5c5c,
+  indigo => 0x4b0082,
+  ivory => 0xfffff0,
+  khaki => 0xf0e68c,
+  lavender => 0xe6e6fa,
+  lavenderblush => 0xfff0f5,
+  lawngreen => 0x7cfc00,
+  lemonchiffon => 0xfffacd,
+  lightblue => 0xadd8e6,
+  lightcoral => 0xf08080,
+  lightcyan => 0xe0ffff,
+  lightgoldenrodyellow => 0xfafad2,
+  lightgray => 0xd3d3d3,
+  lightgreen => 0x90ee90,
+  lightgrey => 0xd3d3d3,
+  lightpink => 0xffb6c1,
+  lightsalmon => 0xffa07a,
+  lightseagreen => 0x20b2aa,
+  lightskyblue => 0x87cefa,
+  lightslategray => 0x778899,
+  lightslategrey => 0x778899,
+  lightsteelblue => 0xb0c4de,
+  lightyellow => 0xffffe0,
+  lime => 0x00ff00,
+  limegreen => 0x32cd32,
+  linen => 0xfaf0e6,
+  magenta => 0xff00ff,
+  maroon => 0x800000,
+  mediumaquamarine => 0x66cdaa,
+  mediumblue => 0x0000cd,
+  mediumorchid => 0xba55d3,
+  mediumpurple => 0x9370db,
+  mediumseagreen => 0x3cb371,
+  mediumslateblue => 0x7b68ee,
+  mediumspringgreen => 0x00fa9a,
+  mediumturquoise => 0x48d1cc,
+  mediumvioletred => 0xc71585,
+  midnightblue => 0x191970,
+  mintcream => 0xf5fffa,
+  mistyrose => 0xffe4e1,
+  moccasin => 0xffe4b5,
+  navajowhite => 0xffdead,
+  navy => 0x000080,
+  oldlace => 0xfdf5e6,
+  olive => 0x808000,
+  olivedrab => 0x6b8e23,
+  orange => 0xffa500,
+  orangered => 0xff4500,
+  orchid => 0xda70d6,
+  palegoldenrod => 0xeee8aa,
+  palegreen => 0x98fb98,
+  paleturquoise => 0xafeeee,
+  palevioletred => 0xdb7093,
+  papayawhip => 0xffefd5,
+  peachpuff => 0xffdab9,
+  peru => 0xcd853f,
+  pink => 0xffc0cb,
+  plum => 0xdda0dd,
+  powderblue => 0xb0e0e6,
+  purple => 0x800080,
+  red => 0xff0000,
+  rosybrown => 0xbc8f8f,
+  royalblue => 0x4169e1,
+  saddlebrown => 0x8b4513,
+  salmon => 0xfa8072,
+  sandybrown => 0xf4a460,
+  seagreen => 0x2e8b57,
+  seashell => 0xfff5ee,
+  sienna => 0xa0522d,
+  silver => 0xc0c0c0,
+  skyblue => 0x87ceeb,
+  slateblue => 0x6a5acd,
+  slategray => 0x708090,
+  slategrey => 0x708090,
+  snow => 0xfffafa,
+  springgreen => 0x00ff7f,
+  steelblue => 0x4682b4,
+  tan => 0xd2b48c,
+  teal => 0x008080,
+  thistle => 0xd8bfd8,
+  tomato => 0xff6347,
+  turquoise => 0x40e0d0,
+  violet => 0xee82ee,
+  wheat => 0xf5deb3,
+  white => 0xffffff,
+  whitesmoke => 0xf5f5f5,
+  yellow => 0xffff00,
+  yellowgreen => 0x9acd32,
+);
+
+sub _name_to_rgb {
+  my $color = lc $_[0];
+  if (my $hex = $html_color{$color}) {
+      return sprintf("#%06x", $hex);
+  }
+  return $color;
+}
+
+use constant URI_STRICT => 0;
+
+# resolving relative URIs as defined in RFC 2396 (steps from section 5.2)
+# using draft http://www.gbiv.com/protocols/uri/rev-2002/rfc2396bis.html
+sub _parse_uri {
+  my ($u) = @_;
+  my %u;
+  ($u{scheme}, $u{authority}, $u{path}, $u{query}, $u{fragment}) =
+    $u =~ m|^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
+  return %u;
+}
+
+sub _remove_dot_segments {
+  my ($input) = @_;
+  my $output = "";
+
+  $input =~ s@^(?:\.\.?/)@/@;
+
+  while ($input) {
+    if ($input =~ s@^/\.(?:$|/)@/@) {
+    }
+    elsif ($input =~ s@^/\.\.(?:$|/)@/@) {
+      $output =~ s@/?[^/]*$@@;
+    }
+    elsif ($input =~ s@(/?[^/]*)@@) {
+      $output .= $1;
+    }
+  }
+  return $output;
+}
+
+sub _merge_uri {
+  my ($base_authority, $base_path, $r_path) = @_;
+
+  if (defined $base_authority && !$base_path) {
+    return "/" . $r_path;
+  }
+  else {
+    if ($base_path =~ m|/|) {
+      $base_path =~ s|(?<=/)[^/]*$||;
+    }
+    else {
+      $base_path = "";
+    }
+    return $base_path . $r_path;
+  }
+}
+
+sub target_uri {
+  my ($base, $r) = @_;
+
+  my %r = _parse_uri($r);	# parsed relative URI
+  my %base = _parse_uri($base);	# parsed base URI
+  my %t;			# generated temporary URI
+
+  if ((not URI_STRICT) and
+      (defined $r{scheme} && defined $base{scheme}) and
+      ($r{scheme} eq $base{scheme}))
+  {
+    undef $r{scheme};
+  }
+
+  if (defined $r{scheme}) {
+    $t{scheme} = $r{scheme};
+    $t{authority} = $r{authority};
+    $t{path} = _remove_dot_segments($r{path});
+    $t{query} = $r{query};
+  }
+  else {
+    if (defined $r{authority}) {
+      $t{authority} = $r{authority};
+      $t{path} = _remove_dot_segments($r{path});
+      $t{query} = $r{query};
+    }
+    else {
+      if ($r{path} eq "") {
+	$t{path} = $base{path};
+	if (defined $r{query}) {
+	  $t{query} = $r{query};
+	}
+	else {
+	  $t{query} = $base{query};
+	}
+      }
+      else {
+	if ($r{path} =~ m|^/|) {
+	  $t{path} = _remove_dot_segments($r{path});
+	}
+	else {
+	  $t{path} = _merge_uri($base{authority}, $base{path}, $r{path});
+	  $t{path} = _remove_dot_segments($t{path});
+	}
+	$t{query} = $r{query};
+      }
+      $t{authority} = $base{authority};
+    }
+    $t{scheme} = $base{scheme};
+  }
+  $t{fragment} = $r{fragment};
+
+  # recompose URI
+  my $result = "";
+  if ($t{scheme}) {
+    $result .= $t{scheme} . ":";
+  }
+  elsif (defined $t{authority}) {
+    # this block is not part of the RFC
+    # TODO: figure out what MUAs actually do with unschemed URIs
+    # maybe look at URI::Heuristic
+    if ($t{authority} =~ /^www\d*\./i) {
+      # some spammers are using unschemed URIs to escape filters
+      $result .= "http:";
+    }
+    elsif ($t{authority} =~ /^ftp\d*\./i) {
+      $result .= "ftp:";
+    }
+  }
+  if ($t{authority}) {
+    $result .= "//" . $t{authority};
+  }
+  $result .= $t{path};
+  if ($t{query}) {
+    $result .= "?" . $t{query};
+  }
+  if ($t{fragment}) {
+    $result .= "#" . $t{fragment};
+  }
+  return $result;
 }
 
 1;

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Message/Node.pm
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Message/Node.pm	(original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Message/Node.pm	Tue Oct 19 14:39:34 2004
@@ -321,18 +321,20 @@
 }
 
 # Look at a text scalar and determine whether it should be rendered
-# as text/html.  Based on a heuristic which simulates a certain
-# well-used/common mail client.
+# as text/html.
 #
-# We don't need to advertise this in the POD doc.
+# This is not a public function.
 # 
-sub _html_near_start {
-  my ($pad) = @_;
-
-  my $count = 0;
-  $count += ($pad =~ tr/\n//d) * 2;
-  $count += ($pad =~ tr/\n//cd);
-  return ($count < 24);
+sub _html_render {
+  if ($_[0] =~ m/^(.{0,18}?<(?:body|head|html|img|pre|table|title)(?:\s.{0,18}?)?>)/is)
+  {
+    my $pad = $1;
+    my $count = 0;
+    $count += ($pad =~ tr/\n//d) * 2;
+    $count += ($pad =~ tr/\n//cd);
+    return ($count < 24);
+  }
+  return 0;
 }
 
 =item rendered()
@@ -351,68 +353,35 @@
   # We don't render anything except text
   return(undef,undef) unless ( $self->{'type'} =~ /^text\b/i );
 
-  if ( !exists $self->{rendered} ) {
+  if (!exists $self->{rendered}) {
     my $text = $self->decode();
     my $raw = length($text);
 
     # render text/html always, or any other text|text/plain part as text/html
     # based on a heuristic which simulates a certain common mail client
-    if ( $raw > 0 && (
-        $self->{'type'} =~ m@^text/html\b@i || (
-        $self->{'type'} =~ m@^text(?:$|/plain)@i &&
-	  $text =~ m/^(.{0,18}?<(?:$Mail::SpamAssassin::HTML::re_start)(?:\s.{0,18}?)?>)/ois &&
-	  _html_near_start($1))
-        )
-       ) 
+    if ($raw > 0 && ($self->{'type'} =~ m@^text/html\b@i ||
+		     ($self->{'type'} =~ m@^text(?:$|/plain)@i &&
+		      _html_render(substr($text, 0, 23)))))
     {
-      $self->{'rendered_type'} = 'text/html';
-      my $html = Mail::SpamAssassin::HTML->new(); # object
-      my @lines = @{$html->html_render($text)};
-      $self->{rendered} = join('', @lines);
-      $self->{html_results} = $html->get_results(); # needed in eval tests
-
-      # the visible text parts of the message; all invisible or low-contrast
-      # text removed.  TODO: wonder if we should just replace 
-      # $self->{rendered} with this?
-      $self->{invisible_rendered} = join('',
-                                @{$html->{html_invisible_text}});
-      $self->{visible_rendered} = join('',
-                                @{$html->{html_visible_text}});
-
-      # some tests done after rendering
-      my $r = $self->{html_results}; # temporary reference for brevity
-      $r->{html_message} = 1;
-      $r->{html_length} = 0;
-      my $space = 0;
-      for my $line (@lines) {
-        $line = pack ('C0A*', $line);
-        $space += ($line =~ tr/ \t\n\r\x0b\xa0/ \t\n\r\x0b\xa0/);
-        $r->{html_length} += length($line);
-      }
+      $self->{rendered_type} = 'text/html';
+
+      my $html = Mail::SpamAssassin::HTML->new();	# object
+      $html->parse($text);				# parse+render text
+      $self->{rendered} = $html->get_rendered_text();
+      $self->{visible_rendered} = $html->get_rendered_text(invisible => 0);
+      $self->{invisible_rendered} = $html->get_rendered_text(invisible => 1);
+      $self->{html_results} = $html->get_results();
+
+      # end-of-document result values that require looking at the text
+      my $r = $self->{html_results};	# temporary reference for brevity
+      my $space = ($self->{rendered} =~ tr/ \t\n\r\x0b\xa0/ \t\n\r\x0b\xa0/);
+      $r->{html_length} = length($self->{rendered});
       $r->{non_space_len} = $r->{html_length} - $space;
       $r->{ratio} = ($raw - $r->{html_length}) / $raw;
-      if (exists $r->{elements} && exists $r->{tags}) {
-	$r->{bad_tag_ratio} = ($r->{tags} - $r->{elements}) / $r->{tags};
-      }
-      if (exists $r->{elements_seen} && exists $r->{tags_seen}) {
-	$r->{non_element_ratio} =
-	    ($r->{tags_seen} - $r->{elements_seen}) / $r->{tags_seen};
-      }
-      if (exists $r->{tags} && exists $r->{obfuscation}) {
-	$r->{obfuscation_ratio} = $r->{obfuscation} / $r->{tags};
-      }
-      if (exists $r->{attr_bad} && exists $r->{attr_all}) {
-	$r->{attr_bad} = $r->{attr_bad} / $r->{attr_all};
-      }
-      if (exists $r->{attr_unique_bad} && exists $r->{attr_unique_all}) {
-	$r->{attr_unique_bad} = $r->{attr_unique_bad} / $r->{attr_unique_all};
-      }
     }
     else {
       $self->{rendered_type} = $self->{type};
       $self->{rendered} = $text;
-      $self->{invisible_rendered} = '';
-      $self->{visible_rendered} = $text;
     }
   }
 

Modified: spamassassin/trunk/rules/20_html_tests.cf
==============================================================================
--- spamassassin/trunk/rules/20_html_tests.cf	(original)
+++ spamassassin/trunk/rules/20_html_tests.cf	Tue Oct 19 14:39:34 2004
@@ -29,7 +29,7 @@
 # please sort these by eval type then name
 
 # HTML control test, HTML spam rules should all have better S/O than this
-body HTML_MESSAGE		eval:html_test('html_message')
+body HTML_MESSAGE		eval:html_test('html')
 describe HTML_MESSAGE		HTML included in message
 
 # the HTML percentage range