You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by qu...@apache.org on 2004/10/19 23:39:34 UTC
svn commit: rev 55103 - in spamassassin/trunk: lib/Mail/SpamAssassin lib/Mail/SpamAssassin/Message rules
Author: quinlan
Date: Tue Oct 19 14:39:34 2004
New Revision: 55103
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Message/Node.pm
spamassassin/trunk/rules/20_html_tests.cf
Log:
bug 3907: clean up HTML module
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm Tue Oct 19 14:39:34 2004
@@ -2999,9 +2999,9 @@
$rawtag =~ /^([a-zA-Z0-9]+)$/; my $tag = $1;
$rawexpr =~ /^([\<\>\=\!\-\+ 0-9]+)$/; my $expr = $1;
- return 0 unless exists $self->{html}{"inside_$tag"};
+ return 0 unless exists $self->{html}{inside}{$tag};
- $self->{html}{"inside_$tag"} =~ /^([\<\>\=\!\-\+ 0-9]+)$/;
+ $self->{html}{inside}{$tag} =~ /^([\<\>\=\!\-\+ 0-9]+)$/;
my $val = $1;
return eval "\$val $expr";
}
@@ -3009,7 +3009,7 @@
sub html_image_only {
my ($self, undef, $min, $max) = @_;
- return (exists $self->{html}{"inside_img"} &&
+ return (exists $self->{html}{inside}{img} &&
exists $self->{html}{length} &&
$self->{html}{length} > $min &&
$self->{html}{length} <= $max);
@@ -3048,7 +3048,7 @@
sub html_tag_exists {
my ($self, undef, $tag) = @_;
- return exists $self->{html}{"inside_$tag"};
+ return exists $self->{html}{inside}{$tag};
}
sub html_test {
@@ -3166,19 +3166,19 @@
# in one part will be the same in other parts.
#
if ($type eq 'text/html') {
- foreach my $w (grep(/\w/,split(/\s+/,$rnd))) {
+ foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
#dbg("eval: HTML: $w");
$html{$w}++;
}
# If there are no words, mark if there's at least 1 image ...
- if (keys %html == 0 && exists $self->{html}{"inside_img"}) {
+ if (keys %html == 0 && exists $self->{html}{inside}{img}) {
# Use "\n" as the mark since it can't ever occur normally
$html{"\n"}=1;
}
}
else {
- foreach my $w (grep(/\w/,split(/\s+/,$rnd))) {
+ foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
#dbg("eval: TEXT: $w");
$text{$w}++;
}
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm Tue Oct 19 14:39:34 2004
@@ -23,18 +23,13 @@
use bytes;
package Mail::SpamAssassin::HTML;
-
-require Exporter;
-my @ISA = qw(Exporter);
-my @EXPORT = qw($re_start $re_loose $re_strict get_results);
-my @EXPORT_OK = qw();
-
use HTML::Parser 3.24 ();
-use vars qw($re_start $re_loose $re_strict $re_other);
+use vars qw($re_loose $re_strict $re_other @ISA @EXPORT @EXPORT_OK);
-# elements that trigger HTML rendering in text/plain in some mail clients
-# (repeats ones listed in $re_strict)
-$re_start = 'body|head|html|img|pre|table|title';
+require Exporter;
+@ISA = qw(HTML::Parser Exporter);
+@EXPORT = qw($re_loose $re_strict get_results);
+@EXPORT_OK = qw();
# elements defined by the HTML 4.01 and XHTML 1.0 DTDs (do not change them!)
$re_loose = 'applet|basefont|center|dir|font|frame|frameset|iframe|isindex|menu|noframes|s|strike|u';
@@ -68,25 +63,31 @@
background => [qw(body marquee)],
);
-my %tested_colors;
-
sub new {
- my $this = shift;
- my $class = ref($this) || $this;
- my $self = {};
- bless($self, $class);
-
- $self->html_start();
-
- return $self;
+ my ($class) = @_;
+ my $self = $class->SUPER::new(
+ api_version => 3,
+ handlers => [
+ start_document => ["html_start", "self"],
+ start => ["html_tag", "self,tagname,attr,'+1'"],
+ end_document => ["html_end", "self"],
+ end => ["html_tag", "self,tagname,attr,'-1'"],
+ text => ["html_text", "self,dtext"],
+ comment => ["html_comment", "self,text"],
+ declaration => ["html_declaration", "self,text"],
+ ],
+ marked_sections => 1);
+ $self;
}
sub html_start {
my ($self) = @_;
- $self->{basefont} = 3;
+ # trigger HTML_MESSAGE
+ $self->put_results(html => 1);
- undef $self->{text_style};
+ # initial display attributes
+ $self->{basefont} = 3;
my %default = (tag => "default",
fgcolor => "#000000",
bgcolor => "#ffffff",
@@ -97,43 +98,104 @@
sub html_end {
my ($self) = @_;
- $self->display_text();
+ delete $self->{text_style};
+
+ # final results scalars
+ $self->put_results(image_area => $self->{image_area});
+ $self->put_results(max_shouting => $self->{max_shouting});
+ $self->put_results(length => $self->{length});
+ $self->put_results(min_size => $self->{min_size});
+ $self->put_results(max_size => $self->{max_size});
+
+ # final result arrays
+ $self->put_results(comment => $self->{comment});
+ $self->put_results(title => $self->{title});
+ $self->put_results(anchor => $self->{anchor});
+
+ # final result hashes
+ $self->put_results(inside => $self->{inside});
+ $self->put_results(uri => $self->{uri});
+
+ # end-of-document result values that don't require looking at the text
+ if (exists $self->{backhair}) {
+ $self->put_results(backhair_count => scalar keys %{ $self->{backhair} });
+ }
+ if (exists $self->{elements} && exists $self->{tags}) {
+ $self->put_results(bad_tag_ratio =>
+ ($self->{tags} - $self->{elements}) / $self->{tags});
+ }
+ if (exists $self->{elements_seen} && exists $self->{tags_seen}) {
+ $self->put_results(non_element_ratio =>
+ ($self->{tags_seen} - $self->{elements_seen}) /
+ $self->{tags_seen});
+ }
+ if (exists $self->{tags} && exists $self->{obfuscation}) {
+ $self->put_results(obfuscation_ratio =>
+ $self->{obfuscation} / $self->{tags});
+ }
+ if (exists $self->{attr_bad} && exists $self->{attr_all}) {
+ $self->put_results(attr_bad => $self->{attr_bad} / $self->{attr_all});
+ }
+ if (exists $self->{attr_unique_bad} && exists $self->{attr_unique_all}) {
+ $self->put_results(attr_unique_bad =>
+ $self->{attr_unique_bad} / $self->{attr_unique_all});
+ }
+}
+
+sub put_results {
+ my $self = shift;
+ my %results = @_;
+
+ while (my ($k, $v) = each %results) {
+ $self->{results}{$k} = $v;
+ }
}
sub get_results {
my ($self) = @_;
- return $self->{html};
+ return $self->{results};
+}
+
+sub get_rendered_text {
+ my $self = shift;
+ my %options = @_;
+
+ return join('', @{ $self->{text} }) unless keys %options;
+
+ my $mask;
+ while (my ($k, $v) = each %options) {
+ next if !defined $self->{"text_$k"};
+ if (!defined $mask) {
+ $mask |= $v ? $self->{"text_$k"} : ~ $self->{"text_$k"};
+ }
+ else {
+ $mask &= $v ? $self->{"text_$k"} : ~ $self->{"text_$k"};
+ }
+ }
+
+ my $text = '';
+ my $i = 0;
+ for (@{ $self->{text} }) { $text .= $_ if vec($mask, $i++, 1); }
+ return $text;
}
-sub html_render {
+sub parse {
my ($self, $text) = @_;
- # clean this up later
- for my $key (keys %{ $self->{html} }) {
- delete $self->{html}{$key};
- }
-
- $self->{html}{ratio} = 0;
- $self->{html}{image_area} = 0;
- $self->{html}{shouting} = 0;
- $self->{html}{max_shouting} = 0;
- $self->{html}{anchor_index} = -1;
- $self->{html}{title_index} = -1;
- $self->{html}{max_size} = 3; # start at default size
- $self->{html}{min_size} = 3; # start at default size
-
- $self->{html_text} = [];
- $self->{html_visible_text} = [];
- $self->{html_invisible_text} = [];
- $self->{last_text} = "";
- $self->{last_visible_text} = "";
- $self->{last_invisible_text} = "";
- $self->{html_last_tag} = 0;
- $self->{html}{closed_html} = 0;
- $self->{html}{closed_body} = 0;
+ $self->{image_area} = 0;
+ $self->{shouting} = 0;
+ $self->{max_shouting} = 0;
+ $self->{anchor_index} = -1;
+ $self->{title_index} = -1;
+ $self->{max_size} = 3; # start at default size
+ $self->{min_size} = 3; # start at default size
+ $self->{closed_html} = 0;
+ $self->{closed_body} = 0;
+ $self->{text} = []; # rendered text
+ $self->{text_invisible} = ''; # vec of invisibility state in $self->{text}
- $self->{html}{length} += $1 if (length($text) =~ m/^(\d+)$/); # untaint
+ $self->{length} += $1 if (length($text) =~ m/^(\d+)$/); # untaint
# NOTE: We *only* need to fix the rendering when we verify that it
# differs from what people see in their MUA. Testing is best done with
@@ -159,32 +221,17 @@
$text =~ s/<\/(?:\s.*?)?>//gs;
}
- # HTML::Parser 3.31, at least, converts into a question mark "?" for some reason.
- # Let's convert them to spaces.
+ # HTML::Parser converts into a question mark ("?") for some
+ # reason, so convert them to spaces. Confirmed in 3.31, at least.
$text =~ s/ / /g;
- my $hp = HTML::Parser->new(
- api_version => 3,
- handlers => [
- start_document => [sub { $self->html_start(@_) }],
- start => [sub { $self->html_tag(@_) }, "tagname,attr,'+1'"],
- end_document => [sub { $self->html_end(@_) }],
- end => [sub { $self->html_tag(@_) }, "tagname,attr,'-1'"],
- text => [sub { $self->html_text(@_) }, "dtext"],
- comment => [sub { $self->html_comment(@_) }, "text"],
- declaration => [sub { $self->html_declaration(@_) }, "text"],
- ],
- marked_sections => 1);
-
# ALWAYS pack it into byte-representation, even if we're using 'use bytes',
# since the HTML::Parser object may use Unicode internally.
# (bug 1417, maybe)
- $hp->parse(pack ('C0A*', $text));
- $hp->eof;
+ $self->SUPER::parse(pack('C0A*', $text));
+ $self->SUPER::eof;
- delete $self->{html_last_tag};
-
- return $self->{html_text};
+ return $self->{text};
}
sub html_tag {
@@ -194,22 +241,22 @@
# general tracking
if ($is_element) {
- $self->{html}{elements}++;
- $self->{html}{elements_seen}++ if !exists $self->{html}{"inside_$tag"};
+ $self->{elements}++;
+ $self->{elements_seen}++ if !exists $self->{inside}{$tag};
}
- $self->{html}{tags}++;
- $self->{html}{tags_seen}++ if !exists $self->{html}{"inside_$tag"};
- $self->{html}{"inside_$tag"} += $num;
- $self->{html}{"inside_$tag"} = 0 if $self->{html}{"inside_$tag"} < 0;
+ $self->{tags}++;
+ $self->{tags_seen}++ if !exists $self->{inside}{$tag};
+ $self->{inside}{$tag} += $num;
+ $self->{inside}{$tag} = 0 if $self->{inside}{$tag} < 0;
# check attributes
for my $name (keys %$attr) {
if ($name !~ /^(?:$re_attr|$re_attr_extra)$/io) {
- $self->{html}{attr_bad}++;
- $self->{html}{attr_unique_bad}++ if !exists $self->{"attr_seen_$name"};
+ $self->{attr_bad}++;
+ $self->{attr_unique_bad}++ if !exists $self->{"attr_seen_$name"};
}
- $self->{html}{attr_all}++;
- $self->{html}{attr_unique_all}++ if !exists $self->{"attr_seen_$name"};
+ $self->{attr_all}++;
+ $self->{attr_unique_all}++ if !exists $self->{"attr_seen_$name"};
$self->{"attr_seen_$name"} = 1;
}
@@ -231,173 +278,32 @@
}
# end tags
elsif ($num == -1) {
- $self->{html}{closed_html} = 1 if $tag eq "html";
- $self->{html}{closed_body} = 1 if $tag eq "body";
+ $self->{closed_html} = 1 if $tag eq "html";
+ $self->{closed_body} = 1 if $tag eq "body";
}
# shouting
if ($tag =~ /^(?:b|i|u|strong|em|big|center|h\d)$/) {
- $self->{html}{shouting} += $num;
- if ($self->{html}{shouting} > $self->{html}{max_shouting}) {
- $self->{html}{max_shouting} = $self->{html}{shouting};
+ $self->{shouting} += $num;
+ if ($self->{shouting} > $self->{max_shouting}) {
+ $self->{max_shouting} = $self->{shouting};
}
}
-
- $self->{html_last_tag} = (($num < 0) ? "/" : "") . $tag;
}
}
sub html_format {
my ($self, $tag, $attr, $num) = @_;
- # ordered by frequency of tag groups
+ # ordered by frequency of tag groups, note: whitespace is always "visible"
if ($tag eq "br" || $tag eq "div") {
- $self->display_text();
- push @{$self->{html_visible_text}}, "\n";
- push @{$self->{html_invisible_text}}, "\n";
- push @{$self->{html_text}}, "\n";
+ $self->display_text("\n", whitespace => 1);
}
elsif ($tag =~ /^(?:li|t[hd]|d[td])$/) {
- $self->display_text();
- push @{$self->{html_visible_text}}, " ";
- push @{$self->{html_invisible_text}}, " ";
- push @{$self->{html_text}}, " ";
+ $self->display_text(" ", whitespace => 1);
}
elsif ($tag =~ /^(?:p|hr|blockquote|pre)$/) {
- $self->display_text();
- push @{$self->{html_visible_text}}, "\n\n";
- push @{$self->{html_invisible_text}}, "\n\n";
- push @{$self->{html_text}}, "\n\n";
- }
-}
-
-use constant URI_STRICT => 0;
-
-# resolving relative URIs as defined in RFC 2396 (steps from section 5.2)
-# using draft http://www.gbiv.com/protocols/uri/rev-2002/rfc2396bis.html
-sub parse_uri {
- my ($u) = @_;
- my %u;
- ($u{scheme}, $u{authority}, $u{path}, $u{query}, $u{fragment}) =
- $u =~ m|^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
- return %u;
-}
-
-sub remove_dot_segments {
- my ($input) = @_;
- my $output = "";
-
- $input =~ s@^(?:\.\.?/)@/@;
-
- while ($input) {
- if ($input =~ s@^/\.(?:$|/)@/@) {
- }
- elsif ($input =~ s@^/\.\.(?:$|/)@/@) {
- $output =~ s@/?[^/]*$@@;
- }
- elsif ($input =~ s@(/?[^/]*)@@) {
- $output .= $1;
- }
- }
- return $output;
-}
-
-sub merge_uri {
- my ($base_authority, $base_path, $r_path) = @_;
-
- if (defined $base_authority && !$base_path) {
- return "/" . $r_path;
- }
- else {
- if ($base_path =~ m|/|) {
- $base_path =~ s|(?<=/)[^/]*$||;
- }
- else {
- $base_path = "";
- }
- return $base_path . $r_path;
- }
-}
-
-sub target_uri {
- my ($base, $r) = @_;
-
- my %r = parse_uri($r); # parsed relative URI
- my %base = parse_uri($base); # parsed base URI
- my %t; # generated temporary URI
-
- if ((not URI_STRICT) and
- (defined $r{scheme} && defined $base{scheme}) and
- ($r{scheme} eq $base{scheme}))
- {
- undef $r{scheme};
- }
-
- if (defined $r{scheme}) {
- $t{scheme} = $r{scheme};
- $t{authority} = $r{authority};
- $t{path} = remove_dot_segments($r{path});
- $t{query} = $r{query};
- }
- else {
- if (defined $r{authority}) {
- $t{authority} = $r{authority};
- $t{path} = remove_dot_segments($r{path});
- $t{query} = $r{query};
- }
- else {
- if ($r{path} eq "") {
- $t{path} = $base{path};
- if (defined $r{query}) {
- $t{query} = $r{query};
- }
- else {
- $t{query} = $base{query};
- }
- }
- else {
- if ($r{path} =~ m|^/|) {
- $t{path} = remove_dot_segments($r{path});
- }
- else {
- $t{path} = merge_uri($base{authority}, $base{path}, $r{path});
- $t{path} = remove_dot_segments($t{path});
- }
- $t{query} = $r{query};
- }
- $t{authority} = $base{authority};
- }
- $t{scheme} = $base{scheme};
- }
- $t{fragment} = $r{fragment};
-
- # recompose URI
- my $result = "";
- if ($t{scheme}) {
- $result .= $t{scheme} . ":";
- }
- elsif (defined $t{authority}) {
- # this block is not part of the RFC
- # TODO: figure out what MUAs actually do with unschemed URIs
- # maybe look at URI::Heuristic
- if ($t{authority} =~ /^www\d*\./i) {
- # some spammers are using unschemed URIs to escape filters
- $result .= "http:";
- }
- elsif ($t{authority} =~ /^ftp\d*\./i) {
- $result .= "ftp:";
- }
- }
- if ($t{authority}) {
- $result .= "//" . $t{authority};
- }
- $result .= $t{path};
- if ($t{query}) {
- $result .= "?" . $t{query};
- }
- if ($t{fragment}) {
- $result .= "#" . $t{fragment};
+ $self->display_text("\n\n", whitespace => 1);
}
- return $result;
}
sub push_uri {
@@ -409,8 +315,8 @@
$uri =~ s/^\s+//;
$uri =~ s/\s+$//;
- my $target = target_uri($self->{html}{base_href} || "", $uri);
- push @{$self->{html}{uri}}, $target if $target;
+ my $target = target_uri($self->{base_href} || "", $uri);
+ push @{ $self->{uri} }, $target if $target;
}
sub html_uri {
@@ -434,7 +340,7 @@
# use <BASE HREF="URI"> to turn relative links into absolute links
# even if it is a base URI, handle like a normal URI as well
- push @{$self->{html}{uri}}, $uri;
+ push @{ $self->{uri} }, $uri;
# a base URI will be ignored by browsers unless it is an absolute
# URI of a standard protocol
@@ -445,188 +351,12 @@
# Make sure it ends in a slash
$uri .= "/" unless $uri =~ m@/$@;
- $self->{html}{base_href} = $uri;
+ $self->{base_href} = $uri;
}
}
}
}
-my %html_color = (
- # HTML 4 defined 16 colors
- aqua => 0x00ffff,
- black => 0x000000,
- blue => 0x0000ff,
- fuchsia => 0xff00ff,
- gray => 0x808080,
- green => 0x008000,
- lime => 0x00ff00,
- maroon => 0x800000,
- navy => 0x000080,
- olive => 0x808000,
- purple => 0x800080,
- red => 0xff0000,
- silver => 0xc0c0c0,
- teal => 0x008080,
- white => 0xffffff,
- yellow => 0xffff00,
- # colors specified in CSS3 color module
- aliceblue => 0xf0f8ff,
- antiquewhite => 0xfaebd7,
- aqua => 0x00ffff,
- aquamarine => 0x7fffd4,
- azure => 0xf0ffff,
- beige => 0xf5f5dc,
- bisque => 0xffe4c4,
- black => 0x000000,
- blanchedalmond => 0xffebcd,
- blue => 0x0000ff,
- blueviolet => 0x8a2be2,
- brown => 0xa52a2a,
- burlywood => 0xdeb887,
- cadetblue => 0x5f9ea0,
- chartreuse => 0x7fff00,
- chocolate => 0xd2691e,
- coral => 0xff7f50,
- cornflowerblue => 0x6495ed,
- cornsilk => 0xfff8dc,
- crimson => 0xdc143c,
- cyan => 0x00ffff,
- darkblue => 0x00008b,
- darkcyan => 0x008b8b,
- darkgoldenrod => 0xb8860b,
- darkgray => 0xa9a9a9,
- darkgreen => 0x006400,
- darkgrey => 0xa9a9a9,
- darkkhaki => 0xbdb76b,
- darkmagenta => 0x8b008b,
- darkolivegreen => 0x556b2f,
- darkorange => 0xff8c00,
- darkorchid => 0x9932cc,
- darkred => 0x8b0000,
- darksalmon => 0xe9967a,
- darkseagreen => 0x8fbc8f,
- darkslateblue => 0x483d8b,
- darkslategray => 0x2f4f4f,
- darkslategrey => 0x2f4f4f,
- darkturquoise => 0x00ced1,
- darkviolet => 0x9400d3,
- deeppink => 0xff1493,
- deepskyblue => 0x00bfff,
- dimgray => 0x696969,
- dimgrey => 0x696969,
- dodgerblue => 0x1e90ff,
- firebrick => 0xb22222,
- floralwhite => 0xfffaf0,
- forestgreen => 0x228b22,
- fuchsia => 0xff00ff,
- gainsboro => 0xdcdcdc,
- ghostwhite => 0xf8f8ff,
- gold => 0xffd700,
- goldenrod => 0xdaa520,
- gray => 0x808080,
- green => 0x008000,
- greenyellow => 0xadff2f,
- grey => 0x808080,
- honeydew => 0xf0fff0,
- hotpink => 0xff69b4,
- indianred => 0xcd5c5c,
- indigo => 0x4b0082,
- ivory => 0xfffff0,
- khaki => 0xf0e68c,
- lavender => 0xe6e6fa,
- lavenderblush => 0xfff0f5,
- lawngreen => 0x7cfc00,
- lemonchiffon => 0xfffacd,
- lightblue => 0xadd8e6,
- lightcoral => 0xf08080,
- lightcyan => 0xe0ffff,
- lightgoldenrodyellow => 0xfafad2,
- lightgray => 0xd3d3d3,
- lightgreen => 0x90ee90,
- lightgrey => 0xd3d3d3,
- lightpink => 0xffb6c1,
- lightsalmon => 0xffa07a,
- lightseagreen => 0x20b2aa,
- lightskyblue => 0x87cefa,
- lightslategray => 0x778899,
- lightslategrey => 0x778899,
- lightsteelblue => 0xb0c4de,
- lightyellow => 0xffffe0,
- lime => 0x00ff00,
- limegreen => 0x32cd32,
- linen => 0xfaf0e6,
- magenta => 0xff00ff,
- maroon => 0x800000,
- mediumaquamarine => 0x66cdaa,
- mediumblue => 0x0000cd,
- mediumorchid => 0xba55d3,
- mediumpurple => 0x9370db,
- mediumseagreen => 0x3cb371,
- mediumslateblue => 0x7b68ee,
- mediumspringgreen => 0x00fa9a,
- mediumturquoise => 0x48d1cc,
- mediumvioletred => 0xc71585,
- midnightblue => 0x191970,
- mintcream => 0xf5fffa,
- mistyrose => 0xffe4e1,
- moccasin => 0xffe4b5,
- navajowhite => 0xffdead,
- navy => 0x000080,
- oldlace => 0xfdf5e6,
- olive => 0x808000,
- olivedrab => 0x6b8e23,
- orange => 0xffa500,
- orangered => 0xff4500,
- orchid => 0xda70d6,
- palegoldenrod => 0xeee8aa,
- palegreen => 0x98fb98,
- paleturquoise => 0xafeeee,
- palevioletred => 0xdb7093,
- papayawhip => 0xffefd5,
- peachpuff => 0xffdab9,
- peru => 0xcd853f,
- pink => 0xffc0cb,
- plum => 0xdda0dd,
- powderblue => 0xb0e0e6,
- purple => 0x800080,
- red => 0xff0000,
- rosybrown => 0xbc8f8f,
- royalblue => 0x4169e1,
- saddlebrown => 0x8b4513,
- salmon => 0xfa8072,
- sandybrown => 0xf4a460,
- seagreen => 0x2e8b57,
- seashell => 0xfff5ee,
- sienna => 0xa0522d,
- silver => 0xc0c0c0,
- skyblue => 0x87ceeb,
- slateblue => 0x6a5acd,
- slategray => 0x708090,
- slategrey => 0x708090,
- snow => 0xfffafa,
- springgreen => 0x00ff7f,
- steelblue => 0x4682b4,
- tan => 0xd2b48c,
- teal => 0x008080,
- thistle => 0xd8bfd8,
- tomato => 0xff6347,
- turquoise => 0x40e0d0,
- violet => 0xee82ee,
- wheat => 0xf5deb3,
- white => 0xffffff,
- whitesmoke => 0xf5f5f5,
- yellow => 0xffff00,
- yellowgreen => 0x9acd32,
-);
-
-sub name_to_rgb {
- my $color = lc $_[0];
- if (my $hex = $html_color{$color}) {
- return sprintf("#%06x", $hex);
- }
- return $color;
-}
-
# this might not be quite right, may need to pay attention to table nesting
sub close_table_tag {
my ($self, $tag) = @_;
@@ -636,7 +366,7 @@
my $top;
while (@{ $self->{text_style} } && ($top = $self->{text_style}[-1]->{tag})) {
- if (($tag eq "td" && $top =~ /^(?:font|td)$/) ||
+ if (($tag eq "td" && ($top eq "font" || $top eq "td")) ||
($tag eq "tr" && $top =~ /^(?:font|td|tr)$/))
{
pop @{ $self->{text_style} };
@@ -713,9 +443,9 @@
# tag attributes
for my $name (keys %$attr) {
next unless (grep { $_ eq $tag } @{ $ok_attribute{$name} });
- if ($name =~ /^(?:text|color)$/) {
+ if ($name eq "text" || $name eq "color") {
# two different names for text color
- $new{fgcolor} = name_to_rgb(lc($attr->{$name}));
+ $new{fgcolor} = _name_to_rgb($attr->{$name});
}
elsif ($name eq "size" && $attr->{size} =~ /^\s*([+-]\d+)/) {
# relative font size
@@ -724,7 +454,7 @@
else {
if ($name eq "bgcolor") {
# overwrite with hex value, $new{bgcolor} is set below
- $attr->{bgcolor} = name_to_rgb(lc($attr->{bgcolor}));
+ $attr->{bgcolor} = _name_to_rgb($attr->{bgcolor});
}
if ($name eq "size" && $attr->{size} !~ /^\s*([+-])(\d+)/) {
# attribute is malformed
@@ -734,11 +464,11 @@
$new{$name} = $attr->{$name};
}
}
- if ($new{size} > $self->{html}{max_size}) {
- $self->{html}{max_size} = $new{size};
+ if ($new{size} > $self->{max_size}) {
+ $self->{max_size} = $new{size};
}
- elsif ($new{size} < $self->{html}{min_size}) {
- $self->{html}{min_size} = $new{size};
+ elsif ($new{size} < $self->{min_size}) {
+ $self->{min_size} = $new{size};
}
}
push @{ $self->{text_style} }, \%new;
@@ -757,12 +487,11 @@
my $fg = $self->{text_style}[-1]->{fgcolor};
my $bg = $self->{text_style}[-1]->{bgcolor};
- my $visible_for_bayes = 1;
# invisibility
if (substr($fg,-6) eq substr($bg,-6)) {
- $self->{html}{font_invisible} = 1;
- $visible_for_bayes = 0;
+ $self->put_results(font_invisible => 1);
+ return 1;
}
# near-invisibility
elsif ($fg =~ /^\#?([0-9a-f]{2})([0-9a-f]{2})([0-9a-f]{2})$/) {
@@ -784,63 +513,47 @@
# increases (near-invisible text is at about 0.95% of spam and
# 1.25% of HTML spam right now), but please test any changes first
if ($distance < 12) {
- $self->{html}{"font_low_contrast"} = 1;
- $visible_for_bayes = 0;
+ $self->put_results(font_low_contrast => 1);
+ return 1;
}
}
}
- return $visible_for_bayes;
+ return 0;
}
sub html_tests {
my ($self, $tag, $attr, $num) = @_;
- local ($_);
- if ($tag eq "table" && exists $attr->{border} && $attr->{border} =~ /(\d+)/)
- {
- $self->{html}{thick_border} = 1 if $1 > 1;
- }
- # if ($tag eq "script") {
- # $self->{html}{javascript} = 1;
- # }
if ($tag =~ /^(?:a|body|div|input|form|td|layer|area|img)$/i) {
- for (keys %$attr) {
- if (/\b(?:$events)\b/io)
- {
- $self->{html}{html_event} = 1;
- }
- if (/\bon(?:contextmenu|load|resize|submit|unload)\b/i &&
- $attr->{$_})
+ for my $key (keys %$attr) {
+ if ($key =~ /\bon(?:contextmenu|load|resize|submit|unload)\b/i &&
+ $attr->{$key})
{
- $self->{html}{html_event_unsafe} = 1;
- # if ($attr->{$_} =~ /\.open\s*\(/) { $self->{html}{window_open} = 1; }
+ $self->put_results(html_event_unsafe => 1);
}
}
}
if ($tag eq "font" && exists $attr->{size}) {
my $size = $attr->{size};
- $self->{html}{tiny_font} = 1 if (($size =~ /^\s*(\d+)/ && $1 < 1) ||
- ($size =~ /\-(\d+)/ && $1 >= 3));
- $self->{html}{big_font} = 1 if (($size =~ /^\s*(\d+)/ && $1 > 3) ||
- ($size =~ /\+(\d+)/ && $1 >= 1));
+ $self->put_results(tiny_font => 1) if (($size =~ /^\s*(\d+)/ && $1 < 1) ||
+ ($size =~ /\-(\d+)/ && $1 >= 3));
+ $self->put_results(big_font => 1) if (($size =~ /^\s*(\d+)/ && $1 > 3) ||
+ ($size =~ /\+(\d+)/ && $1 >= 1));
}
if ($tag eq "font" && exists $attr->{face}) {
if ($attr->{face} =~ /[A-Z]{3}/ && $attr->{face} !~ /M[ST][A-Z]|ITC/) {
- $self->{html}{font_face_caps} = 1;
+ $self->put_results(font_face_caps => 1);
}
if ($attr->{face} !~ /^[a-z][a-z -]*[a-z](?:,\s*[a-z][a-z -]*[a-z])*$/i) {
- $self->{html}{font_face_bad} = 1;
+ $self->put_results(font_face_bad => 1);
}
}
- if (exists($attr->{style})) {
+ if (exists $attr->{style}) {
if ($attr->{style} =~ /font(?:-size)?:\s*(\d+(?:\.\d*)?|\.\d+)(p[tx])/i) {
- $self->examine_text_style ($1, $2);
+ $self->examine_text_style($1, $2);
}
}
- if ($tag eq "img") {
- push @{ $self->{html}{img_src} }, $attr->{src} if exists $attr->{src};
- }
if ($tag eq "img" && exists $attr->{width} && exists $attr->{height}) {
my $width = 0;
my $height = 0;
@@ -860,32 +573,30 @@
$height = 200 if $height <= 0;
if ($width > 0 && $height > 0) {
$area = $width * $height;
- $self->{html}{image_area} += $area;
+ $self->{image_area} += $area;
}
# this is intended to match any width and height if they're specified
if (exists $attr->{src} &&
$attr->{src} =~ /\.(?:pl|cgi|php|asp|jsp|cfm)\b/i)
{
- $self->{html}{web_bugs} = 1;
+ $self->put_results(web_bugs => 1);
}
}
if ($tag eq "form" && exists $attr->{action}) {
- $self->{html}{form_action_mailto} = 1 if $attr->{action} =~ /mailto:/i
+ $self->put_results(form_action_mailto => 1) if $attr->{action} =~ /mailto:/i
}
- if ($tag =~ /^(?:object|embed)$/) {
- $self->{html}{embeds} = 1;
+ if ($tag eq "object" || $tag eq "embed") {
+ $self->put_results(embeds => 1);
}
# special text delimiters - <a> and <title>
if ($tag eq "a") {
- $self->{html}{anchor_index}++;
- $self->{html}{anchor}->[$self->{html}{anchor_index}] = "";
+ $self->{anchor_index}++;
+ $self->{anchor}->[$self->{anchor_index}] = "";
}
if ($tag eq "title") {
- $self->{html}{title_index}++;
- $self->{html}{title}->[$self->{html}{title_index}] = "";
-
- # $self->{html}{title_extra}++ if $self->{html}{title_index} > 0;
+ $self->{title_index}++;
+ $self->{title}->[$self->{title_index}] = "";
}
if ($tag eq "meta" &&
@@ -894,29 +605,47 @@
$attr->{'http-equiv'} =~ /Content-Type/i &&
$attr->{content} =~ /\bcharset\s*=\s*["']?([^"']+)/i)
{
- $self->{html}{charsets} .= exists $self->{html}{charsets} ? " $1" : $1;
+ $self->{charsets} .= exists $self->{charsets} ? " $1" : $1;
}
}
sub examine_text_style {
my ($self, $size, $type) = @_;
$type = lc $type;
- $self->{html}{tiny_font} = 1 if ($type eq "pt" && $size < 4);
- $self->{html}{tiny_font} = 1 if ($type eq "pt" && $size < 4);
- $self->{html}{big_font} = 1 if ($type eq "pt" && $size > 14);
- $self->{html}{big_font} = 1 if ($type eq "px" && $size > 18);
+ $self->put_results(tiny_font => 1) if ($type eq "pt" && $size < 4);
+ $self->put_results(tiny_font => 1) if ($type eq "px" && $size < 4);
+ $self->put_results(big_font => 1) if ($type eq "pt" && $size > 14);
+ $self->put_results(big_font => 1) if ($type eq "px" && $size > 18);
}
sub display_text {
- my ($self) = @_;
-
- for my $type ('text', 'visible_text', 'invisible_text') {
- my $text = $self->{"last_$type"};
+ my $self = shift;
+ my $text = shift;
+ my %display = @_;
+
+ if ($display{whitespace}) {
+ # trim trailing whitespace from previous element if it was not whitespace
+ if (@{ $self->{text} } &&
+ (!defined $self->{text_whitespace} ||
+ !vec($self->{text_whitespace}, $#{$self->{text}}, 1)))
+ {
+ $self->{text}->[-1] =~ s/ $//;
+ }
+ }
+ else {
$text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g;
- $text =~ s/^ //;
- $text =~ s/ $//;
- push @{$self->{"html_$type"}}, $text;
- $self->{"last_$type"} = "";
+ # trim leading whitespace if previous element was whitespace
+ if (@{ $self->{text} } &&
+ defined $self->{text_whitespace} &&
+ vec($self->{text_whitespace}, $#{$self->{text}}, 1))
+ {
+ $text =~ s/^ //;
+ }
+ }
+ push @{ $self->{text} }, $text;
+ while (my ($k, $v) = each %display) {
+ $self->{"text_$k"} ||= '';
+ vec($self->{"text_$k"}, $#{$self->{text}}, 1) = $v;
}
}
@@ -926,104 +655,85 @@
# note: this comes back from HTML::Parser as UTF-8-tainted. Enforce byte
# mode by repacking the string in byte mode, to avoid 'Malformed UTF-8
# character (unexpected non-continuation byte)' warnings
- $text = pack ("C0A*", $text);
+ $text = pack("C0A*", $text);
# text that is not part of body
- if (exists $self->{html}{inside_script} && $self->{html}{inside_script} > 0)
+ if (exists $self->{inside}{script} && $self->{inside}{script} > 0)
{
if ($text =~ /\bon(?:blur|contextmenu|focus|load|resize|submit|unload)\b/i)
{
- $self->{html}{html_event_unsafe} = 1;
+ $self->put_results(html_event_unsafe => 1);
}
- if ($text =~ /\b(?:$events)\b/io) { $self->{html}{html_event} = 1; }
- # if ($text =~ /\.open\s*\(/) { $self->{html}{window_open} = 1; }
return;
}
- if (exists $self->{html}{inside_style} && $self->{html}{inside_style} > 0) {
+ if (exists $self->{inside}{style} && $self->{inside}{style} > 0) {
if ($text =~ /font(?:-size)?:\s*(\d+(?:\.\d*)?|\.\d+)(p[tx])/i) {
- $self->examine_text_style ($1, $2);
+ $self->examine_text_style($1, $2);
}
return;
}
# text that is part of body and also stored separately
- if (exists $self->{html}{inside_a} && $self->{html}{inside_a} > 0) {
- $self->{html}{anchor}->[$self->{html}{anchor_index}] .= $text;
+ if (exists $self->{inside}{a} && $self->{inside}{a} > 0) {
+ $self->{anchor}->[$self->{anchor_index}] .= $text;
}
- if (exists $self->{html}{inside_title} && $self->{html}{inside_title} > 0) {
- $self->{html}{title}->[$self->{html}{title_index}] .= $text;
+ if (exists $self->{inside}{title} && $self->{inside}{title} > 0) {
+ $self->{title}->[$self->{title_index}] .= $text;
}
- my $visible_for_bayes = 1;
+ my $invisible_for_bayes = 0;
if ($text =~ /[^ \t\n\r\f\x0b\xa0]/) {
- $visible_for_bayes = $self->html_font_invisible($text);
- $self->{html}{text_after_body} = 1 if $self->{html}{closed_body};
- $self->{html}{text_after_html} = 1 if $self->{html}{closed_html};
+ $invisible_for_bayes = $self->html_font_invisible($text);
+ $self->put_results(text_after_body => 1) if $self->{closed_body};
+ $self->put_results(text_after_html => 1) if $self->{closed_html};
}
- if ($self->{last_text}) {
+ if (exists $self->{text}->[-1]) {
# ideas discarded since they would be easy to evade:
# 1. using \w or [A-Za-z] instead of \S or non-punctuation
# 2. exempting certain tags
if ($text =~ /^[^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]/s &&
- $self->{last_text} =~ /[^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]\z/s)
+ $self->{text}->[-1] =~ /[^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]\z/s)
{
- $self->{html}{obfuscation}++;
+ $self->{obfuscation}++;
}
- if ($self->{last_text} =~
+ if ($self->{text}->[-1] =~
/\b([^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]{1,7})\z/s)
{
my $start = length($1);
if ($text =~ /^([^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]{1,7})\b/s) {
- my $backhair = $start . "_" . length($1);
- $self->{html}{backhair}->{$backhair}++;
- $self->{html}{backhair_count} = keys %{ $self->{html}{backhair} };
+ $self->{backhair}->{$start . "_" . length($1)}++;
}
}
}
- if ($visible_for_bayes) {
- $self->{last_visible_text} .= $text;
+ if ($invisible_for_bayes) {
+ $self->display_text($text, invisible => 1);
}
else {
- $self->{last_invisible_text} .= $text;
+ $self->display_text($text);
}
- $self->{last_text} .= $text;
}
# note: $text includes <!-- and -->
sub html_comment {
my ($self, $text) = @_;
- push @{ $self->{html}{comment} }, $text;
+ push @{ $self->{comment} }, $text;
- if ($self->{html_last_tag} eq "div" &&
- $text =~ /Converted from text\/plain format/)
- {
- $self->{html}{div_converted} = 1;
- }
- if (exists $self->{html}{inside_script} && $self->{html}{inside_script} > 0)
+ if (exists $self->{inside}{script} && $self->{inside}{script} > 0)
{
- if ($text =~ /\b(?:$events)\b/io)
- {
- $self->{html}{html_event} = 1;
- }
if ($text =~ /\bon(?:blur|contextmenu|focus|load|resize|submit|unload)\b/i)
{
- $self->{html}{html_event_unsafe} = 1;
+ $self->put_results(html_event_unsafe => 1);
}
- # if ($text =~ /\.open\s*\(/) { $self->{html}{window_open} = 1; }
return;
}
-
- if (exists $self->{html}{inside_style} && $self->{html}{inside_style} > 0) {
+ if (exists $self->{inside}{style} && $self->{inside}{style} > 0) {
if ($text =~ /font(?:-size)?:\s*(\d+(?:\.\d*)?|\.\d+)(p[tx])/i) {
- $self->examine_text_style ($1, $2);
+ $self->examine_text_style($1, $2);
}
- }
-
- if (exists $self->{html}{shouting} && $self->{html}{shouting} > 1) {
- $self->{html}{comment_shouting} = 1;
+ return;
}
}
@@ -1033,10 +743,318 @@
if ($text =~ /^<!doctype/i) {
my $tag = "!doctype";
- $self->{html}{elements}++;
- $self->{html}{tags}++;
- $self->{html}{"inside_$tag"} = 0;
+ $self->{elements}++;
+ $self->{tags}++;
+ $self->{inside}{$tag} = 0;
}
+}
+
+###########################################################################
+
+my %html_color = (
+ # HTML 4 defined 16 colors
+ aqua => 0x00ffff,
+ black => 0x000000,
+ blue => 0x0000ff,
+ fuchsia => 0xff00ff,
+ gray => 0x808080,
+ green => 0x008000,
+ lime => 0x00ff00,
+ maroon => 0x800000,
+ navy => 0x000080,
+ olive => 0x808000,
+ purple => 0x800080,
+ red => 0xff0000,
+ silver => 0xc0c0c0,
+ teal => 0x008080,
+ white => 0xffffff,
+ yellow => 0xffff00,
+ # colors specified in CSS3 color module
+ aliceblue => 0xf0f8ff,
+ antiquewhite => 0xfaebd7,
+ aqua => 0x00ffff,
+ aquamarine => 0x7fffd4,
+ azure => 0xf0ffff,
+ beige => 0xf5f5dc,
+ bisque => 0xffe4c4,
+ black => 0x000000,
+ blanchedalmond => 0xffebcd,
+ blue => 0x0000ff,
+ blueviolet => 0x8a2be2,
+ brown => 0xa52a2a,
+ burlywood => 0xdeb887,
+ cadetblue => 0x5f9ea0,
+ chartreuse => 0x7fff00,
+ chocolate => 0xd2691e,
+ coral => 0xff7f50,
+ cornflowerblue => 0x6495ed,
+ cornsilk => 0xfff8dc,
+ crimson => 0xdc143c,
+ cyan => 0x00ffff,
+ darkblue => 0x00008b,
+ darkcyan => 0x008b8b,
+ darkgoldenrod => 0xb8860b,
+ darkgray => 0xa9a9a9,
+ darkgreen => 0x006400,
+ darkgrey => 0xa9a9a9,
+ darkkhaki => 0xbdb76b,
+ darkmagenta => 0x8b008b,
+ darkolivegreen => 0x556b2f,
+ darkorange => 0xff8c00,
+ darkorchid => 0x9932cc,
+ darkred => 0x8b0000,
+ darksalmon => 0xe9967a,
+ darkseagreen => 0x8fbc8f,
+ darkslateblue => 0x483d8b,
+ darkslategray => 0x2f4f4f,
+ darkslategrey => 0x2f4f4f,
+ darkturquoise => 0x00ced1,
+ darkviolet => 0x9400d3,
+ deeppink => 0xff1493,
+ deepskyblue => 0x00bfff,
+ dimgray => 0x696969,
+ dimgrey => 0x696969,
+ dodgerblue => 0x1e90ff,
+ firebrick => 0xb22222,
+ floralwhite => 0xfffaf0,
+ forestgreen => 0x228b22,
+ fuchsia => 0xff00ff,
+ gainsboro => 0xdcdcdc,
+ ghostwhite => 0xf8f8ff,
+ gold => 0xffd700,
+ goldenrod => 0xdaa520,
+ gray => 0x808080,
+ green => 0x008000,
+ greenyellow => 0xadff2f,
+ grey => 0x808080,
+ honeydew => 0xf0fff0,
+ hotpink => 0xff69b4,
+ indianred => 0xcd5c5c,
+ indigo => 0x4b0082,
+ ivory => 0xfffff0,
+ khaki => 0xf0e68c,
+ lavender => 0xe6e6fa,
+ lavenderblush => 0xfff0f5,
+ lawngreen => 0x7cfc00,
+ lemonchiffon => 0xfffacd,
+ lightblue => 0xadd8e6,
+ lightcoral => 0xf08080,
+ lightcyan => 0xe0ffff,
+ lightgoldenrodyellow => 0xfafad2,
+ lightgray => 0xd3d3d3,
+ lightgreen => 0x90ee90,
+ lightgrey => 0xd3d3d3,
+ lightpink => 0xffb6c1,
+ lightsalmon => 0xffa07a,
+ lightseagreen => 0x20b2aa,
+ lightskyblue => 0x87cefa,
+ lightslategray => 0x778899,
+ lightslategrey => 0x778899,
+ lightsteelblue => 0xb0c4de,
+ lightyellow => 0xffffe0,
+ lime => 0x00ff00,
+ limegreen => 0x32cd32,
+ linen => 0xfaf0e6,
+ magenta => 0xff00ff,
+ maroon => 0x800000,
+ mediumaquamarine => 0x66cdaa,
+ mediumblue => 0x0000cd,
+ mediumorchid => 0xba55d3,
+ mediumpurple => 0x9370db,
+ mediumseagreen => 0x3cb371,
+ mediumslateblue => 0x7b68ee,
+ mediumspringgreen => 0x00fa9a,
+ mediumturquoise => 0x48d1cc,
+ mediumvioletred => 0xc71585,
+ midnightblue => 0x191970,
+ mintcream => 0xf5fffa,
+ mistyrose => 0xffe4e1,
+ moccasin => 0xffe4b5,
+ navajowhite => 0xffdead,
+ navy => 0x000080,
+ oldlace => 0xfdf5e6,
+ olive => 0x808000,
+ olivedrab => 0x6b8e23,
+ orange => 0xffa500,
+ orangered => 0xff4500,
+ orchid => 0xda70d6,
+ palegoldenrod => 0xeee8aa,
+ palegreen => 0x98fb98,
+ paleturquoise => 0xafeeee,
+ palevioletred => 0xdb7093,
+ papayawhip => 0xffefd5,
+ peachpuff => 0xffdab9,
+ peru => 0xcd853f,
+ pink => 0xffc0cb,
+ plum => 0xdda0dd,
+ powderblue => 0xb0e0e6,
+ purple => 0x800080,
+ red => 0xff0000,
+ rosybrown => 0xbc8f8f,
+ royalblue => 0x4169e1,
+ saddlebrown => 0x8b4513,
+ salmon => 0xfa8072,
+ sandybrown => 0xf4a460,
+ seagreen => 0x2e8b57,
+ seashell => 0xfff5ee,
+ sienna => 0xa0522d,
+ silver => 0xc0c0c0,
+ skyblue => 0x87ceeb,
+ slateblue => 0x6a5acd,
+ slategray => 0x708090,
+ slategrey => 0x708090,
+ snow => 0xfffafa,
+ springgreen => 0x00ff7f,
+ steelblue => 0x4682b4,
+ tan => 0xd2b48c,
+ teal => 0x008080,
+ thistle => 0xd8bfd8,
+ tomato => 0xff6347,
+ turquoise => 0x40e0d0,
+ violet => 0xee82ee,
+ wheat => 0xf5deb3,
+ white => 0xffffff,
+ whitesmoke => 0xf5f5f5,
+ yellow => 0xffff00,
+ yellowgreen => 0x9acd32,
+);
+
+sub _name_to_rgb {
+ my $color = lc $_[0];
+ if (my $hex = $html_color{$color}) {
+ return sprintf("#%06x", $hex);
+ }
+ return $color;
+}
+
+use constant URI_STRICT => 0;
+
+# resolving relative URIs as defined in RFC 2396 (steps from section 5.2)
+# using draft http://www.gbiv.com/protocols/uri/rev-2002/rfc2396bis.html
+sub _parse_uri {
+ my ($u) = @_;
+ my %u;
+ ($u{scheme}, $u{authority}, $u{path}, $u{query}, $u{fragment}) =
+ $u =~ m|^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
+ return %u;
+}
+
+sub _remove_dot_segments {
+ my ($input) = @_;
+ my $output = "";
+
+ $input =~ s@^(?:\.\.?/)@/@;
+
+ while ($input) {
+ if ($input =~ s@^/\.(?:$|/)@/@) {
+ }
+ elsif ($input =~ s@^/\.\.(?:$|/)@/@) {
+ $output =~ s@/?[^/]*$@@;
+ }
+ elsif ($input =~ s@(/?[^/]*)@@) {
+ $output .= $1;
+ }
+ }
+ return $output;
+}
+
+sub _merge_uri {
+ my ($base_authority, $base_path, $r_path) = @_;
+
+ if (defined $base_authority && !$base_path) {
+ return "/" . $r_path;
+ }
+ else {
+ if ($base_path =~ m|/|) {
+ $base_path =~ s|(?<=/)[^/]*$||;
+ }
+ else {
+ $base_path = "";
+ }
+ return $base_path . $r_path;
+ }
+}
+
+sub target_uri {
+ my ($base, $r) = @_;
+
+ my %r = _parse_uri($r); # parsed relative URI
+ my %base = _parse_uri($base); # parsed base URI
+ my %t; # generated temporary URI
+
+ if ((not URI_STRICT) and
+ (defined $r{scheme} && defined $base{scheme}) and
+ ($r{scheme} eq $base{scheme}))
+ {
+ undef $r{scheme};
+ }
+
+ if (defined $r{scheme}) {
+ $t{scheme} = $r{scheme};
+ $t{authority} = $r{authority};
+ $t{path} = _remove_dot_segments($r{path});
+ $t{query} = $r{query};
+ }
+ else {
+ if (defined $r{authority}) {
+ $t{authority} = $r{authority};
+ $t{path} = _remove_dot_segments($r{path});
+ $t{query} = $r{query};
+ }
+ else {
+ if ($r{path} eq "") {
+ $t{path} = $base{path};
+ if (defined $r{query}) {
+ $t{query} = $r{query};
+ }
+ else {
+ $t{query} = $base{query};
+ }
+ }
+ else {
+ if ($r{path} =~ m|^/|) {
+ $t{path} = _remove_dot_segments($r{path});
+ }
+ else {
+ $t{path} = _merge_uri($base{authority}, $base{path}, $r{path});
+ $t{path} = _remove_dot_segments($t{path});
+ }
+ $t{query} = $r{query};
+ }
+ $t{authority} = $base{authority};
+ }
+ $t{scheme} = $base{scheme};
+ }
+ $t{fragment} = $r{fragment};
+
+ # recompose URI
+ my $result = "";
+ if ($t{scheme}) {
+ $result .= $t{scheme} . ":";
+ }
+ elsif (defined $t{authority}) {
+ # this block is not part of the RFC
+ # TODO: figure out what MUAs actually do with unschemed URIs
+ # maybe look at URI::Heuristic
+ if ($t{authority} =~ /^www\d*\./i) {
+ # some spammers are using unschemed URIs to escape filters
+ $result .= "http:";
+ }
+ elsif ($t{authority} =~ /^ftp\d*\./i) {
+ $result .= "ftp:";
+ }
+ }
+ if ($t{authority}) {
+ $result .= "//" . $t{authority};
+ }
+ $result .= $t{path};
+ if ($t{query}) {
+ $result .= "?" . $t{query};
+ }
+ if ($t{fragment}) {
+ $result .= "#" . $t{fragment};
+ }
+ return $result;
}
1;
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Message/Node.pm
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Message/Node.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Message/Node.pm Tue Oct 19 14:39:34 2004
@@ -321,18 +321,20 @@
}
# Look at a text scalar and determine whether it should be rendered
-# as text/html. Based on a heuristic which simulates a certain
-# well-used/common mail client.
+# as text/html.
#
-# We don't need to advertise this in the POD doc.
+# This is not a public function.
#
-sub _html_near_start {
- my ($pad) = @_;
-
- my $count = 0;
- $count += ($pad =~ tr/\n//d) * 2;
- $count += ($pad =~ tr/\n//cd);
- return ($count < 24);
+sub _html_render {
+ if ($_[0] =~ m/^(.{0,18}?<(?:body|head|html|img|pre|table|title)(?:\s.{0,18}?)?>)/is)
+ {
+ my $pad = $1;
+ my $count = 0;
+ $count += ($pad =~ tr/\n//d) * 2;
+ $count += ($pad =~ tr/\n//cd);
+ return ($count < 24);
+ }
+ return 0;
}
=item rendered()
@@ -351,68 +353,35 @@
# We don't render anything except text
return(undef,undef) unless ( $self->{'type'} =~ /^text\b/i );
- if ( !exists $self->{rendered} ) {
+ if (!exists $self->{rendered}) {
my $text = $self->decode();
my $raw = length($text);
# render text/html always, or any other text|text/plain part as text/html
# based on a heuristic which simulates a certain common mail client
- if ( $raw > 0 && (
- $self->{'type'} =~ m@^text/html\b@i || (
- $self->{'type'} =~ m@^text(?:$|/plain)@i &&
- $text =~ m/^(.{0,18}?<(?:$Mail::SpamAssassin::HTML::re_start)(?:\s.{0,18}?)?>)/ois &&
- _html_near_start($1))
- )
- )
+ if ($raw > 0 && ($self->{'type'} =~ m@^text/html\b@i ||
+ ($self->{'type'} =~ m@^text(?:$|/plain)@i &&
+ _html_render(substr($text, 0, 23)))))
{
- $self->{'rendered_type'} = 'text/html';
- my $html = Mail::SpamAssassin::HTML->new(); # object
- my @lines = @{$html->html_render($text)};
- $self->{rendered} = join('', @lines);
- $self->{html_results} = $html->get_results(); # needed in eval tests
-
- # the visible text parts of the message; all invisible or low-contrast
- # text removed. TODO: wonder if we should just replace
- # $self->{rendered} with this?
- $self->{invisible_rendered} = join('',
- @{$html->{html_invisible_text}});
- $self->{visible_rendered} = join('',
- @{$html->{html_visible_text}});
-
- # some tests done after rendering
- my $r = $self->{html_results}; # temporary reference for brevity
- $r->{html_message} = 1;
- $r->{html_length} = 0;
- my $space = 0;
- for my $line (@lines) {
- $line = pack ('C0A*', $line);
- $space += ($line =~ tr/ \t\n\r\x0b\xa0/ \t\n\r\x0b\xa0/);
- $r->{html_length} += length($line);
- }
+ $self->{rendered_type} = 'text/html';
+
+ my $html = Mail::SpamAssassin::HTML->new(); # object
+ $html->parse($text); # parse+render text
+ $self->{rendered} = $html->get_rendered_text();
+ $self->{visible_rendered} = $html->get_rendered_text(invisible => 0);
+ $self->{invisible_rendered} = $html->get_rendered_text(invisible => 1);
+ $self->{html_results} = $html->get_results();
+
+ # end-of-document result values that require looking at the text
+ my $r = $self->{html_results}; # temporary reference for brevity
+ my $space = ($self->{rendered} =~ tr/ \t\n\r\x0b\xa0/ \t\n\r\x0b\xa0/);
+ $r->{html_length} = length($self->{rendered});
$r->{non_space_len} = $r->{html_length} - $space;
$r->{ratio} = ($raw - $r->{html_length}) / $raw;
- if (exists $r->{elements} && exists $r->{tags}) {
- $r->{bad_tag_ratio} = ($r->{tags} - $r->{elements}) / $r->{tags};
- }
- if (exists $r->{elements_seen} && exists $r->{tags_seen}) {
- $r->{non_element_ratio} =
- ($r->{tags_seen} - $r->{elements_seen}) / $r->{tags_seen};
- }
- if (exists $r->{tags} && exists $r->{obfuscation}) {
- $r->{obfuscation_ratio} = $r->{obfuscation} / $r->{tags};
- }
- if (exists $r->{attr_bad} && exists $r->{attr_all}) {
- $r->{attr_bad} = $r->{attr_bad} / $r->{attr_all};
- }
- if (exists $r->{attr_unique_bad} && exists $r->{attr_unique_all}) {
- $r->{attr_unique_bad} = $r->{attr_unique_bad} / $r->{attr_unique_all};
- }
}
else {
$self->{rendered_type} = $self->{type};
$self->{rendered} = $text;
- $self->{invisible_rendered} = '';
- $self->{visible_rendered} = $text;
}
}
Modified: spamassassin/trunk/rules/20_html_tests.cf
==============================================================================
--- spamassassin/trunk/rules/20_html_tests.cf (original)
+++ spamassassin/trunk/rules/20_html_tests.cf Tue Oct 19 14:39:34 2004
@@ -29,7 +29,7 @@
# please sort these by eval type then name
# HTML control test, HTML spam rules should all have better S/O than this
-body HTML_MESSAGE eval:html_test('html_message')
+body HTML_MESSAGE eval:html_test('html')
describe HTML_MESSAGE HTML included in message
# the HTML percentage range