You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by qu...@apache.org on 2004/01/27 23:50:12 UTC

svn commit: rev 6329 - in incubator/spamassassin/trunk: lib/Mail/SpamAssassin rules

Author: quinlan
Date: Tue Jan 27 14:50:11 2004
New Revision: 6329

Modified:
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
   incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf
Log:
bug 2211: tests to detect invalid HTML tags


Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm	Tue Jan 27 14:50:11 2004
@@ -30,7 +30,7 @@
 my @EXPORT_OK = qw();
 
 use HTML::Parser 3.24 ();
-use vars qw($re_start $re_loose $re_strict);
+use vars qw($re_start $re_loose $re_strict $re_other);
 
 # elements that trigger HTML rendering in text/plain in some mail clients
 # (repeats ones listed in $re_strict)
@@ -43,6 +43,9 @@
 # loose list of HTML events
 my $events = 'on(?:activate|afterupdate|beforeactivate|beforecopy|beforecut|beforedeactivate|beforeeditfocus|beforepaste|beforeupdate|blur|change|click|contextmenu|controlselect|copy|cut|dblclick|deactivate|errorupdate|focus|focusin|focusout|help|keydown|keypress|keyup|load|losecapture|mousedown|mouseenter|mouseleave|mousemove|mouseout|mouseover|mouseup|mousewheel|move|moveend|movestart|paste|propertychange|readystatechange|reset|resize|resizeend|resizestart|select|submit|timeerror|unload)';
 
+# other non-standard tags
+$re_other = 'o:\w+/?|x-sigsep|x-tab';
+
 my %tested_colors;
 
 sub new {
@@ -146,10 +149,15 @@
 sub html_tag {
   my ($self, $tag, $attr, $num) = @_;
 
-  $self->{html}{"inside_$tag"} += $num;
-
-  $self->{html}{elements}++ if $tag =~ /^(?:$re_strict|$re_loose)$/io;
+  if ($tag =~ /^(?:$re_strict|$re_loose|$re_other)$/io) {
+    $self->{html}{elements}++;
+    $self->{html}{elements_seen}++ if !exists $self->{html}{"inside_$tag"};
+  }
   $self->{html}{tags}++;
+  $self->{html}{tags_seen}++ if !exists $self->{html}{"inside_$tag"};
+
+  $self->{html}{"inside_$tag"} += $num;
+  $self->{html}{"inside_$tag"} = 0 if $self->{html}{"inside_$tag"} < 0;
 
   if ($tag =~ /^(?:body|table|tr|th|td)$/) {
     $self->html_bgcolor($tag, $attr, $num);

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm	Tue Jan 27 14:50:11 2004
@@ -293,6 +293,12 @@
       if (exists $self->{html_results}{total_comment_length} && $self->{html_results}{non_uri_len} > 0) {
         $self->{html_results}{total_comment_ratio} = $self->{html_results}{total_comment_length} / $self->{html_results}{non_uri_len};
       }
+      if (exists $self->{html_results}{tags}) {
+	$self->{html_results}{t_bad_tag_ratio} = ($self->{html_results}{tags} - $self->{html_results}{elements}) / $self->{html_results}{tags};
+	$self->{html_results}{t_bad_tag_count} = ($self->{html_results}{tags} - $self->{html_results}{elements});
+	$self->{html_results}{t_bad_tag_unique_ratio} = ($self->{html_results}{tags_seen} - $self->{html_results}{elements_seen}) / $self->{html_results}{tags_seen};
+	$self->{html_results}{t_bad_tag_unique_count} = ($self->{html_results}{tags_seen} - $self->{html_results}{elements_seen});
+      }
     }
     else {
       $self->{'rendered_type'} = $self->{'type'};

Modified: incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf
==============================================================================
--- incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf	(original)
+++ incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf	Tue Jan 27 14:50:11 2004
@@ -317,6 +317,46 @@
 body T_HTML_FONT_TINY		eval:html_test('t_tiny_font')
 describe T_HTML_FONT_TINY	HTML has a tiny font
 
+# HTML bad tag ratio: (tags - recognized elements) / tags, in 0.10-wide buckets
+body T_HTML_BADTAGS_00_10	eval:html_range('t_bad_tag_ratio','0.00','0.10')
+body T_HTML_BADTAGS_10_20	eval:html_range('t_bad_tag_ratio','0.10','0.20')
+body T_HTML_BADTAGS_20_30	eval:html_range('t_bad_tag_ratio','0.20','0.30')
+body T_HTML_BADTAGS_30_40	eval:html_range('t_bad_tag_ratio','0.30','0.40')
+body T_HTML_BADTAGS_40_50	eval:html_range('t_bad_tag_ratio','0.40','0.50')
+body T_HTML_BADTAGS_50_60	eval:html_range('t_bad_tag_ratio','0.50','0.60')
+body T_HTML_BADTAGS_60_70	eval:html_range('t_bad_tag_ratio','0.60','0.70')
+body T_HTML_BADTAGS_70_80	eval:html_range('t_bad_tag_ratio','0.70','0.80')
+body T_HTML_BADTAGS_80_90	eval:html_range('t_bad_tag_ratio','0.80','0.90')
+body T_HTML_BADTAGS_90_100	eval:html_range('t_bad_tag_ratio','0.90','1.00')
+
+# HTML bad tag count
+body T_HTML_BADTAGS_GT_0	eval:html_range('t_bad_tag_count','0','4')
+body T_HTML_BADTAGS_GT_4	eval:html_range('t_bad_tag_count','4','8')
+body T_HTML_BADTAGS_GT_8	eval:html_range('t_bad_tag_count','8','16')
+body T_HTML_BADTAGS_GT_16	eval:html_range('t_bad_tag_count','16','32')
+body T_HTML_BADTAGS_GT_32	eval:html_range('t_bad_tag_count','32','64')
+body T_HTML_BADTAGS_GT_64	eval:html_range('t_bad_tag_count','64','inf')
+
+# HTML bad tag ratio counting each distinct tag name once, in 0.10-wide buckets
+body T_HTML_BADTAGS_U_00_10	eval:html_range('t_bad_tag_unique_ratio','0.00','0.10')
+body T_HTML_BADTAGS_U_10_20	eval:html_range('t_bad_tag_unique_ratio','0.10','0.20')
+body T_HTML_BADTAGS_U_20_30	eval:html_range('t_bad_tag_unique_ratio','0.20','0.30')
+body T_HTML_BADTAGS_U_30_40	eval:html_range('t_bad_tag_unique_ratio','0.30','0.40')
+body T_HTML_BADTAGS_U_40_50	eval:html_range('t_bad_tag_unique_ratio','0.40','0.50')
+body T_HTML_BADTAGS_U_50_60	eval:html_range('t_bad_tag_unique_ratio','0.50','0.60')
+body T_HTML_BADTAGS_U_60_70	eval:html_range('t_bad_tag_unique_ratio','0.60','0.70')
+body T_HTML_BADTAGS_U_70_80	eval:html_range('t_bad_tag_unique_ratio','0.70','0.80')
+body T_HTML_BADTAGS_U_80_90	eval:html_range('t_bad_tag_unique_ratio','0.80','0.90')
+body T_HTML_BADTAGS_U_90_100	eval:html_range('t_bad_tag_unique_ratio','0.90','1.00')
+
+# HTML bad tag count for unique tags
+body T_HTML_BADTAGS_U_GT_0	eval:html_range('t_bad_tag_unique_count','0','4')
+body T_HTML_BADTAGS_U_GT_4	eval:html_range('t_bad_tag_unique_count','4','8')
+body T_HTML_BADTAGS_U_GT_8	eval:html_range('t_bad_tag_unique_count','8','16')
+body T_HTML_BADTAGS_U_GT_16	eval:html_range('t_bad_tag_unique_count','16','32')
+body T_HTML_BADTAGS_U_GT_32	eval:html_range('t_bad_tag_unique_count','32','64')
+body T_HTML_BADTAGS_U_GT_64	eval:html_range('t_bad_tag_unique_count','64','inf')
+
 # more portable replacement for RCVD_NUMERIC_HELO that doesn't rely on
 # Received headers using "helo=" prefix
 header T_RCVD_NUMERIC_HELO	X-Spam-Relays-Untrusted =~ / helo=\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} /