You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by qu...@apache.org on 2004/02/03 06:54:48 UTC

svn commit: rev 6440 - in incubator/spamassassin/trunk: lib/Mail/SpamAssassin rules

Author: quinlan
Date: Mon Feb  2 21:54:48 2004
New Revision: 6440

Modified:
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
   incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf
Log:
add parser-based HTML obfuscation tests and a few tweaks to HTML rendering


Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm	Mon Feb  2 21:54:48 2004
@@ -190,10 +190,10 @@
   if ($tag eq "br") {
     push @{$self->{html_text}}, "\n";
   }
-  elsif ($tag eq "li" || $tag eq "td") {
+  elsif ($tag eq "li" || $tag eq "td" || $tag eq "dd") {
     push @{$self->{html_text}}, " ";
   }
-  elsif ($tag eq "p" || $tag eq "hr") {
+  elsif ($tag =~ /^(?:p|hr|blockquote|pre)$/) {
     push @{$self->{html_text}}, "\n\n";
   }
   elsif ($tag eq "img" && exists $attr->{alt} && $attr->{alt} ne "") {
@@ -691,6 +691,25 @@
   $self->html_font_invisible($text) if $text =~ /[^ \t\n\r\f\x0b\xa0]/;
 
   $text =~ s/^\n//s if $self->{html_last_tag} eq "br";
+
+  if (defined $self->{html_text}[-1]) {
+    my $before = $self->{html_text}[-1];
+    if ($before =~ /\S$/s && $text =~ /^\S/s) {
+      $self->{html}{t_obfu_nspc}++;
+      $self->{html}{t_obfu_nfmt}++ if $self->{html_last_tag} !~ /^(?:strong|b|em|font|a|u|span|sup|i)/;
+    }
+    else {
+      $self->{html}{t_nonobfu_nspc}++;
+    }
+    if ($before =~ /\w$/s && $text =~ /^\w/s) {
+      $self->{html}{t_obfu_word}++;
+      $self->{html}{t_obfu_wfmt}++ if $self->{html_last_tag} !~ /^(?:strong|b|em|font|a|u|span|sup|i)/;
+    }
+    else {
+      $self->{html}{t_nonobfu_word}++;
+    }
+  }
+
   push @{$self->{html_text}}, $text;
 }
 

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm	Mon Feb  2 21:54:48 2004
@@ -303,6 +303,18 @@
 	$self->{html_results}{t_bad_tag_unique_ratio} = ($self->{html_results}{tags_seen} - $self->{html_results}{elements_seen}) / $self->{html_results}{tags_seen};
 	$self->{html_results}{t_bad_tag_unique_count} = ($self->{html_results}{tags_seen} - $self->{html_results}{elements_seen});
       }
+      if (exists $self->{html_results}{tags}) {
+	$self->{html_results}{t_obfu_nspc_ratio} = $self->{html_results}{t_obfu_nspc} / $self->{html_results}{tags} if defined $self->{html_results}{t_obfu_nspc};
+	$self->{html_results}{t_obfu_word_ratio} = $self->{html_results}{t_obfu_word} / $self->{html_results}{tags} if defined $self->{html_results}{t_obfu_word};
+	$self->{html_results}{t_obfu_nfmt_ratio} = $self->{html_results}{t_obfu_nfmt} / $self->{html_results}{tags} if defined $self->{html_results}{t_obfu_nfmt};
+	$self->{html_results}{t_obfu_wfmt_ratio} = $self->{html_results}{t_obfu_wfmt} / $self->{html_results}{tags} if defined $self->{html_results}{t_obfu_wfmt};
+      }
+      if (exists $self->{html_results}{t_obfu_word} && exists $self->{html_results}{t_nonobfu_word}) {
+	$self->{html_results}{t_obfu_word2_ratio} = $self->{html_results}{t_obfu_word} / ($self->{html_results}{t_obfu_word} + $self->{html_results}{t_nonobfu_word});
+      }
+      if (exists $self->{html_results}{t_obfu_nspc} && exists $self->{html_results}{t_nonobfu_nspc}) {
+	$self->{html_results}{t_obfu_nspc2_ratio} = $self->{html_results}{t_obfu_nspc} / ($self->{html_results}{t_obfu_nspc} + $self->{html_results}{t_nonobfu_nspc});
+      }
     }
     else {
       $self->{'rendered_type'} = $self->{'type'};

Modified: incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf
==============================================================================
--- incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf	(original)
+++ incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf	Mon Feb  2 21:54:48 2004
@@ -474,3 +474,70 @@
 describe T_MSGID_SPAM_3_6	Message-ID has known spammer pattern
 header T_MSGID_SPAM_3_7		Message-Id =~ /<[a-z]{7,}\@(\S+\.)+\S+>/
 describe T_MSGID_SPAM_3_7	Message-ID has known spammer pattern
+
+# parser-based HTML obfuscation
+body T_HTML_OBFU_WORD_00_10	eval:html_range('t_obfu_word_ratio','0.00','0.10')
+body T_HTML_OBFU_WORD_10_20	eval:html_range('t_obfu_word_ratio','0.10','0.20')
+body T_HTML_OBFU_WORD_20_30	eval:html_range('t_obfu_word_ratio','0.20','0.30')
+body T_HTML_OBFU_WORD_30_40	eval:html_range('t_obfu_word_ratio','0.30','0.40')
+body T_HTML_OBFU_WORD_40_50	eval:html_range('t_obfu_word_ratio','0.40','0.50')
+body T_HTML_OBFU_WORD_50_60	eval:html_range('t_obfu_word_ratio','0.50','0.60')
+body T_HTML_OBFU_WORD_60_70	eval:html_range('t_obfu_word_ratio','0.60','0.70')
+body T_HTML_OBFU_WORD_70_80	eval:html_range('t_obfu_word_ratio','0.70','0.80')
+body T_HTML_OBFU_WORD_80_90	eval:html_range('t_obfu_word_ratio','0.80','0.90')
+body T_HTML_OBFU_WORD_90_100	eval:html_range('t_obfu_word_ratio','0.90','1.00')
+
+body T_HTML_OBFU_NSPC_00_10	eval:html_range('t_obfu_nspc_ratio','0.00','0.10')
+body T_HTML_OBFU_NSPC_10_20	eval:html_range('t_obfu_nspc_ratio','0.10','0.20')
+body T_HTML_OBFU_NSPC_20_30	eval:html_range('t_obfu_nspc_ratio','0.20','0.30')
+body T_HTML_OBFU_NSPC_30_40	eval:html_range('t_obfu_nspc_ratio','0.30','0.40')
+body T_HTML_OBFU_NSPC_40_50	eval:html_range('t_obfu_nspc_ratio','0.40','0.50')
+body T_HTML_OBFU_NSPC_50_60	eval:html_range('t_obfu_nspc_ratio','0.50','0.60')
+body T_HTML_OBFU_NSPC_60_70	eval:html_range('t_obfu_nspc_ratio','0.60','0.70')
+body T_HTML_OBFU_NSPC_70_80	eval:html_range('t_obfu_nspc_ratio','0.70','0.80')
+body T_HTML_OBFU_NSPC_80_90	eval:html_range('t_obfu_nspc_ratio','0.80','0.90')
+body T_HTML_OBFU_NSPC_90_100	eval:html_range('t_obfu_nspc_ratio','0.90','1.00')
+
+body T_HTML_OBFU_WFMT_00_10	eval:html_range('t_obfu_wfmt_ratio','0.00','0.10')
+body T_HTML_OBFU_WFMT_10_20	eval:html_range('t_obfu_wfmt_ratio','0.10','0.20')
+body T_HTML_OBFU_WFMT_20_30	eval:html_range('t_obfu_wfmt_ratio','0.20','0.30')
+body T_HTML_OBFU_WFMT_30_40	eval:html_range('t_obfu_wfmt_ratio','0.30','0.40')
+body T_HTML_OBFU_WFMT_40_50	eval:html_range('t_obfu_wfmt_ratio','0.40','0.50')
+body T_HTML_OBFU_WFMT_50_60	eval:html_range('t_obfu_wfmt_ratio','0.50','0.60')
+body T_HTML_OBFU_WFMT_60_70	eval:html_range('t_obfu_wfmt_ratio','0.60','0.70')
+body T_HTML_OBFU_WFMT_70_80	eval:html_range('t_obfu_wfmt_ratio','0.70','0.80')
+body T_HTML_OBFU_WFMT_80_90	eval:html_range('t_obfu_wfmt_ratio','0.80','0.90')
+body T_HTML_OBFU_WFMT_90_100	eval:html_range('t_obfu_wfmt_ratio','0.90','1.00')
+
+body T_HTML_OBFU_NFMT_00_10	eval:html_range('t_obfu_nfmt_ratio','0.00','0.10')
+body T_HTML_OBFU_NFMT_10_20	eval:html_range('t_obfu_nfmt_ratio','0.10','0.20')
+body T_HTML_OBFU_NFMT_20_30	eval:html_range('t_obfu_nfmt_ratio','0.20','0.30')
+body T_HTML_OBFU_NFMT_30_40	eval:html_range('t_obfu_nfmt_ratio','0.30','0.40')
+body T_HTML_OBFU_NFMT_40_50	eval:html_range('t_obfu_nfmt_ratio','0.40','0.50')
+body T_HTML_OBFU_NFMT_50_60	eval:html_range('t_obfu_nfmt_ratio','0.50','0.60')
+body T_HTML_OBFU_NFMT_60_70	eval:html_range('t_obfu_nfmt_ratio','0.60','0.70')
+body T_HTML_OBFU_NFMT_70_80	eval:html_range('t_obfu_nfmt_ratio','0.70','0.80')
+body T_HTML_OBFU_NFMT_80_90	eval:html_range('t_obfu_nfmt_ratio','0.80','0.90')
+body T_HTML_OBFU_NFMT_90_100	eval:html_range('t_obfu_nfmt_ratio','0.90','1.00')
+
+body T_HTML_OBFU_WORD2_00_10	eval:html_range('t_obfu_word2_ratio','0.00','0.10')
+body T_HTML_OBFU_WORD2_10_20	eval:html_range('t_obfu_word2_ratio','0.10','0.20')
+body T_HTML_OBFU_WORD2_20_30	eval:html_range('t_obfu_word2_ratio','0.20','0.30')
+body T_HTML_OBFU_WORD2_30_40	eval:html_range('t_obfu_word2_ratio','0.30','0.40')
+body T_HTML_OBFU_WORD2_40_50	eval:html_range('t_obfu_word2_ratio','0.40','0.50')
+body T_HTML_OBFU_WORD2_50_60	eval:html_range('t_obfu_word2_ratio','0.50','0.60')
+body T_HTML_OBFU_WORD2_60_70	eval:html_range('t_obfu_word2_ratio','0.60','0.70')
+body T_HTML_OBFU_WORD2_70_80	eval:html_range('t_obfu_word2_ratio','0.70','0.80')
+body T_HTML_OBFU_WORD2_80_90	eval:html_range('t_obfu_word2_ratio','0.80','0.90')
+body T_HTML_OBFU_WORD2_90_100	eval:html_range('t_obfu_word2_ratio','0.90','1.00')
+
+body T_HTML_OBFU_NSPC2_00_10	eval:html_range('t_obfu_nspc2_ratio','0.00','0.10')
+body T_HTML_OBFU_NSPC2_10_20	eval:html_range('t_obfu_nspc2_ratio','0.10','0.20')
+body T_HTML_OBFU_NSPC2_20_30	eval:html_range('t_obfu_nspc2_ratio','0.20','0.30')
+body T_HTML_OBFU_NSPC2_30_40	eval:html_range('t_obfu_nspc2_ratio','0.30','0.40')
+body T_HTML_OBFU_NSPC2_40_50	eval:html_range('t_obfu_nspc2_ratio','0.40','0.50')
+body T_HTML_OBFU_NSPC2_50_60	eval:html_range('t_obfu_nspc2_ratio','0.50','0.60')
+body T_HTML_OBFU_NSPC2_60_70	eval:html_range('t_obfu_nspc2_ratio','0.60','0.70')
+body T_HTML_OBFU_NSPC2_70_80	eval:html_range('t_obfu_nspc2_ratio','0.70','0.80')
+body T_HTML_OBFU_NSPC2_80_90	eval:html_range('t_obfu_nspc2_ratio','0.80','0.90')
+body T_HTML_OBFU_NSPC2_90_100	eval:html_range('t_obfu_nspc2_ratio','0.90','1.00')