You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by qu...@apache.org on 2004/02/03 06:54:48 UTC
svn commit: rev 6440 - in incubator/spamassassin/trunk: lib/Mail/SpamAssassin rules
Author: quinlan
Date: Mon Feb 2 21:54:48 2004
New Revision: 6440
Modified:
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf
Log:
add parser-based HTML obfuscation tests and a few tweaks to HTML rendering
Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm Mon Feb 2 21:54:48 2004
@@ -190,10 +190,10 @@
if ($tag eq "br") {
push @{$self->{html_text}}, "\n";
}
- elsif ($tag eq "li" || $tag eq "td") {
+ elsif ($tag eq "li" || $tag eq "td" || $tag eq "dd") {
push @{$self->{html_text}}, " ";
}
- elsif ($tag eq "p" || $tag eq "hr") {
+ elsif ($tag =~ /^(?:p|hr|blockquote|pre)$/) {
push @{$self->{html_text}}, "\n\n";
}
elsif ($tag eq "img" && exists $attr->{alt} && $attr->{alt} ne "") {
@@ -691,6 +691,25 @@
$self->html_font_invisible($text) if $text =~ /[^ \t\n\r\f\x0b\xa0]/;
$text =~ s/^\n//s if $self->{html_last_tag} eq "br";
+
+ if (defined $self->{html_text}[-1]) {   # compare against the previously rendered text chunk, if any
+   my $before = $self->{html_text}[-1];   # last chunk pushed onto html_text
+   if ($before =~ /\S$/s && $text =~ /^\S/s) {   # non-space on both sides of the join
+     $self->{html}{t_obfu_nspc}++;
+     $self->{html}{t_obfu_nfmt}++ if $self->{html_last_tag} !~ /^(?:strong|b|em|font|a|u|span|sup|i)$/;   # "$" anchor: without it "br", "img", "ul", ... matched by prefix
+   }
+   else {
+     $self->{html}{t_nonobfu_nspc}++;
+   }
+   if ($before =~ /\w$/s && $text =~ /^\w/s) {   # word character on both sides of the join
+     $self->{html}{t_obfu_word}++;
+     $self->{html}{t_obfu_wfmt}++ if $self->{html_last_tag} !~ /^(?:strong|b|em|font|a|u|span|sup|i)$/;   # same exact-match tag list as above
+   }
+   else {
+     $self->{html}{t_nonobfu_word}++;
+   }
+ }
+
push @{$self->{html_text}}, $text;
}
Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm Mon Feb 2 21:54:48 2004
@@ -303,6 +303,18 @@
$self->{html_results}{t_bad_tag_unique_ratio} = ($self->{html_results}{tags_seen} - $self->{html_results}{elements_seen}) / $self->{html_results}{tags_seen};
$self->{html_results}{t_bad_tag_unique_count} = ($self->{html_results}{tags_seen} - $self->{html_results}{elements_seen});
}
+ if (exists $self->{html_results}{tags}) {   # per-tag ratios: each obfuscation counter divided by the "tags" value
+   $self->{html_results}{t_obfu_nspc_ratio} = $self->{html_results}{t_obfu_nspc} / $self->{html_results}{tags} if defined $self->{html_results}{t_obfu_nspc};
+   $self->{html_results}{t_obfu_word_ratio} = $self->{html_results}{t_obfu_word} / $self->{html_results}{tags} if defined $self->{html_results}{t_obfu_word};
+   $self->{html_results}{t_obfu_nfmt_ratio} = $self->{html_results}{t_obfu_nfmt} / $self->{html_results}{tags} if defined $self->{html_results}{t_obfu_nfmt};
+   $self->{html_results}{t_obfu_wfmt_ratio} = $self->{html_results}{t_obfu_wfmt} / $self->{html_results}{tags} if defined $self->{html_results}{t_obfu_wfmt};
+ }
+ if (exists $self->{html_results}{t_obfu_word} && ($self->{html_results}{t_obfu_word} || $self->{html_results}{t_nonobfu_word})) {   # nonobfu key may be absent when every join was obfuscated; also guards a zero denominator
+   $self->{html_results}{t_obfu_word2_ratio} = $self->{html_results}{t_obfu_word} / ($self->{html_results}{t_obfu_word} + ($self->{html_results}{t_nonobfu_word} || 0));
+ }
+ if (exists $self->{html_results}{t_obfu_nspc} && ($self->{html_results}{t_obfu_nspc} || $self->{html_results}{t_nonobfu_nspc})) {   # same rule for the non-space counters
+   $self->{html_results}{t_obfu_nspc2_ratio} = $self->{html_results}{t_obfu_nspc} / ($self->{html_results}{t_obfu_nspc} + ($self->{html_results}{t_nonobfu_nspc} || 0));
+ }
}
else {
$self->{'rendered_type'} = $self->{'type'};
Modified: incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf
==============================================================================
--- incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf (original)
+++ incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf Mon Feb 2 21:54:48 2004
@@ -474,3 +474,70 @@
describe T_MSGID_SPAM_3_6 Message-ID has known spammer pattern
header T_MSGID_SPAM_3_7 Message-Id =~ /<[a-z]{7,}\@(\S+\.)+\S+>/
describe T_MSGID_SPAM_3_7 Message-ID has known spammer pattern
+
+# parser-based HTML obfuscation: 10%-wide buckets over the t_obfu_* ratios set in MsgContainer.pm
+body T_HTML_OBFU_WORD_00_10 eval:html_range('t_obfu_word_ratio','0.00','0.10')
+body T_HTML_OBFU_WORD_10_20 eval:html_range('t_obfu_word_ratio','0.10','0.20')
+body T_HTML_OBFU_WORD_20_30 eval:html_range('t_obfu_word_ratio','0.20','0.30')
+body T_HTML_OBFU_WORD_30_40 eval:html_range('t_obfu_word_ratio','0.30','0.40')
+body T_HTML_OBFU_WORD_40_50 eval:html_range('t_obfu_word_ratio','0.40','0.50')
+body T_HTML_OBFU_WORD_50_60 eval:html_range('t_obfu_word_ratio','0.50','0.60')
+body T_HTML_OBFU_WORD_60_70 eval:html_range('t_obfu_word_ratio','0.60','0.70')
+body T_HTML_OBFU_WORD_70_80 eval:html_range('t_obfu_word_ratio','0.70','0.80')
+body T_HTML_OBFU_WORD_80_90 eval:html_range('t_obfu_word_ratio','0.80','0.90')
+body T_HTML_OBFU_WORD_90_100 eval:html_range('t_obfu_word_ratio','0.90','1.00')
+
+body T_HTML_OBFU_NSPC_00_10 eval:html_range('t_obfu_nspc_ratio','0.00','0.10')
+body T_HTML_OBFU_NSPC_10_20 eval:html_range('t_obfu_nspc_ratio','0.10','0.20')
+body T_HTML_OBFU_NSPC_20_30 eval:html_range('t_obfu_nspc_ratio','0.20','0.30')
+body T_HTML_OBFU_NSPC_30_40 eval:html_range('t_obfu_nspc_ratio','0.30','0.40')
+body T_HTML_OBFU_NSPC_40_50 eval:html_range('t_obfu_nspc_ratio','0.40','0.50')
+body T_HTML_OBFU_NSPC_50_60 eval:html_range('t_obfu_nspc_ratio','0.50','0.60')
+body T_HTML_OBFU_NSPC_60_70 eval:html_range('t_obfu_nspc_ratio','0.60','0.70')
+body T_HTML_OBFU_NSPC_70_80 eval:html_range('t_obfu_nspc_ratio','0.70','0.80')
+body T_HTML_OBFU_NSPC_80_90 eval:html_range('t_obfu_nspc_ratio','0.80','0.90')
+body T_HTML_OBFU_NSPC_90_100 eval:html_range('t_obfu_nspc_ratio','0.90','1.00')
+
+body T_HTML_OBFU_WFMT_00_10 eval:html_range('t_obfu_wfmt_ratio','0.00','0.10')
+body T_HTML_OBFU_WFMT_10_20 eval:html_range('t_obfu_wfmt_ratio','0.10','0.20')
+body T_HTML_OBFU_WFMT_20_30 eval:html_range('t_obfu_wfmt_ratio','0.20','0.30')
+body T_HTML_OBFU_WFMT_30_40 eval:html_range('t_obfu_wfmt_ratio','0.30','0.40')
+body T_HTML_OBFU_WFMT_40_50 eval:html_range('t_obfu_wfmt_ratio','0.40','0.50')
+body T_HTML_OBFU_WFMT_50_60 eval:html_range('t_obfu_wfmt_ratio','0.50','0.60')
+body T_HTML_OBFU_WFMT_60_70 eval:html_range('t_obfu_wfmt_ratio','0.60','0.70')
+body T_HTML_OBFU_WFMT_70_80 eval:html_range('t_obfu_wfmt_ratio','0.70','0.80')
+body T_HTML_OBFU_WFMT_80_90 eval:html_range('t_obfu_wfmt_ratio','0.80','0.90')
+body T_HTML_OBFU_WFMT_90_100 eval:html_range('t_obfu_wfmt_ratio','0.90','1.00')
+
+body T_HTML_OBFU_NFMT_00_10 eval:html_range('t_obfu_nfmt_ratio','0.00','0.10')
+body T_HTML_OBFU_NFMT_10_20 eval:html_range('t_obfu_nfmt_ratio','0.10','0.20')
+body T_HTML_OBFU_NFMT_20_30 eval:html_range('t_obfu_nfmt_ratio','0.20','0.30')
+body T_HTML_OBFU_NFMT_30_40 eval:html_range('t_obfu_nfmt_ratio','0.30','0.40')
+body T_HTML_OBFU_NFMT_40_50 eval:html_range('t_obfu_nfmt_ratio','0.40','0.50')
+body T_HTML_OBFU_NFMT_50_60 eval:html_range('t_obfu_nfmt_ratio','0.50','0.60')
+body T_HTML_OBFU_NFMT_60_70 eval:html_range('t_obfu_nfmt_ratio','0.60','0.70')
+body T_HTML_OBFU_NFMT_70_80 eval:html_range('t_obfu_nfmt_ratio','0.70','0.80')
+body T_HTML_OBFU_NFMT_80_90 eval:html_range('t_obfu_nfmt_ratio','0.80','0.90')
+body T_HTML_OBFU_NFMT_90_100 eval:html_range('t_obfu_nfmt_ratio','0.90','1.00')
+
+body T_HTML_OBFU_WORD2_00_10 eval:html_range('t_obfu_word2_ratio','0.00','0.10')
+body T_HTML_OBFU_WORD2_10_20 eval:html_range('t_obfu_word2_ratio','0.10','0.20')
+body T_HTML_OBFU_WORD2_20_30 eval:html_range('t_obfu_word2_ratio','0.20','0.30')
+body T_HTML_OBFU_WORD2_30_40 eval:html_range('t_obfu_word2_ratio','0.30','0.40')
+body T_HTML_OBFU_WORD2_40_50 eval:html_range('t_obfu_word2_ratio','0.40','0.50')
+body T_HTML_OBFU_WORD2_50_60 eval:html_range('t_obfu_word2_ratio','0.50','0.60')
+body T_HTML_OBFU_WORD2_60_70 eval:html_range('t_obfu_word2_ratio','0.60','0.70')
+body T_HTML_OBFU_WORD2_70_80 eval:html_range('t_obfu_word2_ratio','0.70','0.80')
+body T_HTML_OBFU_WORD2_80_90 eval:html_range('t_obfu_word2_ratio','0.80','0.90')
+body T_HTML_OBFU_WORD2_90_100 eval:html_range('t_obfu_word2_ratio','0.90','1.00')
+
+body T_HTML_OBFU_NSPC2_00_10 eval:html_range('t_obfu_nspc2_ratio','0.00','0.10')
+body T_HTML_OBFU_NSPC2_10_20 eval:html_range('t_obfu_nspc2_ratio','0.10','0.20')
+body T_HTML_OBFU_NSPC2_20_30 eval:html_range('t_obfu_nspc2_ratio','0.20','0.30')
+body T_HTML_OBFU_NSPC2_30_40 eval:html_range('t_obfu_nspc2_ratio','0.30','0.40')
+body T_HTML_OBFU_NSPC2_40_50 eval:html_range('t_obfu_nspc2_ratio','0.40','0.50')
+body T_HTML_OBFU_NSPC2_50_60 eval:html_range('t_obfu_nspc2_ratio','0.50','0.60')
+body T_HTML_OBFU_NSPC2_60_70 eval:html_range('t_obfu_nspc2_ratio','0.60','0.70')
+body T_HTML_OBFU_NSPC2_70_80 eval:html_range('t_obfu_nspc2_ratio','0.70','0.80')
+body T_HTML_OBFU_NSPC2_80_90 eval:html_range('t_obfu_nspc2_ratio','0.80','0.90')
+body T_HTML_OBFU_NSPC2_90_100 eval:html_range('t_obfu_nspc2_ratio','0.90','1.00')