You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2004/01/14 07:16:30 UTC
svn commit: rev 6166 - in incubator/spamassassin/trunk: lib/Mail/SpamAssassin rules
Author: felicity
Date: Tue Jan 13 22:16:30 2004
New Revision: 6166
Modified:
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf
Log:
Added new rules to determine the percentage difference between the text/html
and text/* attachments in a multipart/alternative mail. It has an amazing hit
rate and accuracy on my corpus, even given the simple algorithm I used.
Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm Tue Jan 13 22:16:30 2004
@@ -3541,4 +3541,71 @@
###########################################################################
+# Eval test: is the multipart/alternative difference percentage stored in
+# $self->{madiff} greater than $min and no greater than $max?
+#
+#   $fulltext  accepted for the eval-test calling convention; not used here
+#   $min       exclusive lower bound; 0, an omitted value or the literal
+#              string "undef" means "no lower bound"
+#   $max       inclusive upper bound; an omitted value or the literal
+#              string "undef" means "no upper bound"
+#
+# Returns 1 when the percentage is inside the range, 0 otherwise.
+sub multipart_alternative_difference {
+  my($self, $fulltext, $min, $max) = @_;
+
+  # Only compute the percentage if $self->{madiff} has not been set yet.
+  $self->_multipart_alternative_difference() unless ( exists $self->{madiff} );
+
+  # Decide "is this bound absent?" before any numeric comparison, so an
+  # omitted argument or the string "undef" is never compared as a number
+  # (which would warn, and would make an omitted $max behave like 0).
+  my $no_min = ( !defined $min || $min eq "undef" || $min == 0 );
+  my $no_max = ( !defined $max || $max eq "undef" );
+
+  if (($no_min || $self->{madiff} > $min) &&
+      ($no_max || $self->{madiff} <= $max)) {
+    return 1;
+  }
+  return 0;
+}
+
+# Compute $self->{madiff} for the current message.
+#
+# For every multipart/alternative part, the words of its HTML-rendered text
+# sub-parts are compared against the words of its other text sub-parts.
+# The share (in percent) of distinct HTML words that occur more often in
+# the HTML than in the other text is that part's difference; the largest
+# difference over all multipart/alternative parts ends up in
+# $self->{madiff}.  The value stays 0 when nothing could be compared.
+sub _multipart_alternative_difference {
+  my($self) = @_;
+
+  my @ma = $self->{msg}->{mime_parts}->find_parts(qr@^multipart/alternative\b@i);
+
+  $self->{madiff} = 0;
+
+  # Nothing to compare without a multipart/alternative section.
+  return unless @ma;
+
+  # A lone multipart/alternative holding exactly text/html followed by
+  # text/calendar is how Exchange meeting requests arrive; skip those.
+  if ( @ma == 1 ) {
+    my $alt = $ma[0];
+    return if ( @{$alt->{body_parts}} == 2 &&
+                $alt->{body_parts}->[0]->{type} =~ m@^text/html\b@i &&
+                $alt->{body_parts}->[1]->{type} =~ m@^text/calendar\b@i );
+  }
+
+  foreach my $alt ( @ma ) {
+    my %html_seen = ();    # word => occurrences in HTML-rendered parts
+    my %text_seen = ();    # word => occurrences in all other text parts
+
+    # Tally the whitespace-separated tokens that contain a word character.
+    foreach my $leaf ( $alt->find_parts(qr@^text\b@i) ) {
+      my $is_html = ( $leaf->{'rendered_type'} =~ m@^text/html\b@i );
+      my @words = grep { /\w/ } split(/\s+/, $leaf->{'rendered'});
+
+      if ( $is_html ) {
+        # tokens starting with "URI:" are left out of the HTML tally
+        $html_seen{$_}++ for grep { !/^URI:/ } @words;
+      }
+      else {
+        $text_seen{$_}++ for @words;
+      }
+    }
+
+    # Without any HTML words there is nothing to measure for this part.
+    my $html_total = scalar(keys %html_seen);
+    next if ( $html_total == 0 );
+
+    # Drop every HTML word that the text side has at least as many times.
+    foreach my $word ( keys %text_seen ) {
+      next unless ( exists $html_seen{$word} );
+      delete $html_seen{$word} if ( $html_seen{$word} <= $text_seen{$word} );
+    }
+
+    # Whatever is left is HTML-only; keep the largest percentage seen so far.
+    my $left = scalar(keys %html_seen);
+    my $pct = $left/$html_total*100;
+    $self->{madiff} = $pct if ( $pct > $self->{madiff} );
+
+    dbg(sprintf "madiff: left: %d, orig: %d, max-difference: %0.2f%%", $left, $html_total, $self->{madiff});
+  }
+
+  return;
+}
+
+###########################################################################
+
1;
Modified: incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf
==============================================================================
--- incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf (original)
+++ incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf Tue Jan 13 22:16:30 2004
@@ -370,3 +370,24 @@
uri T_URI_UNPRINTABLE /%0/
describe T_URI_UNPRINTABLE URI contains unprintable characters
+# These rules use a simple algorithm to determine if the text and html
+# parts of a multipart/alternative message are different...
+# Even with the simple algorithm, it's amazing! - 2004.01.14, tvd
+#
+#OVERALL% SPAM% HAM% S/O RANK SCORE NAME
+# 140567 59829 80738 0.426 0.00 0.00 (all messages)
+#100.000 42.5626 57.4374 0.426 0.00 0.00 (all messages as %)
+# 29.308 68.8529 0.0050 1.000 1.00 0.01 T_MPART_ALT_DIFF_99
+# 29.353 68.9549 0.0074 1.000 1.00 0.01 T_MPART_ALT_DIFF_98
+# 29.429 69.1220 0.0149 1.000 1.00 0.01 T_MPART_ALT_DIFF_97
+# 29.470 69.2123 0.0198 1.000 1.00 0.01 T_MPART_ALT_DIFF_96
+# 29.483 69.2323 0.0285 1.000 1.00 0.01 T_MPART_ALT_DIFF_95
+# 29.771 69.8892 0.0421 0.999 1.00 0.01 T_MPART_ALT_DIFF_90
+#
+#
+body T_MPART_ALT_DIFF_90 eval:multipart_alternative_difference('90', '100')
+body T_MPART_ALT_DIFF_95 eval:multipart_alternative_difference('95', '100')
+body T_MPART_ALT_DIFF_96 eval:multipart_alternative_difference('96', '100')
+body T_MPART_ALT_DIFF_97 eval:multipart_alternative_difference('97', '100')
+body T_MPART_ALT_DIFF_98 eval:multipart_alternative_difference('98', '100')
+body T_MPART_ALT_DIFF_99 eval:multipart_alternative_difference('99', '100')