You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2004/01/14 07:16:30 UTC

svn commit: rev 6166 - in incubator/spamassassin/trunk: lib/Mail/SpamAssassin rules

Author: felicity
Date: Tue Jan 13 22:16:30 2004
New Revision: 6166

Modified:
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
   incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf
Log:
Added new rules to determine the % difference between text/html and text/*
attachments in a multipart/alternative mail.  It's got amazing hit-rate
and accuracy on my corpus even given the simple algorithm I used.


Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm	Tue Jan 13 22:16:30 2004
@@ -3541,4 +3541,71 @@
 
 ###########################################################################
 
+# Eval test: 1 if the multipart/alternative difference % (computed once and
+# cached in $self->{madiff}) is above $min and at most $max, else 0.  A $min of
+# 0/undef or $max of "undef"/undef leaves that bound open; $fulltext is unused.
+sub multipart_alternative_difference {
+  my($self, $fulltext, $min, $max) = @_;
+  $self->_multipart_alternative_difference() unless ( exists $self->{madiff} );
+  my $diff = $self->{madiff};
+  return 0 unless ( !defined $min || $min == 0 || $diff > $min );
+  return 0 unless ( !defined $max || $max eq "undef" || $diff <= $max );
+  return 1;
+}
+
+# Sets $self->{madiff} to the largest percentage, over all multipart/alternative
+# parts, of distinct rendered-HTML words that the text parts do not cover.
+sub _multipart_alternative_difference {
+  my($self) = @_;
+
+  my @ma = $self->{msg}->{mime_parts}->find_parts(qr@^multipart/alternative\b@i);
+
+  $self->{madiff} = 0;   # default result when nothing below raises it
+
+  # Exchange meeting requests come in as m/a text/html text/calendar ...
+  # Ignore any messages without a multipart/alternative section as well ...
+  if ( !@ma || (@ma == 1 && @{$ma[0]->{body_parts}} == 2 &&
+                $ma[0]->{body_parts}->[0]->{type} =~ m@^text/html\b@i &&
+                $ma[0]->{body_parts}->[1]->{type} =~ m@^text/calendar\b@i) ) {
+    return;
+  }
+
+  # Only deal with text/plain and text/html ...
+  foreach my $part ( @ma ) {
+    my %html = ();
+    my %text = ();
+
+    foreach my $text ( $part->find_parts(qr@^text\b@i) ) {
+      # A part with no rendered text contributes no words; skip it quietly.
+      my $rnd = $text->{'rendered'};
+      next unless ( defined $rnd );
+
+      my @words = grep(/\w/,split(/\s+/,$rnd));
+      if ( ($text->{'rendered_type'} || '') =~ m@^text/html\b@i ) {
+        # Words starting with "URI:" are never counted on the HTML side.
+        $html{$_}++ foreach ( grep { !/^URI:/ } @words );
+      }
+      else {
+        $text{$_}++ foreach ( @words );
+      }
+    }
+
+    my $orig = keys %html;
+    next if ( $orig == 0 );
+
+    # Drop each HTML word that the text parts use at least as many times.
+    foreach my $w ( keys %text ) {
+      delete $html{$w} if ( exists $html{$w} && $html{$w}-$text{$w} < 1 );
+    }
+
+    my $diff = scalar(keys %html)/$orig*100;
+    $self->{madiff} = $diff if ( $diff > $self->{madiff} );
+
+    dbg(sprintf "madiff: left: %d, orig: %d, max-difference: %0.2f%%", scalar(keys %html), $orig, $self->{madiff});
+  }
+  return;
+}
+
+###########################################################################
+
 1;

Modified: incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf
==============================================================================
--- incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf	(original)
+++ incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf	Tue Jan 13 22:16:30 2004
@@ -370,3 +370,24 @@
 uri	T_URI_UNPRINTABLE	/%0/
 describe T_URI_UNPRINTABLE	URI contains unprintable characters
 
+# These rules use a simple algorithm to determine if the text and html
+# parts of a multipart/alternative message are different...
+# Even with the simple algorithm, it's amazing!  - 2004.01.14, tvd
+#
+#OVERALL%   SPAM%     HAM%     S/O    RANK   SCORE  NAME
+# 140567    59829    80738    0.426   0.00    0.00  (all messages)
+#100.000  42.5626  57.4374    0.426   0.00    0.00  (all messages as %)
+# 29.308  68.8529   0.0050    1.000   1.00    0.01  T_MPART_ALT_DIFF_99
+# 29.353  68.9549   0.0074    1.000   1.00    0.01  T_MPART_ALT_DIFF_98
+# 29.429  69.1220   0.0149    1.000   1.00    0.01  T_MPART_ALT_DIFF_97
+# 29.470  69.2123   0.0198    1.000   1.00    0.01  T_MPART_ALT_DIFF_96
+# 29.483  69.2323   0.0285    1.000   1.00    0.01  T_MPART_ALT_DIFF_95
+# 29.771  69.8892   0.0421    0.999   1.00    0.01  T_MPART_ALT_DIFF_90
+# Arguments are (min, max): a rule hits when the difference is greater than
+# min and at most max, so a message can hit several of these rules at once.
+body T_MPART_ALT_DIFF_90      eval:multipart_alternative_difference('90', '100')
+body T_MPART_ALT_DIFF_95      eval:multipart_alternative_difference('95', '100')
+body T_MPART_ALT_DIFF_96      eval:multipart_alternative_difference('96', '100')
+body T_MPART_ALT_DIFF_97      eval:multipart_alternative_difference('97', '100')
+body T_MPART_ALT_DIFF_98      eval:multipart_alternative_difference('98', '100')
+body T_MPART_ALT_DIFF_99      eval:multipart_alternative_difference('99', '100')