You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by mm...@apache.org on 2014/12/19 20:39:52 UTC

svn commit: r1646848 - in /spamassassin/trunk/lib/Mail/SpamAssassin: Message.pm Plugin/Bayes.pm

Author: mmartinec
Date: Fri Dec 19 19:39:51 2014
New Revision: 1646848

URL: http://svn.apache.org/r1646848
Log:
Bug 7115: Adding SHA digests of MIME parts as Bayes tokens allows Bayes to 'see' non-textual content

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm?rev=1646848&r1=1646847&r2=1646848&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm Fri Dec 19 19:39:51 2014
@@ -46,6 +46,11 @@ use strict;
 use warnings;
 use re 'taint';
 
+BEGIN {
+  eval { require Digest::SHA; import Digest::SHA qw(sha1 sha1_hex); 1 }
+  or do { require Digest::SHA1; import Digest::SHA1 qw(sha1 sha1_hex) }
+}
+
 use Mail::SpamAssassin;
 use Mail::SpamAssassin::Message::Node;
 use Mail::SpamAssassin::Message::Metadata;
@@ -117,6 +122,25 @@ sub new {
   $self->{master_deadline} = $opts->{'master_deadline'};
   $self->{suppl_attrib} = $opts->{'suppl_attrib'};
 
+  if ($self->{suppl_attrib}) {  # caller-provided additional information
+    # pristine_body_length is currently used by an eval test check_body_length
+    # Possible To-Do: Base the length on the @message array later down?
+    if (defined $self->{suppl_attrib}{body_size}) {
+      # Optional info provided by a caller; should reflect the original
+      # message body size if provided, and as such it may differ from the
+      # $self->{pristine_body} size, e.g. when the caller passed a truncated
+      # message to SpamAssassin, or when counting line-endings differently.
+      $self->{pristine_body_length} = $self->{suppl_attrib}{body_size};
+    }
+    if (ref $self->{suppl_attrib}{mimepart_digests}) {
+      # Optional info provided by a caller: an array of digest codes (e.g. SHA1)
+      # of each MIME part. Should reflect the original message if provided.
+      # As such it may differ from digests calculated by get_mimepart_digests(),
+      # e.g. when the caller passed a truncated message to SpamAssassin.
+      $self->{mimepart_digests} = $self->{suppl_attrib}{mimepart_digests};
+    }
+  }
+
   bless($self,$class);
 
   # create the metadata holder class
@@ -298,17 +322,8 @@ sub new {
   # will get modified below
   $self->{'pristine_body'} = join('', @message);
 
-  # pristine_body_length is currently used by an eval test check_body_length.  
-  # Possible To-Do: Base the length on the @message array later down?
-  # Or a different copy of the message post decoding?
-  if ($self->{suppl_attrib} && defined $self->{suppl_attrib}{body_size}) {
-    # optional info provided by a caller; should reflect the original
-    # message body size if provided, and as such it may differ from the
-    # $self->{pristine_body} size, e.g. when the caller passed a truncated
-    # message to SpamAssassin, or when counting line-endings differently
-    $self->{'pristine_body_length'} = $self->{suppl_attrib}{body_size};
-  } else {
-    $self->{'pristine_body_length'} = length($self->{'pristine_body'});
+  if (!defined $self->{pristine_body_length}) {
+    $self->{'pristine_body_length'} = length $self->{'pristine_body'};
   }
 
   # iterate over lines in reverse order
@@ -1051,6 +1066,20 @@ sub _parse_normal {
 }
 
 # ---------------------------------------------------------------------------
+
+sub get_mimepart_digests {
+  my ($self) = @_;
+
+  if (!exists $self->{mimepart_digests}) {
+    # traverse all parts which are leaves, recursively
+    $self->{mimepart_digests} =
+      [ map(sha1_hex($_->decode) . ':' . lc($_->{type}||''),
+            $self->find_parts(qr/^/,1,1)) ];
+  }
+  return $self->{mimepart_digests};
+}
+
+# ---------------------------------------------------------------------------
 
 sub get_rendered_body_text_array {
   my ($self) = @_;

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm?rev=1646848&r1=1646847&r2=1646848&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm Fri Dec 19 19:39:51 2014
@@ -1040,6 +1040,7 @@ sub _get_msgdata_from_permsgstatus {
   my $msgdata = { };
   $msgdata->{bayes_token_body} = $msg->{msg}->get_visible_rendered_body_text_array();
   $msgdata->{bayes_token_inviz} = $msg->{msg}->get_invisible_rendered_body_text_array();
+  $msgdata->{bayes_mimepart_digests} = $msg->{msg}->get_mimepart_digests();
   @{$msgdata->{bayes_token_uris}} = $msg->get_uri_list();
   return $msgdata;
 }
@@ -1068,6 +1069,23 @@ sub tokenize {
                                     @{$msgdata->{bayes_token_inviz}});
   }
 
+  # add digests and Content-Type of all MIME parts
+  if (ref $msgdata->{bayes_mimepart_digests}) {
+    my %shorthand = (  # some frequent MIME part contents for human readability
+     'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/plain'=> 'Empty-Plaintext',
+     'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/html' => 'Empty-HTML',
+     'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/xml'  => 'Empty-XML',
+     'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/plain'=> 'OneNL-Plaintext',
+     'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/html' => 'OneNL-HTML',
+     '71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/plain'=> 'TwoNL-Plaintext',
+     '71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/html' => 'TwoNL-HTML',
+    );
+    my(@t) = map('MIME:' . ($shorthand{$_} || $_),
+                 @{ $msgdata->{bayes_mimepart_digests} });
+    dbg("bayes: mime-part token %s", $_) for @t;
+    push (@tokens, @t);
+  }
+
   # Tokenize the headers
   my %hdrs = $self->_tokenize_headers ($msg);
   while( my($prefix, $value) = each %hdrs ) {