You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by mm...@apache.org on 2014/12/19 20:39:52 UTC
svn commit: r1646848 - in /spamassassin/trunk/lib/Mail/SpamAssassin:
Message.pm Plugin/Bayes.pm
Author: mmartinec
Date: Fri Dec 19 19:39:51 2014
New Revision: 1646848
URL: http://svn.apache.org/r1646848
Log:
Bug 7115: Adding SHA digests of MIME parts as Bayes tokens allows bayes to 'see' non-textual content
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm?rev=1646848&r1=1646847&r2=1646848&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm Fri Dec 19 19:39:51 2014
@@ -46,6 +46,11 @@ use strict;
use warnings;
use re 'taint';
+BEGIN {
+ eval { require Digest::SHA; import Digest::SHA qw(sha1 sha1_hex); 1 }
+ or do { require Digest::SHA1; import Digest::SHA1 qw(sha1 sha1_hex) }
+}
+
use Mail::SpamAssassin;
use Mail::SpamAssassin::Message::Node;
use Mail::SpamAssassin::Message::Metadata;
@@ -117,6 +122,25 @@ sub new {
$self->{master_deadline} = $opts->{'master_deadline'};
$self->{suppl_attrib} = $opts->{'suppl_attrib'};
+ if ($self->{suppl_attrib}) { # caller-provided additional information
+ # pristine_body_length is currently used by an eval test check_body_length
+ # Possible To-Do: Base the length on the @message array later down?
+ if (defined $self->{suppl_attrib}{body_size}) {
+ # Optional info provided by a caller; should reflect the original
+ # message body size if provided, and as such it may differ from the
+ # $self->{pristine_body} size, e.g. when the caller passed a truncated
+ # message to SpamAssassin, or when counting line-endings differently.
+ $self->{pristine_body_length} = $self->{suppl_attrib}{body_size};
+ }
+ if (ref $self->{suppl_attrib}{mimepart_digests}) {
+ # Optional info provided by a caller: an array of digest codes (e.g. SHA1)
+ # of each MIME part. Should reflect the original message if provided.
+ # As such it may differ from digests calculated by get_mimepart_digests(),
+ # e.g. when the caller passed a truncated message to SpamAssassin.
+ $self->{mimepart_digests} = $self->{suppl_attrib}{mimepart_digests};
+ }
+ }
+
bless($self,$class);
# create the metadata holder class
@@ -298,17 +322,8 @@ sub new {
# will get modified below
$self->{'pristine_body'} = join('', @message);
- # pristine_body_length is currently used by an eval test check_body_length.
- # Possible To-Do: Base the length on the @message array later down?
- # Or a different copy of the message post decoding?
- if ($self->{suppl_attrib} && defined $self->{suppl_attrib}{body_size}) {
- # optional info provided by a caller; should reflect the original
- # message body size if provided, and as such it may differ from the
- # $self->{pristine_body} size, e.g. when the caller passed a truncated
- # message to SpamAssassin, or when counting line-endings differently
- $self->{'pristine_body_length'} = $self->{suppl_attrib}{body_size};
- } else {
- $self->{'pristine_body_length'} = length($self->{'pristine_body'});
+ if (!defined $self->{pristine_body_length}) {
+ $self->{'pristine_body_length'} = length $self->{'pristine_body'};
}
# iterate over lines in reverse order
@@ -1051,6 +1066,20 @@ sub _parse_normal {
}
# ---------------------------------------------------------------------------
+
+sub get_mimepart_digests {  # returns an array ref of "<sha1-hex>:<type>" strings, one entry per MIME part
+ my ($self) = @_;
+
+ if (!exists $self->{mimepart_digests}) {  # compute only once; an already present entry (e.g. set in new() from suppl_attrib) is returned unchanged
+ # traverse all leaf parts, recursively (the 1,1 arguments presumably select leaves-only and recursion - confirm against find_parts); each entry is sha1_hex of the decoded part, a ':', and the lowercased type ('' if unset)
+ $self->{mimepart_digests} =
+ [ map(sha1_hex($_->decode) . ':' . lc($_->{type}||''),
+ $self->find_parts(qr/^/,1,1)) ];
+ }
+ return $self->{mimepart_digests};  # same reference as stored on the object, not a copy
+}
+
+# ---------------------------------------------------------------------------
sub get_rendered_body_text_array {
my ($self) = @_;
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm?rev=1646848&r1=1646847&r2=1646848&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm Fri Dec 19 19:39:51 2014
@@ -1040,6 +1040,7 @@ sub _get_msgdata_from_permsgstatus {
my $msgdata = { };
$msgdata->{bayes_token_body} = $msg->{msg}->get_visible_rendered_body_text_array();
$msgdata->{bayes_token_inviz} = $msg->{msg}->get_invisible_rendered_body_text_array();
+ $msgdata->{bayes_mimepart_digests} = $msg->{msg}->get_mimepart_digests();
@{$msgdata->{bayes_token_uris}} = $msg->get_uri_list();
return $msgdata;
}
@@ -1068,6 +1069,23 @@ sub tokenize {
@{$msgdata->{bayes_token_inviz}});
}
+ # add digests and Content-Type of all MIME parts
+ if (ref $msgdata->{bayes_mimepart_digests}) {
+ my %shorthand = ( # some frequent MIME part contents for human readability
+ 'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/plain'=> 'Empty-Plaintext',
+ 'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/html' => 'Empty-HTML',
+ 'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/xml' => 'Empty-XML',
+ 'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/plain'=> 'OneNL-Plaintext',
+ 'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/html' => 'OneNL-HTML',
+ '71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/plain'=> 'TwoNL-Plaintext',
+ '71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/html' => 'TwoNL-HTML',
+ );
+ my(@t) = map('MIME:' . ($shorthand{$_} || $_),
+ @{ $msgdata->{bayes_mimepart_digests} });
+ dbg("bayes: mime-part token %s", $_) for @t;
+ push (@tokens, @t);
+ }
+
# Tokenize the headers
my %hdrs = $self->_tokenize_headers ($msg);
while( my($prefix, $value) = each %hdrs ) {