You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by mm...@apache.org on 2014/12/24 01:27:01 UTC
svn commit: r1647707 - in /spamassassin/trunk/lib/Mail/SpamAssassin: Conf.pm
Plugin/Bayes.pm
Author: mmartinec
Date: Wed Dec 24 00:27:01 2014
New Revision: 1647707
URL: http://svn.apache.org/r1647707
Log:
Bug 7115: Adding SHA digests of MIME parts as Bayes tokens allows bayes to see non-textual content - added configurability
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm?rev=1647707&r1=1647706&r2=1647707&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm Wed Dec 24 00:27:01 2014
@@ -1929,6 +1929,76 @@ for details on how Bayes auto-learning i
type => $CONF_TYPE_BOOL,
});
+=item bayes_token_sources (default: header visible invisible uri)
+
+Controls which sources in a mail message can contribute tokens (e.g. words,
+phrases, etc.) to a Bayes classifier. The argument is a space-separated list
+of keywords (I<header>, I<visible>, I<invisible>, I<uri>, I<mimepart>), each
+of which may be prefixed by a I<no> to indicate its exclusion. Additionally
+two reserved keywords are allowed: I<all> and I<none> (or: I<noall>). The list
+of keywords is processed sequentially: a keyword I<all> adds all available
+keywords to a set being built, a I<none> or I<noall> clears the set, other
+non-negated keywords are added to the set, and negated keywords are removed
+from the set. Keywords are case-insensitive.
+
+The default set is: I<header> I<visible> I<invisible> I<uri>, which is
+equivalent, for example, to: I<All> I<NoMIMEpart>. The reason why I<mimepart>
+is not currently in the default set is that it is a newer source (introduced
+with SpamAssassin version 3.4.1) and not much experience has yet been gathered
+regarding its usefulness.
+
+See also option C<bayes_ignore_header> for fine-grained control over individual
+header fields under the umbrella of a more general keyword I<header> here.
+
+Keywords imply the following data sources:
+
+=over 4
+
+=item I<header> - tokens collected from a message header section
+
+=item I<visible> - words from visible text (plain or HTML) in a message body
+
+=item I<invisible> - hidden/invisible text in HTML parts of a message body
+
+=item I<uri> - URIs collected from a message body
+
+=item I<mimepart> - digests (hashes) of all MIME parts (textual or non-textual) of a message, computed after Base64 and quoted-printable decoding, suffixed by their Content-Type
+
+=item I<all> - adds all the above keywords to the set being assembled
+
+=item I<none> or I<noall> - removes all keywords from the set
+
+=back
+
+The C<bayes_token_sources> directive may appear multiple times; its keywords
+are interpreted sequentially, adding or removing items from the final set
+as they appear in their order in C<bayes_token_sources> directive(s).
+
+=cut
+
+ push (@cmds, {
+ setting => 'bayes_token_sources',
+ default => { map(($_,1), qw(header visible invisible uri)) }, # mimepart
+ type => $CONF_TYPE_HASH_KEY_VALUE,
+ code => sub {
+ my ($self, $key, $value, $line) = @_;
+ return $MISSING_REQUIRED_VALUE if $value eq '';
+ my $h = ($self->{bayes_token_sources} ||= {});
+ my %all_kw = map(($_,1), qw(header visible invisible uri mimepart));
+ foreach (split(' ', lc $value)) {
+ if (/^(none|noall)\z/) {
+ %$h = ();
+ } elsif ($_ eq 'all') {
+ %$h = %all_kw;
+ } elsif (/^(no)?(.+)\z/s && exists $all_kw{$2}) {
+ $h->{$2} = defined $1 ? 0 : 1;
+ } else {
+ return $INVALID_VALUE;
+ }
+ }
+ }
+ });
+
=item bayes_ignore_header header_name
If you receive mail filtered by upstream mail systems, like
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm?rev=1647707&r1=1647706&r2=1647707&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm Wed Dec 24 00:27:01 2014
@@ -290,8 +290,8 @@ sub spamd_child_init {
sub check_bayes {
my ($self, $pms, $fulltext, $min, $max) = @_;
- return 0 if (!$pms->{conf}->{use_learner});
- return 0 if (!$pms->{conf}->{use_bayes} || !$pms->{conf}->{use_bayes_rules});
+ return 0 if (!$self->{conf}->{use_learner});
+ return 0 if (!$self->{conf}->{use_bayes} || !$self->{conf}->{use_bayes_rules});
if (!exists ($pms->{bayes_score})) {
my $timer = $self->{main}->time_method("check_bayes");
@@ -302,7 +302,7 @@ sub check_bayes {
($min == 0 || $pms->{bayes_score} > $min) &&
($max eq "undef" || $pms->{bayes_score} <= $max))
{
- if ($pms->{conf}->{detailed_bayes_score}) {
+ if ($self->{conf}->{detailed_bayes_score}) {
$pms->test_log(sprintf ("score: %3.4f, hits: %s",
$pms->{bayes_score},
$pms->{bayes_hits}));
@@ -1035,13 +1035,18 @@ sub get_body_from_msg {
}
sub _get_msgdata_from_permsgstatus {
- my ($self, $msg) = @_;
+ my ($self, $pms) = @_;
+ my $t_src = $self->{conf}->{bayes_token_sources};
my $msgdata = { };
- $msgdata->{bayes_token_body} = $msg->{msg}->get_visible_rendered_body_text_array();
- $msgdata->{bayes_token_inviz} = $msg->{msg}->get_invisible_rendered_body_text_array();
- $msgdata->{bayes_mimepart_digests} = $msg->{msg}->get_mimepart_digests();
- @{$msgdata->{bayes_token_uris}} = $msg->get_uri_list();
+ $msgdata->{bayes_token_body} =
+ $pms->{msg}->get_visible_rendered_body_text_array() if $t_src->{visible};
+ $msgdata->{bayes_token_inviz} =
+ $pms->{msg}->get_invisible_rendered_body_text_array() if $t_src->{invisible};
+ $msgdata->{bayes_mimepart_digests} =
+ $pms->{msg}->get_mimepart_digests() if $t_src->{mimepart};
+ @{$msgdata->{bayes_token_uris}} =
+ $pms->get_uri_list() if $t_src->{uri};
return $msgdata;
}
@@ -1051,26 +1056,37 @@ sub _get_msgdata_from_permsgstatus {
sub tokenize {
my ($self, $msg, $msgdata) = @_;
- # the body
- my @tokens = map { $self->_tokenize_line ($_, '', 1) }
- @{$msgdata->{bayes_token_body}};
+ my $t_src = $self->{conf}->{bayes_token_sources};
+ my @tokens;
+ # visible tokens from the body
+ if ($msgdata->{bayes_token_body}) {
+ dbg("bayes: tokenizing body");
+ push(@tokens, map($self->_tokenize_line ($_, '', 1),
+ @{$msgdata->{bayes_token_body}} ));
+ }
# the URI list
- push (@tokens, map { $self->_tokenize_line ($_, '', 2) }
- @{$msgdata->{bayes_token_uris}});
-
+ if ($msgdata->{bayes_token_uris}) {
+ dbg("bayes: tokenizing uri");
+ push(@tokens, map($self->_tokenize_line ($_, '', 2),
+ @{$msgdata->{bayes_token_uris}} ));
+ }
# add invisible tokens
- if (ADD_INVIZ_TOKENS_I_PREFIX) {
- push (@tokens, map { $self->_tokenize_line ($_, "I*:", 1) }
- @{$msgdata->{bayes_token_inviz}});
- }
- if (ADD_INVIZ_TOKENS_NO_PREFIX) {
- push (@tokens, map { $self->_tokenize_line ($_, "", 1) }
- @{$msgdata->{bayes_token_inviz}});
+ if ($msgdata->{bayes_token_inviz}) {
+ dbg("bayes: tokenizing invisible");
+ if (ADD_INVIZ_TOKENS_I_PREFIX) {
+ push(@tokens, map($self->_tokenize_line ($_, "I*:", 1),
+ @{$msgdata->{bayes_token_inviz}} ));
+ }
+ if (ADD_INVIZ_TOKENS_NO_PREFIX) {
+ push(@tokens, map($self->_tokenize_line ($_, "", 1),
+ @{$msgdata->{bayes_token_inviz}} ));
+ }
}
# add digests and Content-Type of all MIME parts
- if (ref $msgdata->{bayes_mimepart_digests}) {
+ if ($msgdata->{bayes_mimepart_digests}) {
+ dbg("bayes: tokenizing mime parts");
my %shorthand = ( # some frequent MIME part contents for human readability
'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/plain'=> 'Empty-Plaintext',
'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/html' => 'Empty-HTML',
@@ -1087,9 +1103,12 @@ sub tokenize {
}
# Tokenize the headers
- my %hdrs = $self->_tokenize_headers ($msg);
- while( my($prefix, $value) = each %hdrs ) {
- push(@tokens, $self->_tokenize_line ($value, "H$prefix:", 0));
+ if ($t_src->{header}) {
+ dbg("bayes: tokenizing header");
+ my %hdrs = $self->_tokenize_headers ($msg);
+ while( my($prefix, $value) = each %hdrs ) {
+ push(@tokens, $self->_tokenize_line ($value, "H$prefix:", 0));
+ }
}
# Go ahead and uniq the array, skip null tokens (can happen sometimes)