You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by mm...@apache.org on 2014/12/24 01:27:01 UTC
svn commit: r1647707 - in /spamassassin/trunk/lib/Mail/SpamAssassin: Conf.pm Plugin/Bayes.pm

Author: mmartinec
Date: Wed Dec 24 00:27:01 2014
New Revision: 1647707

URL: http://svn.apache.org/r1647707
Log:
Bug 7115: Adding SHA digests of MIME parts as Bayes tokens allows bayes to see non-textual content - added configurability

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm?rev=1647707&r1=1647706&r2=1647707&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm Wed Dec 24 00:27:01 2014
@@ -1929,6 +1929,76 @@ for details on how Bayes auto-learning i
     type => $CONF_TYPE_BOOL,
   });
 
+=item bayes_token_sources  (default: header visible invisible uri)
+
+Controls which sources in a mail message can contribute tokens (e.g. words,
+phrases, etc.) to a Bayes classifier. The argument is a space-separated list
+of keywords (I<header>, I<visible>, I<invisible>, I<uri>, I<mimepart>), each
+of which may be prefixed by a I<no> to indicate its exclusion. Additionally
+two reserved keywords are allowed: I<all> and I<none> (or: I<noall>). The list
+of keywords is processed sequentially: a keyword I<all> adds all available
+keywords to a set being built, a I<none> or I<noall> clears the set, other
+non-negated keywords are added to the set, and negated keywords are removed
+from the set. Keywords are case-insensitive.
+
+The default set is: I<header> I<visible> I<invisible> I<uri>, which is
+equivalent, for example, to: I<All> I<NoMIMEpart>. The reason why I<mimepart>
+is not currently in the default set is that it is a newer source (introduced
+with SpamAssassin version 3.4.1) and not much experience has yet been gathered
+regarding its usefulness.
+
+See also option C<bayes_ignore_header> for fine-grained control over individual
+header fields under the umbrella of a more general keyword I<header> here.
+
+Keywords imply the following data sources:
+
+=over 4
+
+=item I<header> - tokens collected from a message header section
+
+=item I<visible> - words from visible text (plain or HTML) in a message body
+
+=item I<invisible> - hidden/invisible text in HTML parts of a message body
+
+=item I<uri> - URIs collected from a message body
+
+=item I<mimepart> - digests (hashes) of all MIME parts (textual or non-textual) of a message, computed after Base64 and quoted-printable decoding, suffixed by their Content-Type
+
+=item I<all> - adds all the above keywords to the set being assembled
+
+=item I<none> or I<noall> - removes all keywords from the set
+
+=back
+
+The C<bayes_token_sources> directive may appear multiple times; its keywords
+are interpreted sequentially, adding or removing items from the final set
+as they appear in their order in C<bayes_token_sources> directive(s).
+
+=cut
+
+  push (@cmds, {
+    setting => 'bayes_token_sources',
+    default => { map(($_,1), qw(header visible invisible uri)) },  # mimepart
+    type => $CONF_TYPE_HASH_KEY_VALUE,
+    code => sub {
+      my ($self, $key, $value, $line) = @_;
+      return $MISSING_REQUIRED_VALUE  if $value eq '';
+      my $h = ($self->{bayes_token_sources} ||= {});
+      my %all_kw = map(($_,1), qw(header visible invisible uri mimepart));
+      foreach (split(' ', lc $value)) {
+        if (/^(none|noall)\z/) {
+          %$h = ();
+        } elsif ($_ eq 'all') {
+          %$h = %all_kw;
+        } elsif (/^(no)?(.+)\z/s && exists $all_kw{$2}) {
+          $h->{$2} = defined $1 ? 0 : 1;
+        } else {
+          return $INVALID_VALUE;
+        }
+      }
+    }
+  });
+
 =item bayes_ignore_header header_name
 
 If you receive mail filtered by upstream mail systems, like

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm?rev=1647707&r1=1647706&r2=1647707&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm Wed Dec 24 00:27:01 2014
@@ -290,8 +290,8 @@ sub spamd_child_init {
 sub check_bayes {
   my ($self, $pms, $fulltext, $min, $max) = @_;
 
-  return 0 if (!$pms->{conf}->{use_learner});
-  return 0 if (!$pms->{conf}->{use_bayes} || !$pms->{conf}->{use_bayes_rules});
+  return 0 if (!$self->{conf}->{use_learner});
+  return 0 if (!$self->{conf}->{use_bayes} || !$self->{conf}->{use_bayes_rules});
 
   if (!exists ($pms->{bayes_score})) {
     my $timer = $self->{main}->time_method("check_bayes");
@@ -302,7 +302,7 @@ sub check_bayes {
       ($min == 0 || $pms->{bayes_score} > $min) &&
       ($max eq "undef" || $pms->{bayes_score} <= $max))
   {
-      if ($pms->{conf}->{detailed_bayes_score}) {
+      if ($self->{conf}->{detailed_bayes_score}) {
         $pms->test_log(sprintf ("score: %3.4f, hits: %s",
                                  $pms->{bayes_score},
                                  $pms->{bayes_hits}));
@@ -1035,13 +1035,18 @@ sub get_body_from_msg {
 }
 
 sub _get_msgdata_from_permsgstatus {
-  my ($self, $msg) = @_;
+  my ($self, $pms) = @_;
 
+  my $t_src = $self->{conf}->{bayes_token_sources};
   my $msgdata = { };
-  $msgdata->{bayes_token_body} = $msg->{msg}->get_visible_rendered_body_text_array();
-  $msgdata->{bayes_token_inviz} = $msg->{msg}->get_invisible_rendered_body_text_array();
-  $msgdata->{bayes_mimepart_digests} = $msg->{msg}->get_mimepart_digests();
-  @{$msgdata->{bayes_token_uris}} = $msg->get_uri_list();
+  $msgdata->{bayes_token_body} =
+    $pms->{msg}->get_visible_rendered_body_text_array() if $t_src->{visible};
+  $msgdata->{bayes_token_inviz} =
+    $pms->{msg}->get_invisible_rendered_body_text_array() if $t_src->{invisible};
+  $msgdata->{bayes_mimepart_digests} =
+    $pms->{msg}->get_mimepart_digests() if $t_src->{mimepart};
+  @{$msgdata->{bayes_token_uris}} =
+    $pms->get_uri_list() if $t_src->{uri};
   return $msgdata;
 }
 
@@ -1051,26 +1056,37 @@ sub _get_msgdata_from_permsgstatus {
 sub tokenize {
   my ($self, $msg, $msgdata) = @_;
 
-  # the body
-  my @tokens = map { $self->_tokenize_line ($_, '', 1) }
-                                    @{$msgdata->{bayes_token_body}};
+  my $t_src = $self->{conf}->{bayes_token_sources};
+  my @tokens;
 
+  # visible tokens from the body
+  if ($msgdata->{bayes_token_body}) {
+    dbg("bayes: tokenizing body");
+    push(@tokens, map($self->_tokenize_line ($_, '', 1),
+                      @{$msgdata->{bayes_token_body}} ));
+  }
   # the URI list
-  push (@tokens, map { $self->_tokenize_line ($_, '', 2) }
-                                    @{$msgdata->{bayes_token_uris}});
-
+  if ($msgdata->{bayes_token_uris}) {
+    dbg("bayes: tokenizing uri");
+    push(@tokens, map($self->_tokenize_line ($_, '', 2),
+                      @{$msgdata->{bayes_token_uris}} ));
+  }
   # add invisible tokens
-  if (ADD_INVIZ_TOKENS_I_PREFIX) {
-    push (@tokens, map { $self->_tokenize_line ($_, "I*:", 1) }
-                                    @{$msgdata->{bayes_token_inviz}});
-  }
-  if (ADD_INVIZ_TOKENS_NO_PREFIX) {
-    push (@tokens, map { $self->_tokenize_line ($_, "", 1) }
-                                    @{$msgdata->{bayes_token_inviz}});
+  if ($msgdata->{bayes_token_inviz}) {
+    dbg("bayes: tokenizing invisible");
+    if (ADD_INVIZ_TOKENS_I_PREFIX) {
+      push(@tokens, map($self->_tokenize_line ($_, "I*:", 1),
+                        @{$msgdata->{bayes_token_inviz}} ));
+    }
+    if (ADD_INVIZ_TOKENS_NO_PREFIX) {
+      push(@tokens, map($self->_tokenize_line ($_, "", 1),
+                        @{$msgdata->{bayes_token_inviz}} ));
+    }
   }
 
   # add digests and Content-Type of all MIME parts
-  if (ref $msgdata->{bayes_mimepart_digests}) {
+  if ($msgdata->{bayes_mimepart_digests}) {
+    dbg("bayes: tokenizing mime parts");
     my %shorthand = (  # some frequent MIME part contents for human readability
      'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/plain'=> 'Empty-Plaintext',
      'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/html' => 'Empty-HTML',
@@ -1087,9 +1103,12 @@ sub tokenize {
   }
 
   # Tokenize the headers
-  my %hdrs = $self->_tokenize_headers ($msg);
-  while( my($prefix, $value) = each %hdrs ) {
-    push(@tokens, $self->_tokenize_line ($value, "H$prefix:", 0));
+  if ($t_src->{header}) {
+    dbg("bayes: tokenizing header");
+    my %hdrs = $self->_tokenize_headers ($msg);
+    while( my($prefix, $value) = each %hdrs ) {
+      push(@tokens, $self->_tokenize_line ($value, "H$prefix:", 0));
+    }
   }
 
   # Go ahead and uniq the array, skip null tokens (can happen sometimes)