You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by mm...@apache.org on 2014/12/29 16:44:13 UTC

svn commit: r1648372 - /spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm

Author: mmartinec
Date: Mon Dec 29 15:44:13 2014
New Revision: 1648372

URL: http://svn.apache.org/r1648372
Log:
Bug 7115, more informative bayes debugging: report number of tokens for each source

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm?rev=1648372&r1=1648371&r2=1648372&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm Mon Dec 29 15:44:13 2014
@@ -1061,32 +1061,33 @@ sub tokenize {
 
   # visible tokens from the body
   if ($msgdata->{bayes_token_body}) {
-    dbg("bayes: tokenizing body");
-    push(@tokens, map($self->_tokenize_line ($_, '', 1),
-                      @{$msgdata->{bayes_token_body}} ));
+    my(@t) = map($self->_tokenize_line ($_, '', 1),
+                 @{$msgdata->{bayes_token_body}} );
+    dbg("bayes: tokenized body: %d tokens", scalar @t);
+    push(@tokens, @t);
   }
   # the URI list
   if ($msgdata->{bayes_token_uris}) {
-    dbg("bayes: tokenizing uri");
-    push(@tokens, map($self->_tokenize_line ($_, '', 2),
-                      @{$msgdata->{bayes_token_uris}} ));
+    my(@t) = map($self->_tokenize_line ($_, '', 2),
+                 @{$msgdata->{bayes_token_uris}} );
+    dbg("bayes: tokenized uri: %d tokens", scalar @t);
+    push(@tokens, @t);
   }
   # add invisible tokens
   if ($msgdata->{bayes_token_inviz}) {
-    dbg("bayes: tokenizing invisible");
-    if (ADD_INVIZ_TOKENS_I_PREFIX) {
-      push(@tokens, map($self->_tokenize_line ($_, "I*:", 1),
-                        @{$msgdata->{bayes_token_inviz}} ));
-    }
-    if (ADD_INVIZ_TOKENS_NO_PREFIX) {
-      push(@tokens, map($self->_tokenize_line ($_, "", 1),
-                        @{$msgdata->{bayes_token_inviz}} ));
+    my $tokprefix;
+    if (ADD_INVIZ_TOKENS_I_PREFIX)  { $tokprefix = 'I*:' }
+    if (ADD_INVIZ_TOKENS_NO_PREFIX) { $tokprefix = '' }
+    if (defined $tokprefix) {
+      my(@t) = map($self->_tokenize_line ($_, $tokprefix, 1),
+                   @{$msgdata->{bayes_token_inviz}} );
+      dbg("bayes: tokenized invisible: %d tokens", scalar @t);
+      push(@tokens, @t);
     }
   }
 
   # add digests and Content-Type of all MIME parts
   if ($msgdata->{bayes_mimepart_digests}) {
-    dbg("bayes: tokenizing mime parts");
     my %shorthand = (  # some frequent MIME part contents for human readability
      'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/plain'=> 'Empty-Plaintext',
      'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/html' => 'Empty-HTML',
@@ -1098,25 +1099,28 @@ sub tokenize {
     );
     my(@t) = map('MIME:' . ($shorthand{$_} || $_),
                  @{ $msgdata->{bayes_mimepart_digests} });
+    dbg("bayes: tokenized mime parts: %d tokens", scalar @t);
     dbg("bayes: mime-part token %s", $_) for @t;
-    push (@tokens, @t);
+    push(@tokens, @t);
   }
 
   # Tokenize the headers
   if ($t_src->{header}) {
-    dbg("bayes: tokenizing header");
+    my(@t);
     my %hdrs = $self->_tokenize_headers ($msg);
     while( my($prefix, $value) = each %hdrs ) {
-      push(@tokens, $self->_tokenize_line ($value, "H$prefix:", 0));
+      push(@t, $self->_tokenize_line ($value, "H$prefix:", 0));
     }
+    dbg("bayes: tokenized header: %d tokens", scalar @t);
+    push(@tokens, @t);
   }
 
   # Go ahead and uniq the array, skip null tokens (can happen sometimes)
   # generate an SHA1 hash and take the lower 40 bits as our token
   my %tokens;
   foreach my $token (@tokens) {
-    next unless length($token); # skip 0 length tokens
-    $tokens{substr(sha1($token), -5)} = $token;
+    # skip empty tokens
+    $tokens{substr(sha1($token), -5)} = $token  if $token ne '';
   }
 
   # return the keys == tokens ...