You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by pa...@apache.org on 2004/10/08 20:59:26 UTC

svn commit: rev 54104 - spamassassin/branches/3.0/lib/Mail/SpamAssassin

Author: parker
Date: Fri Oct  8 11:59:26 2004
New Revision: 54104

Modified:
   spamassassin/branches/3.0/lib/Mail/SpamAssassin/Bayes.pm
   spamassassin/branches/3.0/lib/Mail/SpamAssassin/Plugin.pm
Log:
Bug 3876: flatten tokens hash returned by tokenizer to save memory, updated Plugin docs to show change, and pass in additional hash for bayes_scan hook

Modified: spamassassin/branches/3.0/lib/Mail/SpamAssassin/Bayes.pm
==============================================================================
--- spamassassin/branches/3.0/lib/Mail/SpamAssassin/Bayes.pm	(original)
+++ spamassassin/branches/3.0/lib/Mail/SpamAssassin/Bayes.pm	Fri Oct  8 11:59:26 2004
@@ -358,7 +358,7 @@
   my %tokens;
   foreach my $token (@tokens) {
     next unless length($token); # skip 0 length tokens
-    $tokens{substr(sha1($token), -5)} = { 'raw_token' => $token };
+    $tokens{substr(sha1($token), -5)} = $token;
   }
 
   # return the keys == tokens ...
@@ -1207,11 +1207,10 @@
     my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};
     my $prob = $self->compute_prob_for_token($token, $ns, $nn, $tok_spam, $tok_ham);
     if (defined($prob)) {
-      $pw{$token} = $prob;
-      $msgtokens->{$token}->{pw} = $prob;
-      $msgtokens->{$token}->{spam_count} = $tok_spam;
-      $msgtokens->{$token}->{ham_count} = $tok_ham;
-      $msgtokens->{$token}->{atime} = $atime;
+      $pw{$token}->{prob} = $prob;
+      $pw{$token}->{spam_count} = $tok_spam;
+      $pw{$token}->{ham_count} = $tok_ham;
+      $pw{$token}->{atime} = $atime;
     }
   }
 
@@ -1245,21 +1244,21 @@
   my @touch_tokens;
 
   for (sort {
-              abs($pw{$b} - 0.5) <=> abs($pw{$a} - 0.5)
+              abs($pw{$b}->{prob} - 0.5) <=> abs($pw{$a}->{prob} - 0.5)
             } keys %pw)
   {
     if ($count-- < 0) { last; }
-    my $pw = $pw{$_};
+    my $pw = $pw{$_}->{prob};
     next if (abs($pw - 0.5) < $self->{robinson_min_prob_strength});
 
     # What's more expensive, scanning headers for HAMMYTOKENS and
     # SPAMMYTOKENS tags that aren't there or collecting data that
     # won't be used?  Just collecting the data is certainly simpler.
     #
-    my $raw_token = $msgtokens->{$_}->{raw_token} || "(unknown)";
-    my $s = $msgtokens->{$_}->{spam_count};
-    my $n = $msgtokens->{$_}->{ham_count};
-    my $a = $msgtokens->{$_}->{atime};
+    my $raw_token = $msgtokens->{$_} || "(unknown)";
+    my $s = $pw{$_}->{spam_count};
+    my $n = $pw{$_}->{ham_count};
+    my $a = $pw{$_}->{atime};
     push @$tinfo_spammy, [$raw_token,$pw,$s,$n,$a] if $pw >= 0.5 && ++$tcount_spammy;
     push @$tinfo_hammy,  [$raw_token,$pw,$s,$n,$a] if $pw <  0.5 && ++$tcount_hammy;
 
@@ -1302,6 +1301,7 @@
   }
 
   $self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens,
+					      probsref => \%pw,
 					      score => $score,
 					      msgatime => $msgatime,
 					      significant_tokens => \@touch_tokens,

Modified: spamassassin/branches/3.0/lib/Mail/SpamAssassin/Plugin.pm
==============================================================================
--- spamassassin/branches/3.0/lib/Mail/SpamAssassin/Plugin.pm	(original)
+++ spamassassin/branches/3.0/lib/Mail/SpamAssassin/Plugin.pm	Fri Oct  8 11:59:26 2004
@@ -359,10 +359,24 @@
 
 Reference to hash returned by call to tokenize.  The hash takes the
 format of:
+
 {
-  'SHA1 Hash Value' => { 'raw_token' => 'raw (original) value' }
+
+  'SHA1 Hash Value' => 'raw (original) value'
+
 }
 
+NOTE: This data structure has changed since it was originally introduced
+in version 3.0.0.  The values are no longer perl anonymous hashes; each
+value is now a single string containing the raw token value.  You can test
+for backwards compatibility by checking to see if the value for a key is a
+reference to a perl HASH, for instance:
+
+if (ref($toksref->{$sometokenkey}) eq 'HASH') {...
+
+If it is, then you are using the old interface; otherwise, you are using
+the current interface.
+
 =item isspam
 
 Boolean value stating what flavor of message the tokens represent, if
@@ -414,16 +428,27 @@
 =item toksref
 
 Reference to hash returned by call to tokenize.  See bayes_learn
-documentation for additional information on the format.  If the token
-was found in the database it will contain some additional information:
+documentation for additional information on the format.
+
+=item probsref
+
+Reference to hash of calculated probabilities, spam/ham counts and atime
+for tokens found in the database.  The hash takes the format of:
 
 {
-  'SHA1 Hash Value' => { 'raw_token' => 'raw (original) value',
-                         'pw' => 'calculated probability',
+
+  'SHA1 Hash Value' => {
+
+                         'prob' => 'calculated probability',
+
                          'spam_count' => 'Total number of spam msgs w/ token',
+
                          'ham_count' => 'Total number of ham msgs w/ token',
+
                          'atime' => 'Atime value for token in database'
+
                        }
+
 }
 
 =item score