You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by pa...@apache.org on 2004/10/08 20:59:26 UTC
svn commit: rev 54104 - spamassassin/branches/3.0/lib/Mail/SpamAssassin
Author: parker
Date: Fri Oct 8 11:59:26 2004
New Revision: 54104
Modified:
spamassassin/branches/3.0/lib/Mail/SpamAssassin/Bayes.pm
spamassassin/branches/3.0/lib/Mail/SpamAssassin/Plugin.pm
Log:
Bug 3876: flatten tokens hash returned by tokenizer to save memory, updated Plugin docs to show change, and pass in additional hash for bayes_scan hook
Modified: spamassassin/branches/3.0/lib/Mail/SpamAssassin/Bayes.pm
==============================================================================
--- spamassassin/branches/3.0/lib/Mail/SpamAssassin/Bayes.pm (original)
+++ spamassassin/branches/3.0/lib/Mail/SpamAssassin/Bayes.pm Fri Oct 8 11:59:26 2004
@@ -358,7 +358,7 @@
my %tokens;
foreach my $token (@tokens) {
next unless length($token); # skip 0 length tokens
- $tokens{substr(sha1($token), -5)} = { 'raw_token' => $token };
+ $tokens{substr(sha1($token), -5)} = $token;
}
# return the keys == tokens ...
@@ -1207,11 +1207,10 @@
my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};
my $prob = $self->compute_prob_for_token($token, $ns, $nn, $tok_spam, $tok_ham);
if (defined($prob)) {
- $pw{$token} = $prob;
- $msgtokens->{$token}->{pw} = $prob;
- $msgtokens->{$token}->{spam_count} = $tok_spam;
- $msgtokens->{$token}->{ham_count} = $tok_ham;
- $msgtokens->{$token}->{atime} = $atime;
+ $pw{$token}->{prob} = $prob;
+ $pw{$token}->{spam_count} = $tok_spam;
+ $pw{$token}->{ham_count} = $tok_ham;
+ $pw{$token}->{atime} = $atime;
}
}
@@ -1245,21 +1244,21 @@
my @touch_tokens;
for (sort {
- abs($pw{$b} - 0.5) <=> abs($pw{$a} - 0.5)
+ abs($pw{$b}->{prob} - 0.5) <=> abs($pw{$a}->{prob} - 0.5)
} keys %pw)
{
if ($count-- < 0) { last; }
- my $pw = $pw{$_};
+ my $pw = $pw{$_}->{prob};
next if (abs($pw - 0.5) < $self->{robinson_min_prob_strength});
# What's more expensive, scanning headers for HAMMYTOKENS and
# SPAMMYTOKENS tags that aren't there or collecting data that
# won't be used? Just collecting the data is certainly simpler.
#
- my $raw_token = $msgtokens->{$_}->{raw_token} || "(unknown)";
- my $s = $msgtokens->{$_}->{spam_count};
- my $n = $msgtokens->{$_}->{ham_count};
- my $a = $msgtokens->{$_}->{atime};
+ my $raw_token = $msgtokens->{$_} || "(unknown)";
+ my $s = $pw{$_}->{spam_count};
+ my $n = $pw{$_}->{ham_count};
+ my $a = $pw{$_}->{atime};
push @$tinfo_spammy, [$raw_token,$pw,$s,$n,$a] if $pw >= 0.5 && ++$tcount_spammy;
push @$tinfo_hammy, [$raw_token,$pw,$s,$n,$a] if $pw < 0.5 && ++$tcount_hammy;
@@ -1302,6 +1301,7 @@
}
$self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens,
+ probsref => \%pw,
score => $score,
msgatime => $msgatime,
significant_tokens => \@touch_tokens,
Modified: spamassassin/branches/3.0/lib/Mail/SpamAssassin/Plugin.pm
==============================================================================
--- spamassassin/branches/3.0/lib/Mail/SpamAssassin/Plugin.pm (original)
+++ spamassassin/branches/3.0/lib/Mail/SpamAssassin/Plugin.pm Fri Oct 8 11:59:26 2004
@@ -359,10 +359,24 @@
Reference to hash returned by call to tokenize. The hash takes the
format of:
+
{
- 'SHA1 Hash Value' => { 'raw_token' => 'raw (original) value' }
+
+ 'SHA1 Hash Value' => 'raw (original) value'
+
}
+NOTE: This data structure has changed since it was originally introduced
+in version 3.0.0. The values are no longer perl anonymous hashes; each
+value is a single string containing the raw token value. You can test for
+backwards compatibility by checking to see if the value for a key is a
+reference to a perl HASH, for instance:
+
+if (ref($toksref->{$sometokenkey}) eq 'HASH') {...
+
+If it is, then you are using the old interface; otherwise you are using
+the current interface.
+
=item isspam
Boolean value stating what flavor of message the tokens represent, if
@@ -414,16 +428,27 @@
=item toksref
Reference to hash returned by call to tokenize. See bayes_learn
-documentation for additional information on the format. If the token
-was found in the database it will contain some additional information:
+documentation for additional information on the format.
+
+=item probsref
+
+Reference to hash of calculated probabilities for tokens found in
+the database.
{
- 'SHA1 Hash Value' => { 'raw_token' => 'raw (original) value',
- 'pw' => 'calculated probability',
+
+ 'SHA1 Hash Value' => {
+
+ 'prob' => 'calculated probability',
+
'spam_count' => 'Total number of spam msgs w/ token',
+
'ham_count' => 'Total number of ham msgs w/ token',
+
'atime' => 'Atime value for token in database'
+
}
+
}
=item score