You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2006/04/03 12:22:03 UTC
svn commit: r390991 - /spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm
Author: jm
Date: Mon Apr 3 03:21:59 2006
New Revision: 390991
URL: http://svn.apache.org/viewcvs?rev=390991&view=rev
Log:
disable (comment out) some long-dead code for 'log_raw_counts'; use some slightly more efficient code in a bayes hot-spot; don't maintain a separate count of tokens where an array-length check will do
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm?rev=390991&r1=390990&r2=390991&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm Mon Apr 3 03:21:59 2006
@@ -229,7 +229,7 @@
my $self = {
'main' => $main,
'conf' => $main->{conf},
- 'log_raw_counts' => 0,
+ ## 'log_raw_counts' => 0, # see compute_prob_for_token()
'use_ignores' => 1,
'tz' => Mail::SpamAssassin::Util::local_tz(),
};
@@ -1063,9 +1063,15 @@
($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn);
}
- if ($self->{log_raw_counts}) {
- $self->{raw_counts} .= " s=$s,n=$n ";
- }
+ # 'log_raw_counts' is used to log the raw data for the Bayes equations during
+ # a mass-check, allowing the S and X constants to be optimized quickly
+ # without requiring re-tokenization of the messages for each attempt. There's
+ # really no need for this code to be uncommented in normal use, however. It
+ # has never been publicly documented, so commenting it out is fine. ;)
+
+ ## if ($self->{log_raw_counts}) {
+ ## $self->{raw_counts} .= " s=$s,n=$n ";
+ ## }
return $prob;
}
@@ -1171,9 +1177,9 @@
my ($ns, $nn) = $self->{store}->nspam_nham_get();
- if ($self->{log_raw_counts}) {
- $self->{raw_counts} = " ns=$ns nn=$nn ";
- }
+ ## if ($self->{log_raw_counts}) { # see compute_prob_for_token()
+ ## $self->{raw_counts} = " ns=$ns nn=$nn ";
+ ## }
dbg("bayes: corpus size: nspam = $ns, nham = $nn");
@@ -1183,17 +1189,19 @@
my $tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});
- my %pw;
+ my %pw = ();
foreach my $tokendata (@{$tokensdata}) {
my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};
my $prob = $self->compute_prob_for_token($token, $ns, $nn, $tok_spam, $tok_ham);
- if (defined($prob)) {
- $pw{$token}->{prob} = $prob;
- $pw{$token}->{spam_count} = $tok_spam;
- $pw{$token}->{ham_count} = $tok_ham;
- $pw{$token}->{atime} = $atime;
- }
+ next unless defined $prob;
+
+ $pw{$token} = {
+ prob => $prob,
+ spam_count => $tok_spam,
+ ham_count => $tok_ham,
+ atime => $atime
+ };
}
# If none of the tokens were found in the DB, we're going to skip
@@ -1219,38 +1227,44 @@
my $count = N_SIGNIFICANT_TOKENS;
my @sorted = ();
- my ($tcount_spammy,$tcount_hammy) = (0,0);
+ my @touch_tokens;
my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = [];
my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = [];
- my @touch_tokens;
+ my %tok_strength = map { $_ => (abs($pw{$_}->{prob} - 0.5)) } keys %pw;
+ my $log_each_token = (would_log('dbg', 'bayes') > 1);
- for (sort {
- abs($pw{$b}->{prob} - 0.5) <=> abs($pw{$a}->{prob} - 0.5)
+ foreach my $tok (sort {
+ $tok_strength{$b} <=> $tok_strength{$a}
} keys %pw)
{
if ($count-- < 0) { last; }
- my $pw = $pw{$_}->{prob};
- next if (abs($pw - 0.5) <
+ next if ($tok_strength{$tok} <
$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH);
+ my $pw = $pw{$tok}->{prob};
+
# What's more expensive, scanning headers for HAMMYTOKENS and
# SPAMMYTOKENS tags that aren't there or collecting data that
# won't be used? Just collecting the data is certainly simpler.
#
- my $raw_token = $msgtokens->{$_} || "(unknown)";
- my $s = $pw{$_}->{spam_count};
- my $n = $pw{$_}->{ham_count};
- my $a = $pw{$_}->{atime};
- push @$tinfo_spammy, [$raw_token,$pw,$s,$n,$a] if $pw >= 0.5 && ++$tcount_spammy;
- push @$tinfo_hammy, [$raw_token,$pw,$s,$n,$a] if $pw < 0.5 && ++$tcount_hammy;
+ my $raw_token = $msgtokens->{$tok} || "(unknown)";
+ my $s = $pw{$tok}->{spam_count};
+ my $n = $pw{$tok}->{ham_count};
+ my $a = $pw{$tok}->{atime};
+
+ if ($pw < 0.5) {
+ push @$tinfo_hammy, [$raw_token,$pw,$s,$n,$a];
+ } else {
+ push @$tinfo_spammy, [$raw_token,$pw,$s,$n,$a];
+ }
push (@sorted, $pw);
# update the atime on this token, it proved useful
- push(@touch_tokens, $_);
+ push(@touch_tokens, $tok);
- if (would_log('dbg', 'bayes') > 1) {
+ if ($log_each_token) {
dbg("bayes: token '$raw_token' => $pw");
}
}
@@ -1277,9 +1291,9 @@
$permsgstatus->{bayes_nspam} = $ns;
$permsgstatus->{bayes_nham} = $nn;
- if ($self->{log_raw_counts}) {
- print "#Bayes-Raw-Counts: $self->{raw_counts}\n";
- }
+ ## if ($self->{log_raw_counts}) { # see compute_prob_for_token()
+ ## print "#Bayes-Raw-Counts: $self->{raw_counts}\n";
+ ## }
$self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens,
probsref => \%pw,
@@ -1315,8 +1329,10 @@
$self->{store}->untie_db();
}
- $permsgstatus->{tag_data}{BAYESTCHAMMY} = $tcount_hammy;
- $permsgstatus->{tag_data}{BAYESTCSPAMMY} = $tcount_spammy;
+ $permsgstatus->{tag_data}{BAYESTCHAMMY} =
+ ($tinfo_hammy ? scalar @{$tinfo_hammy} : 0);
+ $permsgstatus->{tag_data}{BAYESTCSPAMMY} =
+ ($tinfo_spammy ? scalar @{$tinfo_spammy} : 0);
$permsgstatus->{tag_data}{BAYESTCLEARNED} = $tcount_learned;
$permsgstatus->{tag_data}{BAYESTC} = $tcount_total;