You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2005/04/10 22:44:06 UTC

svn commit: r160803 - in spamassassin/trunk: MANIFEST lib/Mail/SpamAssassin/Bayes.pm lib/Mail/SpamAssassin/Bayes/ lib/Mail/SpamAssassin/Bayes/CombineChi.pm lib/Mail/SpamAssassin/Bayes/CombineNaiveBayes.pm lib/Mail/SpamAssassin/Conf.pm

Author: jm
Date: Sun Apr 10 13:44:04 2005
New Revision: 160803

URL: http://svn.apache.org/viewcvs?view=rev&rev=160803
Log:
bug 3842: inactivate support for naive-Bayes probability combining, by abstracting into a new separate implementation class

Added:
    spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/
    spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/CombineChi.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/CombineNaiveBayes.pm
Modified:
    spamassassin/trunk/MANIFEST
    spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm

Modified: spamassassin/trunk/MANIFEST
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/MANIFEST?view=diff&r1=160802&r2=160803
==============================================================================
--- spamassassin/trunk/MANIFEST (original)
+++ spamassassin/trunk/MANIFEST Sun Apr 10 13:44:04 2005
@@ -28,6 +28,8 @@
 lib/Mail/SpamAssassin/ArchiveIterator.pm
 lib/Mail/SpamAssassin/AutoWhitelist.pm
 lib/Mail/SpamAssassin/Bayes.pm
+lib/Mail/SpamAssassin/Bayes/CombineChi.pm
+lib/Mail/SpamAssassin/Bayes/CombineNaiveBayes.pm
 lib/Mail/SpamAssassin/BayesStore.pm
 lib/Mail/SpamAssassin/BayesStore/DBM.pm
 lib/Mail/SpamAssassin/BayesStore/MySQL.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm?view=diff&r1=160802&r2=160803
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm Sun Apr 10 13:44:04 2005
@@ -56,6 +56,11 @@
 
 use Mail::SpamAssassin;
 use Mail::SpamAssassin::PerMsgStatus;
+
+# pick ONLY ONE of these combining implementations.
+use Mail::SpamAssassin::Bayes::CombineChi;
+# use Mail::SpamAssassin::Bayes::CombineNaiveBayes;
+
 use Digest::SHA1 qw(sha1 sha1_hex);
 
 use vars qw{
@@ -206,23 +211,6 @@
 # into the <0.5 range for nonspam and >0.5 for spam.
 use constant USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS => 1;
 
-# Value for 'x' in the f(w) equation.
-# "Let x = the number used when n [hits] is 0."
-use constant CHI_ROBINSON_X_CONSTANT  => 0.538;
-use constant GARY_ROBINSON_X_CONSTANT => 0.600;
-
-# Value for 's' in the f(w) equation.  "We can see s as the "strength" (hence
-# the use of "s") of an original assumed expectation ... relative to how
-# strongly we want to consider our actual collected data."  Low 's' means
-# trust collected data more strongly.
-use constant CHI_ROBINSON_S_CONSTANT  => 0.100;
-use constant GARY_ROBINSON_S_CONSTANT => 0.160;
-
-# Should we ignore tokens with probs very close to the middle ground (.5)?
-# tokens need to be outside the [ .5-MPS, .5+MPS ] range to be used.
-use constant CHI_ROBINSON_MIN_PROB_STRENGTH  => 0.346;
-use constant GARY_ROBINSON_MIN_PROB_STRENGTH => 0.430;
-
 # How many of the most significant tokens should we use for the p(w)
 # calculation?
 use constant N_SIGNIFICANT_TOKENS => 150;
@@ -270,8 +258,6 @@
   $self;
 }
 
-###########################################################################
-
 sub finish {
   my $self = shift;
   #if (!$self->{conf}->{use_bayes}) { return; }
@@ -282,6 +268,8 @@
   $self->{store}->untie_db();
 }
 
+sub sa_die { return Mail::SpamAssassin::sa_die(@_); }  # thin wrapper: forwards its arguments unchanged
+
 ###########################################################################
 
 sub sanity_check_is_untied {
@@ -306,25 +294,6 @@
   # use of hapaxes.  Set on bayes object, since it controls prob
   # computation.
   $self->{use_hapaxes} = $self->{conf}->{bayes_use_hapaxes};
-
-  # Use chi-squared combining instead of Gary-combining (Robinson/Graham-style
-  # naive-Bayesian)?
-  $self->{use_chi_sq_combining} = $self->{conf}->{bayes_use_chi2_combining};
-
-  # Use the appropriate set of constants; the different systems have different
-  # optimum settings for these.  (TODO: should these be exposed through Conf?)
-  if ($self->{use_chi_sq_combining}) {
-    $self->{robinson_x_constant} = CHI_ROBINSON_X_CONSTANT;
-    $self->{robinson_s_constant} = CHI_ROBINSON_S_CONSTANT;
-    $self->{robinson_min_prob_strength} = CHI_ROBINSON_MIN_PROB_STRENGTH;
-  } else {
-    $self->{robinson_x_constant} = GARY_ROBINSON_X_CONSTANT;
-    $self->{robinson_s_constant} = GARY_ROBINSON_S_CONSTANT;
-    $self->{robinson_min_prob_strength} = GARY_ROBINSON_MIN_PROB_STRENGTH;
-  }
-
-  $self->{robinson_s_times_x} =
-      ($self->{robinson_x_constant} * $self->{robinson_s_constant});
 }
 
 ###########################################################################
@@ -1089,9 +1058,9 @@
     # use Robinson's f(x) equation for low-n tokens, instead of just
     # ignoring them
     my $robn = $s+$n;
-    $prob = ($self->{robinson_s_times_x} + ($robn * $prob))
+    $prob = ($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob))
                              /
-		  ($self->{robinson_s_constant} + $robn);
+            ($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn);
   }
 
   if ($self->{log_raw_counts}) {
@@ -1121,16 +1090,17 @@
   if (!$self->{use_hapaxes}) {return 0 if ($ns + $nn < 2);}
 
   return 0 if $Ns == 0 || $Nn == 0;
-  return 0 if abs( $prob - 0.5 ) < $self->{robinson_min_prob_strength};
+  return 0 if abs( $prob - 0.5 ) <
+                $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
 
   my ($Na,$na,$Nb,$nb) = $prob > 0.5 ? ($Nn,$nn,$Ns,$ns) : ($Ns,$ns,$Nn,$nn);
-  my $p = 0.5 - $self->{robinson_min_prob_strength};
+  my $p = 0.5 - $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
 
   return int( 1.0 - 1e-6 + $nb * $Na * $p / ($Nb * ( 1 - $p )) ) - $na
     unless USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS;
 
-  my $s = $self->{robinson_s_constant};
-  my $sx = $self->{robinson_s_times_x};
+  my $s = $Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT;
+  my $sx = $Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X;
   my $a = $Nb * ( 1 - $p );
   my $b = $Nb * ( $sx + $nb * ( 1 - $p ) - $p * $s ) - $p * $Na * $nb;
   my $c = $Na * $nb * ( $sx - $p * ( $s + $nb ) );
@@ -1261,7 +1231,8 @@
   {
     if ($count-- < 0) { last; }
     my $pw = $pw{$_}->{prob};
-    next if (abs($pw - 0.5) < $self->{robinson_min_prob_strength});
+    next if (abs($pw - 0.5) < 
+                $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH);
 
     # What's more expensive, scanning headers for HAMMYTOKENS and
     # SPAMMYTOKENS tags that aren't there or collecting data that
@@ -1289,11 +1260,7 @@
     goto skip;
   }
 
-  if ($self->{use_chi_sq_combining}) {
-    $score = chi_squared_probs_combine ($ns, $nn, @sorted);
-  } else {
-    $score = robinson_naive_bayes_probs_combine (@sorted);
-  }
+  $score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted);
 
   # Couldn't come up with a probability?
   goto skip unless defined $score;
@@ -1391,92 +1358,6 @@
   }
 
   return;
-}
-
-###########################################################################
-
-sub sa_die { Mail::SpamAssassin::sa_die(@_); }
-
-###########################################################################
-
-sub robinson_naive_bayes_probs_combine {
-  my (@sorted) = @_;
-
-  my $wc = scalar @sorted;
-  return unless $wc;
-
-  my $P = 1;
-  my $Q = 1;
-
-  foreach my $pw (@sorted) {
-    $P *= (1-$pw);
-    $Q *= $pw;
-  }
-  $P = 1 - ($P ** (1 / $wc));
-  $Q = 1 - ($Q ** (1 / $wc));
-  return (1 + ($P - $Q) / ($P + $Q)) / 2.0;
-}
-
-###########################################################################
-
-# Chi-squared function
-sub chi2q {
-  my ($x2, $v) = @_;
-
-  die "bayes: v must be even in chi2q(x2, v)" if $v & 1;
-  my $m = $x2 / 2.0;
-  my ($sum, $term);
-  $sum = $term = exp(0 - $m);
-  for my $i (1 .. (($v/2)-1)) {
-    $term *= $m / $i;
-    $sum += $term;
-  }
-  return $sum < 1.0 ? $sum : 1.0;
-}
-
-# Chi-Squared method. Produces mostly boolean $result,
-# but with a grey area.
-sub chi_squared_probs_combine  {
-  my ($ns, $nn, @sorted) = @_;
-  # @sorted contains an array of the probabilities
-  my $wc = scalar @sorted;
-  return unless $wc;
-
-  my ($H, $S);
-  my ($Hexp, $Sexp);
-  $Hexp = $Sexp = 0;
-
-  # see bug 3118
-  my $totmsgs = ($ns + $nn);
-  if ($totmsgs == 0) { return; }
-  $S = ($ns / $totmsgs);
-  $H = ($nn / $totmsgs);
-
-  use POSIX qw(frexp);
-
-  foreach my $prob (@sorted) {
-    $S *= 1.0 - $prob;
-    $H *= $prob;
-    if ($S < 1e-200) {
-      my $e;
-      ($S, $e) = frexp($S);
-      $Sexp += $e;
-    }
-    if ($H < 1e-200) {
-      my $e;
-      ($H, $e) = frexp($H);
-      $Hexp += $e;
-    }
-  }
-
-  use constant LN2 => log(2);
-
-  $S = log($S) + $Sexp * LN2;
-  $H = log($H) + $Hexp * LN2;
-
-  $S = 1.0 - chi2q(-2.0 * $S, 2 * $wc);
-  $H = 1.0 - chi2q(-2.0 * $H, 2 * $wc);
-  return (($S - $H) + 1.0) / 2.0;
 }
 
 ###########################################################################

Added: spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/CombineChi.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/CombineChi.pm?view=auto&rev=160803
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/CombineChi.pm (added)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/CombineChi.pm Sun Apr 10 13:44:04 2005
@@ -0,0 +1,120 @@
+# Chi-square probability combining and related constants.
+#
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+
+# this package is a no-op; the real impl code is in another pkg.
+package Mail::SpamAssassin::Bayes::CombineChi; 1;
+
+# Force into another package, so our symbols will appear in that namespace with
+# no indirection, for speed.  Other combiners must do the same, since Bayes.pm
+# uses this namespace directly. This means only one combiner can be loaded at
+# any time.
+package Mail::SpamAssassin::Bayes::Combine;
+
+use strict;
+use warnings;
+use bytes;
+
+use POSIX qw(frexp);
+use constant LN2 => log(2);
+
+# Value for 'x' in Gary Robinson's f(w) equation.
+# "Let x = the number used when n [hits] is 0."
+our $FW_X_CONSTANT = 0.538;     # formerly CHI_ROBINSON_X_CONSTANT in Bayes.pm
+
+# Value for 's' in the f(w) equation.  "We can see s as the "strength" (hence
+# the use of "s") of an original assumed expectation ... relative to how
+# strongly we want to consider our actual collected data."  Low 's' means
+# trust collected data more strongly.
+our $FW_S_CONSTANT = 0.100;     # formerly CHI_ROBINSON_S_CONSTANT in Bayes.pm
+
+# s times x for the f(w) equation, precomputed here once; Bayes.pm reads it.
+our $FW_S_DOT_X = ($FW_X_CONSTANT * $FW_S_CONSTANT);
+
+# Tokens with probs very close to the middle ground (.5) are ignored:
+# tokens need to be outside the [ .5-MPS, .5+MPS ] range to be used.
+our $MIN_PROB_STRENGTH = 0.346; # formerly CHI_ROBINSON_MIN_PROB_STRENGTH
+
+###########################################################################
+
+# Chi-Squared method. Produces mostly boolean $result, but with a grey area.
+#
+# $ns and $nn are, judging by their use below, the spam and nonspam message
+# counts; $sortedref is a reference to an array of token probabilities.
+# Returns the combined probability, or nothing if that array is empty or
+# $ns + $nn is zero.
+sub combine {
+  my ($ns, $nn, $sortedref) = @_;
+
+  # @$sortedref contains an array of the probabilities
+  my $wc = scalar @$sortedref;
+  return unless $wc;
+
+  # see bug 3118
+  my $totmsgs = ($ns + $nn);
+  return if $totmsgs == 0;
+  my $S = ($ns / $totmsgs);
+  my $H = ($nn / $totmsgs);
+  my ($Hexp, $Sexp) = (0, 0);
+
+  foreach my $prob (@$sortedref) {
+    $S *= 1.0 - $prob;
+    $H *= $prob;
+    # guard against underflow: once a product gets tiny, frexp() splits off
+    # its binary exponent, which is summed and re-added as $exp * LN2 below
+    if ($S < 1e-200) {
+      my $e;
+      ($S, $e) = frexp($S);
+      $Sexp += $e;
+    }
+    if ($H < 1e-200) {
+      my $e;
+      ($H, $e) = frexp($H);
+      $Hexp += $e;
+    }
+  }
+
+  # note: chi2q() takes half the degrees of freedom, so $wc is passed as-is;
+  # previous versions passed (2 * $wc), which chi2q() then halved again.
+  #
+  # log(0) is fatal in Perl, so a product of exactly 0 (a zero message count,
+  # or a token probability of exactly 0 or 1) must not reach log().  chi2q()
+  # tends to 0 as the log tends to -infinity, so that side becomes 1.0.
+  $S = $S == 0 ? 1.0 : 1.0 - chi2q(-2.0 * (log($S) + $Sexp * LN2), $wc);
+  $H = $H == 0 ? 1.0 : 1.0 - chi2q(-2.0 * (log($H) + $Hexp * LN2), $wc);
+  return (($S - $H) + 1.0) / 2.0;
+}
+
+# Chi-squared function.  Takes half the degrees of freedom ($halfv) rather
+# than v itself; sums exp(-m) * m**k / k! for k from 0 up to (but excluding)
+# $halfv, where m = $x2/2, and caps the result at 1.0.
+sub chi2q {
+  my ($x2, $halfv) = @_;
+
+  my $half_x2 = $x2 / 2.0;
+  my $addend  = exp(0 - $half_x2);
+  my $total   = $addend;
+  # a counting loop rather than a (1 .. N) range, which creates a temp array
+  for (my $k = 1; $k < $halfv; $k++) {
+    $addend *= $half_x2 / $k;
+    $total  += $addend;
+  }
+  return $total if $total < 1.0;
+  return 1.0;
+}
+
+1;

Added: spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/CombineNaiveBayes.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/CombineNaiveBayes.pm?view=auto&rev=160803
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/CombineNaiveBayes.pm (added)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Bayes/CombineNaiveBayes.pm Sun Apr 10 13:44:04 2005
@@ -0,0 +1,73 @@
+# Naive-Bayesian-style probability combining and related constants.
+#
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+
+# this package is a no-op; the real impl code is in another pkg.
+package Mail::SpamAssassin::Bayes::CombineNaiveBayes; 1;
+
+# Force into another package, so our symbols will appear in that namespace with
+# no indirection, for speed.  Other combiners must do the same, since Bayes.pm
+# uses this namespace directly. This means only one combiner can be loaded at
+# any time.
+package Mail::SpamAssassin::Bayes::Combine;
+
+use strict;
+use warnings;
+use bytes;
+
+###########################################################################
+
+# Value for 'x' in Gary Robinson's f(w) equation.
+# "Let x = the number used when n [hits] is 0."
+our $FW_X_CONSTANT = 0.600;     # formerly GARY_ROBINSON_X_CONSTANT in Bayes.pm
+
+# Value for 's' in the f(w) equation.  "We can see s as the "strength" (hence
+# the use of "s") of an original assumed expectation ... relative to how
+# strongly we want to consider our actual collected data."  Low 's' means
+# trust collected data more strongly.
+our $FW_S_CONSTANT = 0.160;     # formerly GARY_ROBINSON_S_CONSTANT in Bayes.pm
+
+# s times x for the f(w) equation, precomputed here once; Bayes.pm reads it.
+our $FW_S_DOT_X = ($FW_X_CONSTANT * $FW_S_CONSTANT);
+
+# Tokens with probs very close to the middle ground (.5) are ignored:
+# tokens need to be outside the [ .5-MPS, .5+MPS ] range to be used.
+our $MIN_PROB_STRENGTH = 0.430; # formerly GARY_ROBINSON_MIN_PROB_STRENGTH
+
+###########################################################################
+
+# Combine probabilities using Gary Robinson's naive-Bayesian-style combiner.
+# $sortedref is a reference to an array of token probabilities; $ns and $nn
+# are accepted but not used here.  Returns nothing for an empty array.
+sub combine {
+  my ($ns, $nn, $sortedref) = @_;
+
+  my $count = @$sortedref;
+  return if !$count;
+
+  my ($inv_product, $product) = (1, 1);
+  for my $token_prob (@$sortedref) {
+    $inv_product *= (1 - $token_prob);
+    $product     *= $token_prob;
+  }
+  # one minus the $count'th root (the geometric mean) of each product
+  my $p = 1 - ($inv_product ** (1 / $count));
+  my $q = 1 - ($product ** (1 / $count));
+  return (1 + ($p - $q) / ($p + $q)) / 2.0;
+}
+
+1;

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm?view=diff&r1=160802&r2=160803
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm Sun Apr 10 13:44:04 2005
@@ -1208,21 +1208,6 @@
     type => $CONF_TYPE_BOOL
   });
 
-=item bayes_use_chi2_combining		(default: 1)
-
-Should the Bayesian classifier use chi-squared combining, instead of
-Robinson/Graham-style naive Bayesian combining?  Chi-squared produces
-more 'extreme' output results, but may be more resistant to changes
-in corpus size etc.
-
-=cut
-
-  push (@cmds, {
-    setting => 'bayes_use_chi2_combining',
-    default => 1,
-    type => $CONF_TYPE_BOOL
-  });
-
 =item bayes_journal_max_size		(default: 102400)
 
 SpamAssassin will opportunistically sync the journal and the database.