You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2004/01/30 03:54:34 UTC

svn commit: rev 6353 - in incubator/spamassassin/trunk: . lib/Mail/SpamAssassin rules sql t tools

Author: jm
Date: Thu Jan 29 18:54:33 2004
New Revision: 6353

Added:
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStoreDBM.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStoreSQL.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/SQLBasedAddrList.pm
   incubator/spamassassin/trunk/sql/README.awl
   incubator/spamassassin/trunk/sql/README.bayes
   incubator/spamassassin/trunk/sql/awl_mysql.sql
   incubator/spamassassin/trunk/sql/bayes_mysql.sql
   incubator/spamassassin/trunk/sql/bayes_pg.sql
   incubator/spamassassin/trunk/sql/bayes_sqlite.sql
   incubator/spamassassin/trunk/t/bayesdbm.t   (contents, props changed)
   incubator/spamassassin/trunk/t/bayessql.t   (contents, props changed)
   incubator/spamassassin/trunk/t/sql_based_whitelist.t   (contents, props changed)
   incubator/spamassassin/trunk/tools/convert_awl_dbm_to_sql
   incubator/spamassassin/trunk/tools/convert_bayes_dbm_to_sql
Modified:
   incubator/spamassassin/trunk/INSTALL
   incubator/spamassassin/trunk/MANIFEST
   incubator/spamassassin/trunk/MANIFEST.SKIP
   incubator/spamassassin/trunk/Makefile.PL
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/AutoWhitelist.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/CmdLearn.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
   incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf
   incubator/spamassassin/trunk/sql/README
   incubator/spamassassin/trunk/t/SATest.pm
Log:
bug 195: SQL support for AWL and Bayes storage, thanks to Michael Parker

Modified: incubator/spamassassin/trunk/INSTALL
==============================================================================
--- incubator/spamassassin/trunk/INSTALL	(original)
+++ incubator/spamassassin/trunk/INSTALL	Thu Jan 29 18:54:33 2004
@@ -400,8 +400,15 @@
   - Time::HiRes    (from CPAN)
 
     If this module is installed, the processing times are logged more
-    exactly by spamd.
+    precisely by spamd.
 
+
+  - DBI *and* DBD driver/modules for your database  (from CPAN)
+
+    If you intend to use SpamAssassin with an SQL database backend for
+    user configuration data, Bayes storage, or AWL storage, you will need
+    to have these installed; both the basic DBI module and the driver for
+    your database.
 
 
 What Next?

Modified: incubator/spamassassin/trunk/MANIFEST
==============================================================================
--- incubator/spamassassin/trunk/MANIFEST	(original)
+++ incubator/spamassassin/trunk/MANIFEST	Thu Jan 29 18:54:33 2004
@@ -27,6 +27,8 @@
 lib/Mail/SpamAssassin/AutoWhitelist.pm
 lib/Mail/SpamAssassin/Bayes.pm
 lib/Mail/SpamAssassin/BayesStore.pm
+lib/Mail/SpamAssassin/BayesStoreDBM.pm
+lib/Mail/SpamAssassin/BayesStoreSQL.pm
 lib/Mail/SpamAssassin/CmdLearn.pm
 lib/Mail/SpamAssassin/Conf.pm
 lib/Mail/SpamAssassin/ConfSourceSQL.pm
@@ -46,6 +48,7 @@
 lib/Mail/SpamAssassin/Received.pm
 lib/Mail/SpamAssassin/Reporter.pm
 lib/Mail/SpamAssassin/SHA1.pm
+lib/Mail/SpamAssassin/SQLBasedAddrList.pm
 lib/Mail/SpamAssassin/TextCat.pm
 lib/Mail/SpamAssassin/UnixLocker.pm
 lib/Mail/SpamAssassin/Util.pm
@@ -226,6 +229,7 @@
 t/spamd_report.t
 t/spamd_report_ifspam.t
 t/spamd_stop.t
+t/sql_based_whitelist.t
 t/spamd_symbols.t
 t/spamd_unix.t
 t/spamd_utf8.t
@@ -238,6 +242,8 @@
 t/zz_cleanup.t
 tools/README.speedtest
 tools/check_whitelist
+tools/convert_awl_dbm_to_sql
+tools/convert_bayes_dbm_to_sql
 tools/mboxsplit
 tools/sa-stats.pl
 tools/speedtest
@@ -246,3 +252,38 @@
 tools/triplets.pl
 lib/Mail/SpamAssassin/Plugin.pm
 lib/Mail/SpamAssassin/PluginHandler.pm
+lib/Mail/SpamAssassin/BayesStoreDBM.pm
+lib/Mail/SpamAssassin/BayesStoreSQL.pm
+lib/Mail/SpamAssassin/SQLBasedAddrList.pm
+sql/README.awl
+sql/README.bayes
+sql/awl_mysql.sql
+sql/bayes_mysql.sql
+sql/bayes_pg.sql
+sql/bayes_sqlite.sql
+t/bayesdbm.t
+t/bayessql.t
+t/sql_based_whitelist.t
+tools/convert_awl_dbm_to_sql
+tools/convert_bayes_dbm_to_sql
+t/data/whitelists/action.eff.org
+t/data/whitelists/mlist_mailman_message
+t/data/whitelists/amazon_co_uk_ship
+t/data/whitelists/amazon_com_ship
+t/data/whitelists/cert.org
+t/data/whitelists/debian_bts_reassign
+t/data/whitelists/linuxplanet
+t/data/whitelists/lp.org
+t/data/whitelists/media_unspun
+t/data/whitelists/mlist_yahoo_groups_message
+t/data/whitelists/mypoints
+t/data/whitelists/neat_net_tricks
+t/data/whitelists/netcenter-direct_de
+t/data/whitelists/oracle_net_techblast
+t/data/whitelists/orbitz.com
+t/data/whitelists/paypal.com
+t/data/whitelists/register.com_password
+t/data/whitelists/ryanairmail.com
+t/data/whitelists/sf.net
+t/data/whitelists/winxpnews.com
+t/data/whitelists/yahoo-inc.com

Modified: incubator/spamassassin/trunk/MANIFEST.SKIP
==============================================================================
--- incubator/spamassassin/trunk/MANIFEST.SKIP	(original)
+++ incubator/spamassassin/trunk/MANIFEST.SKIP	Thu Jan 29 18:54:33 2004
@@ -112,3 +112,5 @@
 build/2.60_change_summary
 build/replace_license_blocks
 sa-learn
+t/bayessql.cf
+t/sql_based_whitelist.cf

Modified: incubator/spamassassin/trunk/Makefile.PL
==============================================================================
--- incubator/spamassassin/trunk/Makefile.PL	(original)
+++ incubator/spamassassin/trunk/Makefile.PL	Thu Jan 29 18:54:33 2004
@@ -214,8 +214,9 @@
 
         'doc', 'pod2htm*',
 
-        't/do_net', 't/log',
+        't/bayessql.cf', 't/do_net', 't/log', 't/sql_based_whitelist.cf',
       )
+
     },
 
     'AUTHOR'   => 'Justin Mason <jm...@jmason.org>',
@@ -339,6 +340,54 @@
 }
 $makefile{'macro'}{'RUN_NET_TESTS'} = yesno($opt{'run_net_tests'});
 
+$opt{'run_awl_sql_tests'} = prompt('Run SQL Based AutoWhitelist Tests (additional information required) (y/n)', "n");
+print "\n";
+
+$opt{'run_awl_sql_tests'} = bool($opt{'run_awl_sql_tests'});
+if ($opt{'run_awl_sql_tests'}) {
+  my $user_awl_dsn = prompt("SQL AWL DSN (user_awl_dsn): ", "dbi:mysql:spamassassin:localhost");
+  my $user_awl_sql_username = prompt("SQL AWL DB username (user_awl_sql_username): ", "");
+  my $user_awl_sql_password = prompt("SQL AWL DB password (user_awl_sql_password): ", "");
+  my $user_awl_sql_table = prompt("SQL AWL tablename (user_awl_sql_table): ", "awl") || 'awl';
+  print "\n";
+
+  open(FILE, ">t/sql_based_whitelist.cf");
+  print FILE "user_awl_dsn $user_awl_dsn\n";
+  # These two can be blank and the conf parser doesn't really like
+  # blank variables, so do not print them if blank
+  print FILE "user_awl_sql_username $user_awl_sql_username\n" if ($user_awl_sql_username);
+  print FILE "user_awl_sql_password $user_awl_sql_password\n" if ($user_awl_sql_password);
+  print FILE "user_awl_sql_table $user_awl_sql_table\n";
+
+  close(FILE);
+}
+else {
+  unlink("t/sql_based_whitelist.cf");
+}
+
+$opt{'run_bayes_sql_tests'} = prompt("Run Bayes SQL storage tests (additional information required)? (y/n)", 'n');
+print "\n";
+
+$opt{'run_bayes_sql_tests'} = bool($opt{'run_bayes_sql_tests'});
+if ($opt{'run_bayes_sql_tests'}) {
+  my $bayes_sql_dsn = prompt("Bayes SQL DSN (bayes_sql_dsn): ", "dbi:mysql:spamassassin:localhost");
+  my $bayes_sql_username = prompt("Bayes SQL DB username (bayes_sql_username): ", "");
+  my $bayes_sql_password = prompt("Bayes SQL DB password (bayes_sql_password): ", "");
+  print "\n";
+  
+  open(FILE, ">t/bayessql.cf");
+  print FILE "bayes_sql_dsn $bayes_sql_dsn\n";
+  # These two can be blank and the conf parser doesn't really like
+  # blank variables, so do not print them if blank
+  print FILE "bayes_sql_username $bayes_sql_username\n" if ($bayes_sql_username);
+  print FILE "bayes_sql_password $bayes_sql_password\n" if ($bayes_sql_password);
+  close(FILE);
+}
+else {
+  unlink("t/bayessql.cf");
+}
+
+#######################################################################
 
 # Now dump the Makefile
 WriteMakefile(%makefile);

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/AutoWhitelist.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/AutoWhitelist.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/AutoWhitelist.pm	Thu Jan 29 18:54:33 2004
@@ -108,8 +108,11 @@
       if (defined $noipent->{count} && $noipent->{count} > 0) {
 	dbg ("AWL: found entry w/o IP address for $addr: replacing with $origip");
 	$self->{checker}->remove_entry($noipent);
-	$self->{entry} = $noipent;
-	$self->{entry}->{addr} = $fulladdr;
+        # Now assign proper entry the count and totscore values of the no ip entry
+        # instead of assigning the whole value to avoid wiping out any information added
+        # to the previous entry.
+	$self->{entry}->{count} = $noipent->{count};
+	$self->{entry}->{totscore} = $noipent->{totscore};
       }
     }
   }

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm	Thu Jan 29 18:54:33 2004
@@ -47,7 +47,6 @@
 use bytes;
 
 use Mail::SpamAssassin;
-use Mail::SpamAssassin::BayesStore;
 use Mail::SpamAssassin::PerMsgStatus;
 use Mail::SpamAssassin::SHA1 qw(sha1);
 
@@ -220,6 +219,7 @@
 sub new {
   my $class = shift;
   $class = ref($class) || $class;
+
   my ($main) = @_;
   my $self = {
     'main'              => $main,
@@ -235,7 +235,21 @@
   };
   bless ($self, $class);
 
-  $self->{store} = new Mail::SpamAssassin::BayesStore ($self);
+  if ($self->{conf}->{bayes_store_module}) {
+    my $module = $self->{conf}->{bayes_store_module};
+    my $store;
+
+    eval '
+      require '.$module.';
+      $store = '.$module.'->new($self);
+    ';
+    if ($@) { die $@; }
+    $self->{store} = $store;
+  }
+  else {
+    require Mail::SpamAssassin::BayesStoreDBM;
+    $self->{store} = Mail::SpamAssassin::BayesStoreDBM->new($self);
+  }
 
   $self;
 }
@@ -326,9 +340,6 @@
 
   my $in_headers = ($tokprefix ne '');
 
-  my($bv) = ($self->{store}->get_magic_tokens())[6];
-  my $magic_re = $self->{store}->get_magic_re($bv);
-
   # include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings,
   # and ISO-8859-15 alphas.  Do not split on @'s; better results keeping it.
   # Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!"
@@ -354,7 +365,7 @@
     $token =~ s/^[-'"\.,]+//;        # trim non-alphanum chars at start or end
     $token =~ s/[-'"\.,]+$//;        # so we don't get loads of '"foo' tokens
 
-    next if ( $token =~ /$magic_re/ ); # skip false magic tokens
+    next if ( $self->{store}->is_magic_token($token) ); # skip false magic tokens
 
     # *do* keep 3-byte tokens; there's some solid signs in there
     my $len = length($token);
@@ -714,9 +725,9 @@
   }
 
   $self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h'));
-  $self->{store}->add_touches_to_journal();
-
+  $self->{store}->cleanup();
   dbg("bayes: Learned '$msgid'");
+
   1;
 }
 
@@ -800,7 +811,7 @@
   }
 
   $self->{store}->seen_delete ($msgid);
-  $self->{store}->add_touches_to_journal();
+  $self->{store}->cleanup();
   1;
 }
 
@@ -872,8 +883,8 @@
   my ($self, $sync, $expire, $opts) = @_;
   if (!$self->{conf}->{use_bayes}) { return 0; }
 
-  dbg("Syncing Bayes journal and expiring old tokens...");
-  $self->{store}->sync_journal($opts) if ( $sync );
+  dbg("Syncing Bayes and expiring old tokens...");
+  $self->{store}->sync($opts) if ( $sync );
   $self->{store}->expire_old_tokens($opts) if ( $expire );
   dbg("Syncing complete.");
 
@@ -884,9 +895,13 @@
 
 # compute the probability that that token is spammish
 sub compute_prob_for_token {
-  my ($self, $token, $ns, $nn) = @_;
+  my ($self, $token, $ns, $nn, $s, $n, $atime) = @_;
 
-  my ($s, $n, $atime) = $self->{store}->tok_get ($token);
+  # we allow the caller to give us the token information, just
+  # to save a potentially expensive lookup
+  if (!defined($s) || !defined($n) || !defined($atime)) {
+    ($s, $n, $atime) = $self->{store}->tok_get ($token);
+  }
   return if ($s == 0 && $n == 0);
 
   if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
@@ -1094,7 +1109,7 @@
     print "#Bayes-Raw-Counts: $self->{raw_counts}\n";
   }
 
-  $self->{store}->add_touches_to_journal();
+  $self->{store}->cleanup();
 
   $self->opportunistic_calls();
   $self->{store}->untie_db();
@@ -1109,17 +1124,17 @@
 sub opportunistic_calls {
   my($self) = @_;
 
-  # Is an expire or journal sync running?
+  # Is an expire or sync running?
   my $running_expire = $self->{store}->get_running_expire_tok();
   if ( defined $running_expire && $running_expire+$OPPORTUNISTIC_LOCK_VALID > time() ) { return; }
 
-  # handle expiry and journal syncing
+  # handle expiry and syncing
   if ($self->{store}->expiry_due()) {
     $self->{store}->set_running_expire_tok();
     $self->sync(1,1);
     # don't need to unlock since the expire will have done that. ;)
   }
-  elsif ( $self->{store}->journal_sync_due() ) {
+  elsif ( $self->{store}->sync_due() ) {
     $self->{store}->set_running_expire_tok();
     $self->sync(1,0);
     $self->{store}->remove_running_expire_tok();
@@ -1219,8 +1234,11 @@
 
   return 0 unless $self->{conf}->{use_bayes};
   return 0 unless $self->{store}->tie_db_readonly();
+  
+  my @vars = $self->{store}->get_storage_variables();
+
+  my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars;
 
-  my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = $self->{store}->get_magic_tokens();
   $sb = $self->{store}->scan_count_get() if ( $bv < 1 ); # we want current scan count, not scan base count
 
   my $template = '%3.3f %10d %10d %10d  %s'."\n";
@@ -1242,18 +1260,8 @@
   }
 
   if ( $toks ) {
-    my $magic_re = $self->{store}->get_magic_re($bv);
-
-    foreach my $tok (keys %{$self->{store}->{db_toks}}) {
-      next if ($tok =~ /$magic_re/); # skip magic tokens
-      next if (defined $regex && ($tok !~ /$regex/o));
-
-      my $prob = $self->compute_prob_for_token($tok, $ns, $nh);
-      $prob ||= 0.5;
-
-      my ($ts, $th, $atime) = $self->{store}->tok_get ($tok);
-      printf $template,$prob,$ts,$th,$atime,$tok;
-    }
+    # let the store sort out the db_toks
+    $self->{store}->dump_db_toks($template, $regex, @vars);
   }
 
   if (!$self->{main}->{learn_caller_will_untie}) {

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore.pm	Thu Jan 29 18:54:33 2004
@@ -14,103 +14,77 @@
 # limitations under the License.
 # </...@LICENSE>
 
+=head1 NAME
+
+Mail::SpamAssassin::BayesStore - Bayesian Storage Module
+
+=head1 SYNOPSIS
+
+=head1 DESCRIPTION
+
+This is the public API for the Bayesian store methods.  Any implementation of
+the storage module must implement these methods.
+
+=cut
+
 package Mail::SpamAssassin::BayesStore;
 
 use strict;
 use bytes;
-use Fcntl;
 
-use Mail::SpamAssassin;
-use Mail::SpamAssassin::Util;
-use File::Basename;
-use File::Spec;
-use File::Path;
-
-use constant HAS_DB_FILE => eval { require DB_File; };
-
-use vars qw{
-  @ISA
-  @DBNAMES @DB_EXTENSIONS
-  $NSPAM_MAGIC_TOKEN $NHAM_MAGIC_TOKEN $LAST_EXPIRE_MAGIC_TOKEN $LAST_JOURNAL_SYNC_MAGIC_TOKEN
-  $NTOKENS_MAGIC_TOKEN $OLDEST_TOKEN_AGE_MAGIC_TOKEN $LAST_EXPIRE_REDUCE_MAGIC_TOKEN
-  $RUNNING_EXPIRE_MAGIC_TOKEN $DB_VERSION_MAGIC_TOKEN $LAST_ATIME_DELTA_MAGIC_TOKEN
-  $NEWEST_TOKEN_AGE_MAGIC_TOKEN
-};
-
-@ISA = qw();
-
-# db layout (quoting Matt):
-#
-# > need five db files though to make it real fast:
-# [probs] 1. ngood and nbad (two entries, so could be a flat file rather 
-# than a db file).	(now 2 entries in db_toks)
-# [toks]  2. good token -> number seen
-# [toks]  3. bad token -> number seen (both are packed into 1 entry in 1 db)
-# [probs]  4. Consolidated good token -> probability
-# [probs]  5. Consolidated bad token -> probability
-# > As you add new mails, you update the entry in 2 or 3, then regenerate
-# > the entry for that token in 4 or 5.
-# > Then as you test a new mail, you just need to pull the probability
-# > direct from 4 and 5, and generate the overall probability. A simple and
-# > very fast operation. 
-#
-# jm: we use probs as overall probability. <0.5 = ham, >0.5 = spam
-#
-# update: probs is no longer maintained as a db, to keep on-disk and in-core
-# usage down.
-#
-# also, added a new one to support forgetting, auto-learning, and
-# auto-forgetting for refiled mails:
-# [seen]  6. a list of Message-IDs of messages already learnt from. values
-# are 's' for learnt-as-spam, 'h' for learnt-as-ham.
-#
-# and another, called [scancount] to model the scan-count for expiry.
-# This is not a database.  Instead it increases by one byte for each
-# message scanned (note: scanned, not learned).
-
-@DBNAMES = qw(toks seen);
-
-# Possible file extensions used by the kinds of database files DB_File
-# might create.  We need these so we can create a new file and rename
-# it into place.
-@DB_EXTENSIONS = ('', '.db');
-
-# These are the magic tokens we use to track stuff in the DB.
-# The format is '^M^A^G^I^C' followed by any string you want.
-# None of the control chars will be in a real token.
-$DB_VERSION_MAGIC_TOKEN		= "\015\001\007\011\003DBVERSION";
-$LAST_ATIME_DELTA_MAGIC_TOKEN	= "\015\001\007\011\003LASTATIMEDELTA";
-$LAST_EXPIRE_MAGIC_TOKEN	= "\015\001\007\011\003LASTEXPIRE";
-$LAST_EXPIRE_REDUCE_MAGIC_TOKEN	= "\015\001\007\011\003LASTEXPIREREDUCE";
-$LAST_JOURNAL_SYNC_MAGIC_TOKEN	= "\015\001\007\011\003LASTJOURNALSYNC";
-$NEWEST_TOKEN_AGE_MAGIC_TOKEN	= "\015\001\007\011\003NEWESTAGE";
-$NHAM_MAGIC_TOKEN		= "\015\001\007\011\003NHAM";
-$NSPAM_MAGIC_TOKEN		= "\015\001\007\011\003NSPAM";
-$NTOKENS_MAGIC_TOKEN		= "\015\001\007\011\003NTOKENS";
-$OLDEST_TOKEN_AGE_MAGIC_TOKEN	= "\015\001\007\011\003OLDESTAGE";
-$RUNNING_EXPIRE_MAGIC_TOKEN	= "\015\001\007\011\003RUNNINGEXPIRE";
+=head1 METHODS
+
+=head2 new
+
+public class (Mail::SpamAssassin::BayesStore) new (Mail::SpamAssassin::Bayes $bayes)
 
-use constant DB_VERSION => 2;	# what version of DB do we use?
+Description:
+This method creates a new instance of the Mail::SpamAssassin::BayesStore
+object.  You must pass in an instance of the Mail::SpamAssassin:Bayes object,
+which is stashed for use throughout the module.
 
-###########################################################################
+=cut
 
 sub new {
-  my $class = shift;
+  my ($class, $bayes) = @_;
+
   $class = ref($class) || $class;
-  my ($bayes) = @_;
+
   my $self = {
-    'bayes'             => $bayes,
-    'already_tied'	=> 0,
-    'is_locked'		=> 0,
-    'string_to_journal' => '',
-    'db_version'	=> undef,
-  };
+	      'bayes'                => $bayes,
+	      'supported_db_version' => 0,
+	      'db_version'	     => undef,
+	     };
+
   bless ($self, $class);
 
   $self;
 }
 
-###########################################################################
+=head2 DB_VERSION
+
+public instance (Integer) DB_VERSION ()
+
+Description:
+This method returns the currently supported database version for the
+implementation.
+
+=cut
+
+sub DB_VERSION {
+  my ($self) = @_;
+  return $self->{supported_db_version};
+}
+
+=head2 read_db_configs
+
+public instance () read_db_configs ()
+
+Description:
+This method reads any needed config variables from the configuration
+object and then calls the Mail::SpamAssassin::Bayes read_db_configs method.
+
+=cut
 
 sub read_db_configs {
   my ($self) = @_;
@@ -133,342 +107,95 @@
   $self->{bayes}->read_db_configs();
 }
 
-###########################################################################
-
-sub tie_db_readonly {
-  my ($self) = @_;
+=head2 tie_db_readonly
 
-  if (!HAS_DB_FILE) {
-    dbg ("bayes: DB_File module not installed, cannot use Bayes");
-    return 0;
-  }
+public instance (Boolean) tie_db_readonly ()
 
-  # return if we've already tied to the db's, using the same mode
-  # (locked/unlocked) as before.
-  return 1 if ($self->{already_tied} && $self->{is_locked} == 0);
-
-  my $main = $self->{bayes}->{main};
-  if (!defined($main->{conf}->{bayes_path})) {
-    dbg ("bayes_path not defined");
-    return 0;
-  }
+Description:
+This method opens up the database in readonly mode.
 
-  $self->read_db_configs();
+=cut
 
-  my $path = $main->sed_path ($main->{conf}->{bayes_path});
-
-  my $found=0;
-  for my $ext (@DB_EXTENSIONS) { if (-f $path.'_toks'.$ext) { $found=1; last; } }
-
-  if (!$found) {
-    dbg ("bayes: no dbs present, cannot scan: ${path}_toks");
-    return 0;
-  }
-
-  foreach my $dbname (@DBNAMES) {
-    my $name = $path.'_'.$dbname;
-    my $db_var = 'db_'.$dbname;
-    dbg("bayes: $$ tie-ing to DB file R/O $name");
-    # untie %{$self->{$db_var}} if (tied %{$self->{$db_var}});
-    tie %{$self->{$db_var}},"DB_File",$name, O_RDONLY,
-		 (oct ($main->{conf}->{bayes_file_mode}) & 0666)
-       or goto failed_to_tie;
-  }
+sub tie_db_readonly {
+  my ($self) = @_;
+  die "tie_db_readonly: not implemented\n";
+}
 
-  $self->{db_version} = ($self->get_magic_tokens())[6];
-  dbg("bayes: found bayes db version ".$self->{db_version});
+=head2 tie_db_writable
 
-  # If the DB version is one we don't understand, abort!
-  if ( $self->check_db_version() ) {
-    dbg("bayes: bayes db version ".$self->{db_version}." is newer than we understand, aborting!");
-    $self->untie_db();
-    return 0;
-  }
+public instance (Boolean) tie_db_writable ()
 
-  if ( $self->{db_version} < 2 ) { # older versions use scancount
-    $self->{scan_count_little_file} = $path.'_msgcount';
-  }
+Description:
+This method opens up the database in writable mode.
 
-  $self->{already_tied} = 1;
-  return 1;
+Any callers of this methods should ensure that they call untie_db()
+afterwards.
 
-failed_to_tie:
-  warn "Cannot open bayes databases ${path}_* R/O: tie failed: $!\n";
-  return 0;
-}
+=cut
 
-# tie() to the databases, read-write and locked.  Any callers of
-# this should ensure they call untie_db() afterwards!
-#
 sub tie_db_writable {
   my ($self) = @_;
-
-  if (!HAS_DB_FILE) {
-    dbg ("bayes: DB_File module not installed, cannot use Bayes");
-    return 0;
-  }
-
-  # return if we've already tied to the db's, using the same mode
-  # (locked/unlocked) as before.
-  return 1 if ($self->{already_tied} && $self->{is_locked} == 1);
-
-  my $main = $self->{bayes}->{main};
-  if (!defined($main->{conf}->{bayes_path})) {
-    dbg ("bayes_path not defined");
-    return 0;
-  }
-
-  $self->read_db_configs();
-
-  my $path = $main->sed_path ($main->{conf}->{bayes_path});
-
-  my $found=0;
-  for my $ext (@DB_EXTENSIONS) { if (-f $path.'_toks'.$ext) { $found=1; last; } }
-
-  my $parentdir = dirname ($path);
-  if (!-d $parentdir) {
-    # run in an eval(); if mkpath has no perms, it calls die()
-    eval {
-      mkpath ($parentdir, 0, (oct ($main->{conf}->{bayes_file_mode}) & 0777));
-    };
-  }
-
-  my $tout;
-  if ($main->{learn_wait_for_lock}) {
-    $tout = 300;       # TODO: Dan to write better lock code
-  } else {
-    $tout = 10;
-  }
-  if ($main->{locker}->safe_lock ($path, $tout)) {
-    $self->{locked_file} = $path;
-    $self->{is_locked} = 1;
-  } else {
-    warn "Cannot open bayes databases ${path}_* R/W: lock failed: $!\n";
-    return 0;
-  }
-
-  my $umask = umask 0;
-  foreach my $dbname (@DBNAMES) {
-    my $name = $path.'_'.$dbname;
-    my $db_var = 'db_'.$dbname;
-    dbg("bayes: $$ tie-ing to DB file R/W $name");
-    tie %{$self->{$db_var}},"DB_File",$name, O_RDWR|O_CREAT,
-		 (oct ($main->{conf}->{bayes_file_mode}) & 0666)
-       or goto failed_to_tie;
-  }
-  umask $umask;
-
-  # set our cache to what version DB we're using
-  $self->{db_version} = ($self->get_magic_tokens())[6];
-  dbg("bayes: found bayes db version ".$self->{db_version});
-
-  # figure out if we can read the current DB and if we need to do a
-  # DB version update and do it if necessary if either has a problem,
-  # fail immediately
-  #
-  if ( $found && $self->upgrade_db() ) {
-    $self->untie_db();
-    return 0;
-  }
-  elsif ( !$found ) { # new DB, make sure we know that ...
-    $self->{db_version} = $self->{db_toks}->{$DB_VERSION_MAGIC_TOKEN} = DB_VERSION;
-    $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN} = 0; # no tokens in the db ...
-    dbg("bayes: new db, set db version ".$self->{db_version}." and 0 tokens");
-  }
-
-  $self->{already_tied} = 1;
-  return 1;
-
-failed_to_tie:
-  my $err = $!;
-  umask $umask;
-  if ($self->{is_locked}) {
-    $self->{bayes}->{main}->{locker}->safe_unlock ($self->{locked_file});
-    $self->{is_locked} = 0;
-  }
-  warn "Cannot open bayes databases ${path}_* R/W: tie failed: $err\n";
-  return 0;
+  die "tie_db_writable: not implemented\n";
 }
 
-# Do we understand how to deal with this DB version?
-sub check_db_version {
-  my ($self) = @_;
-  my $db_ver = ($self->get_magic_tokens())[6];
+=head2 untie_db
 
-  if ( $db_ver > DB_VERSION ) { # current DB is newer, ignore the DB!
-    warn "bayes: Found DB Version $db_ver, but can only handle up to version ".DB_VERSION."\n";
-    return 1;
-  }
+public instance () untie_db ()
 
-  return 0;
-}
+Description:
+This method unties the database.
 
-# Check to see if we need to upgrade the DB, and do so if necessary
-sub upgrade_db {
-  my ($self) = @_;
-
-  return 0 if ( $self->{db_version} == DB_VERSION );
-  if ( $self->check_db_version() ) {
-    dbg("bayes: bayes db version ".$self->{db_version}." is newer than we understand, aborting!");
-    return 1;
-  }
-
-  # If the current DB version is lower than the new version, upgrade!
-  # Do conversions in order so we can go 1 -> 3, make sure to update $self->{db_version}
-
-  dbg("bayes: detected bayes db format ".$self->{db_version}.", upgrading");
+=cut
 
-  # since DB_File will not shrink a database (!!), we need to *create*
-  # a new one instead.
-  my $main = $self->{bayes}->{main};
-  my $path = $main->sed_path ($main->{conf}->{bayes_path});
-  my $name = $path.'_toks';
-
-  # older version's journal files are likely not in the same format as the new ones, so remove it.
-  my $jpath = $self->get_journal_filename();
-  if ( -f $jpath ) {
-    dbg("bayes: old journal file found, removing.");
-    warn "Couldn't remove $jpath: $!" if ( !unlink $jpath );
-  }
-
-  if ( $self->{db_version} < 2 ) {
-    dbg ("bayes: upgrading database format from v".$self->{db_version}." to v2");
-
-    my($DB_NSPAM_MAGIC_TOKEN, $DB_NHAM_MAGIC_TOKEN, $DB_NTOKENS_MAGIC_TOKEN);
-    my($DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN, $DB_LAST_EXPIRE_MAGIC_TOKEN);
+sub untie_db {
+  my $self = shift;
+  die "untie_db: not implemented\n";
+}
 
-    # Magic tokens for version 0, defined as '**[A-Z]+'
-    if ( $self->{db_version} == 0 ) {
-      $DB_NSPAM_MAGIC_TOKEN			= '**NSPAM';
-      $DB_NHAM_MAGIC_TOKEN			= '**NHAM';
-      $DB_NTOKENS_MAGIC_TOKEN			= '**NTOKENS';
-      #$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN		= '**OLDESTAGE';
-      #$DB_LAST_EXPIRE_MAGIC_TOKEN		= '**LASTEXPIRE';
-      #$DB_SCANCOUNT_BASE_MAGIC_TOKEN		= '**SCANBASE';
-      #$DB_RUNNING_EXPIRE_MAGIC_TOKEN		= '**RUNNINGEXPIRE';
-    }
-    else {
-      $DB_NSPAM_MAGIC_TOKEN			= "\015\001\007\011\003NSPAM";
-      $DB_NHAM_MAGIC_TOKEN			= "\015\001\007\011\003NHAM";
-      $DB_NTOKENS_MAGIC_TOKEN			= "\015\001\007\011\003NTOKENS";
-      #$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN		= "\015\001\007\011\003OLDESTAGE";
-      #$DB_LAST_EXPIRE_MAGIC_TOKEN		= "\015\001\007\011\003LASTEXPIRE";
-      #$DB_SCANCOUNT_BASE_MAGIC_TOKEN		= "\015\001\007\011\003SCANBASE";
-      #$DB_RUNNING_EXPIRE_MAGIC_TOKEN		= "\015\001\007\011\003RUNNINGEXPIRE";
-    }
+=head2 calculate_expire_delta
 
-    # remember when we started ...
-    my $started = time;
-    my $newatime = $started;
-
-    # use O_EXCL to avoid races (bonus paranoia, since we should be locked
-    # anyway)
-    my %new_toks;
-    my $umask = umask 0;
-    tie %new_toks, "DB_File", "${name}.new", O_RDWR|O_CREAT|O_EXCL,
-          (oct ($main->{conf}->{bayes_file_mode}) & 0666) or return 1;
-    umask $umask;
-
-    # add the magic tokens to the new db.
-    $new_toks{$NSPAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NSPAM_MAGIC_TOKEN};
-    $new_toks{$NHAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NHAM_MAGIC_TOKEN};
-    $new_toks{$NTOKENS_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NTOKENS_MAGIC_TOKEN};
-    $new_toks{$DB_VERSION_MAGIC_TOKEN} = 2; # we're now a DB version 2 file
-    $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $newatime;
-    $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = $newatime;
-    $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $newatime;
-    $new_toks{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $newatime;
-    $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = 0;
-    $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = 0;
-
-    my $magic_re = $self->get_magic_re($self->{db_version});
-
-    # deal with the data tokens
-    my ($tok, $packed);
-    while (($tok, $packed) = each %{$self->{db_toks}}) {
-      next if ($tok =~ /$magic_re/); # skip magic tokens
+public instance (\%) calculate_expire_delta (Integer $newest_atime,
+                                             Integer $start,
+                                             Integer $max_expire_mult)
 
-      my ($ts, $th, $atime) = $self->tok_unpack ($packed);
-      $new_toks{$tok} = $self->tok_pack ($ts, $th, $newatime);
-    }
+Description:
+This method performs a calculation on the data to determine the optimum
+atime for token expiration.
 
+=cut
 
-    # now untie so we can do renames
-    untie %{$self->{db_toks}};
-    untie %new_toks;
-
-    # This is the critical phase (moving files around), so don't allow
-    # it to be interrupted.
-    local $SIG{'INT'} = 'IGNORE';
-    local $SIG{'HUP'} = 'IGNORE';
-    local $SIG{'TERM'} = 'IGNORE';
-
-    # older versions used scancount, so kill the stupid little file ...
-    my $msgc = $path.'_msgcount';
-    if ( -f $msgc ) {
-      dbg("bayes: old msgcount file found, removing.");
-      if ( !unlink $msgc ) {
-        warn "Couldn't remove $msgc: $!";
-      }
-    }
+sub calculate_expire_delta {
+  my ($self, $newest_atime, $start, $max_expire_mult) = @_;
+  die "calculate_expire_delta: not implemented\n";
+}
 
-    # now rename in the new one.  Try several extensions
-    for my $ext (@DB_EXTENSIONS) {
-      my $newf = $name.'.new'.$ext;
-      my $oldf = $name.$ext;
-      next unless (-f $newf);
-      if (!rename ($newf, $oldf)) {
-        warn "rename $newf to $oldf failed: $!\n";
-        return 1;
-      }
-    }
+=head2 token_expiration
 
-    # re-tie to the new db in read-write mode ...
-    tie %{$self->{db_toks}},"DB_File", $name, O_RDWR|O_CREAT,
-	 (oct ($main->{conf}->{bayes_file_mode}) & 0666) or return 1;
+public instance (Integer, Integer,
+                 Integer, Integer) token_expiration(\% $opts,
+                                                    Integer $newest_atime,
+                                                    Integer $newdelta)
 
-    dbg ("bayes: upgraded database format from v".$self->{db_version}." to v2 in ".(time - $started)." seconds");
-    $self->{db_version} = 2; # need this for other functions which check
-  }
+Description:
+This method performs the database specific expiration of tokens based on
+the passed in C<$newest_atime> and C<$newdelta>.
 
-  # if ( $self->{db_version} == 2 ) {
-  #   ...
-  #   $self->{db_version} = 3; # need this for other functions which check
-  # }
-  # ... and so on.
+=cut
 
-  return 0;
+sub token_expiration {
+  my ($self, $opts, $newest_atime, $newdelta) = @_;
+  die "token_expiration: not implemented\n";
 }
 
-###########################################################################
-
-sub untie_db {
-  my $self = shift;
-  dbg("bayes: $$ untie-ing");
-
-  foreach my $dbname (@DBNAMES) {
-    my $db_var = 'db_'.$dbname;
+=head2 expire_old_tokens
 
-    if (exists $self->{$db_var}) {
-      dbg ("bayes: $$ untie-ing $db_var");
-      untie %{$self->{$db_var}};
-      delete $self->{$db_var};
-    }
-  }
+public instance (Boolean) expire_old_tokens (\% hashref)
 
-  if ($self->{is_locked}) {
-    dbg ("bayes: files locked, now unlocking lock");
-    $self->{bayes}->{main}->{locker}->safe_unlock ($self->{locked_file});
-    $self->{is_locked} = 0;
-  }
-
-  $self->{already_tied} = 0;
-  $self->{db_version} = undef;
-}
+Description:
+This method expires old tokens from the database.
 
-###########################################################################
+=cut
 
-# Do an expiry run.
 sub expire_old_tokens {
   my ($self, $opts) = @_;
   my $ret;
@@ -492,6 +219,16 @@
   $ret;
 }
 
+=head2 expire_old_tokens_trapped
+
+public instance (Boolean) expire_old_tokens_trapped (\% $opts)
+
+Description:
+This methods does the actual token expiration.
+
+XXX More docs here about the methodology and what not
+=cut
+
 sub expire_old_tokens_trapped {
   my ($self, $opts) = @_;
 
@@ -504,27 +241,8 @@
     return 0;
   }
 
-  my $deleted = 0;
-  my $kept = 0;
-  my $num_lowfreq = 0;
-  my $num_hapaxes = 0;
   my $started = time();
-  my @magic = $self->get_magic_tokens();
-
-  # since DB_File will not shrink a database (!!), we need to *create*
-  # a new one instead.
-  my $main = $self->{bayes}->{main};
-  my $path = $main->sed_path ($main->{conf}->{bayes_path});
-
-  # use a temporary PID-based suffix just in case another one was
-  # created previously by an interrupted expire
-  my $tmpsuffix = "expire$$";
-  my $tmpdbname = $path.'_toks.'.$tmpsuffix;
-
-  my $magic_re = $self->get_magic_re(DB_VERSION);
-
-  # Figure out atime delta as necessary
-  my $too_old = 0;
+  my @vars = $self->get_storage_variables();
 
   # How many tokens do we want to keep?
   my $goal_reduction = int($self->{expiry_max_db_size} * 0.75); # expire to 75% of max_db
@@ -535,32 +253,32 @@
     dbg("bayes: expiry keep size too small, resetting to 100,000 tokens");
   }
   # Now turn goal_reduction into how many to expire.
-  $goal_reduction = $magic[3] - $goal_reduction;
-  dbg("bayes: token count: ".$magic[3].", final goal reduction size: $goal_reduction");
+  $goal_reduction = $vars[3] - $goal_reduction;
+  dbg("bayes: token count: ".$vars[3].", final goal reduction size: $goal_reduction");
 
   if ( $goal_reduction < 1000 ) { # too few tokens to expire, abort.
     dbg("bayes: reduction goal of $goal_reduction is under 1,000 tokens.  skipping expire.");
-    $self->{db_toks}->{$LAST_EXPIRE_MAGIC_TOKEN} = time();
+    $self->set_last_expire(time());
     $self->remove_running_expire_tok(); # this won't be cleaned up, so do it now.
     return 1; # we want to indicate things ran as expected
   }
 
   # Estimate new atime delta based on the last atime delta
   my $newdelta = 0;
-  if ( $magic[9] > 0 ) {
+  if ( $vars[9] > 0 ) {
     # newdelta = olddelta * old / goal;
     # this may seem backwards, but since we're talking delta here,
     # not actual atime, we want smaller atimes to expire more tokens,
     # and visa versa.
     #
-    $newdelta = int($magic[8] * $magic[9] / $goal_reduction);
+    $newdelta = int($vars[8] * $vars[9] / $goal_reduction);
   }
 
   # Calculate size difference between last expiration token removal
   # count and the current goal removal count.
-  my $ratio = ($magic[9] == 0 || $magic[9] > $goal_reduction) ? $magic[9]/$goal_reduction : $goal_reduction/$magic[9];
+  my $ratio = ($vars[9] == 0 || $vars[9] > $goal_reduction) ? $vars[9]/$goal_reduction : $goal_reduction/$vars[9];
 
-  dbg("bayes: First pass?  Current: ".time().", Last: ".$magic[4].", atime: ".$magic[8].", count: ".$magic[9].", newdelta: $newdelta, ratio: $ratio");
+  dbg("bayes: First pass?  Current: ".time().", Last: ".$vars[4].", atime: ".$vars[8].", count: ".$vars[9].", newdelta: $newdelta, ratio: $ratio");
 
   ## ESTIMATION PHASE
   #
@@ -577,32 +295,16 @@
   # - difference of last reduction to current goal reduction is > 50%
   #   if the two values are out of balance, estimating atime is going to be funky, recompute
   #
-  if ( (time() - $magic[4] > 86400*30) || ($magic[8] < 43200) || ($magic[9] < 1000) || ($newdelta < 43200) || ($ratio > 1.5) ) {
+  if ( (time() - $vars[4] > 86400*30) || ($vars[8] < 43200) || ($vars[9] < 1000)
+       || ($newdelta < 43200) || ($ratio > 1.5) ) {
     dbg("bayes: Can't use estimation method for expiry, something fishy, calculating optimal atime delta (first pass)");
+
     my $start = 43200; # exponential search starting at ...?  1/2 day, 1, 2, 4, 8, 16, ...
-    my %delta = (); # use a hash since an array is going to be very sparse
     my $max_expire_mult = 512; # $max_expire_mult * $start = max expire time (256 days), power of 2.
 
-    # do the first pass, figure out atime delta
-    my ($tok, $packed);
-    while (($tok, $packed) = each %{$self->{db_toks}}) {
-      next if ($tok =~ /$magic_re/); # skip magic tokens
-
-      my ($ts, $th, $atime) = $self->tok_unpack ($packed);
-
-      # Go through from $start * 1 to $start * 512, mark how many tokens we would expire
-      my $token_age = $magic[10] - $atime;
-      for( my $i = 1; $i <= $max_expire_mult; $i<<=1 ) {
-        if ( $token_age >= $start * $i ) {
-          $delta{$i}++;
-	}
-	else {
-	  # If the token age is less than the expire delta, it'll be
-	  # less for all upcoming checks too, so abort early.
-	  last;
-	}
-      }
-    }
+    my %delta = $self->calculate_expire_delta($vars[10], $start, $max_expire_mult);
+
+    return 0 unless (%delta);
 
     # This will skip the for loop if debugging isn't enabled ...
     if ( $Mail::SpamAssassin::DEBUG->{'enabled'} ) {
@@ -612,7 +314,7 @@
 	  dbg("bayes: ".$start*$i."\t".(exists $delta{$i} ? $delta{$i} : 0));
       }
     }
-
+  
     # Now figure out which max_expire_mult value gives the closest results to goal_reduction, without
     # going over ...  Go from the largest delta backwards so the reduction size increases
     # (tokens that expire at 4 also expire at 3, 2, and 1, so 1 will always be the largest expiry...)
@@ -621,7 +323,7 @@
       next unless exists $delta{$max_expire_mult};
       if ($delta{$max_expire_mult} > $goal_reduction) {
         $max_expire_mult<<=1; # the max expire is actually the next power of 2 out
-	last;
+        last;
       }
     }
 
@@ -637,7 +339,7 @@
     #
     if ( !exists $delta{$max_expire_mult} || $delta{$max_expire_mult} < 1000 ) {
       dbg("bayes: couldn't find a good delta atime, need more token difference, skipping expire.");
-      $self->{db_toks}->{$LAST_EXPIRE_MAGIC_TOKEN} = time();
+      $self->set_last_expire(time());
       $self->remove_running_expire_tok(); # this won't be cleaned up, so do it now.
       return 1; # we want to indicate things ran as expected
     }
@@ -649,92 +351,7 @@
     dbg("bayes: Can do estimation method for expiry, skipping first pass.");
   }
 
-  # clean out any leftover db copies from previous runs
-  for my $ext (@DB_EXTENSIONS) { unlink ($tmpdbname.$ext); }
-
-  # use O_EXCL to avoid races (bonus paranoia, since we should be locked
-  # anyway)
-  my %new_toks;
-  my $umask = umask 0;
-  tie %new_toks, "DB_File", $tmpdbname, O_RDWR|O_CREAT|O_EXCL,
-	       (oct ($main->{conf}->{bayes_file_mode}) & 0666);
-  umask $umask;
-  my $oldest;
-
-  my $showdots = $opts->{showdots};
-  if ($showdots) { print STDERR "\n"; }
-
-  # We've chosen a new atime delta if we've gotten here, so record it for posterity.
-  $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = $newdelta;
-
-  # Figure out how old is too old...
-  $too_old = $magic[10] - $newdelta; # tooold = newest - delta
-
-  # Go ahead and do the move to new db/expire run now ...
-  my ($tok, $packed);
-  while (($tok, $packed) = each %{$self->{db_toks}}) {
-    next if ($tok =~ /$magic_re/); # skip magic tokens
-
-    my ($ts, $th, $atime) = $self->tok_unpack ($packed);
-
-    if ($atime < $too_old) {
-      $deleted++;
-    } else {
-      $new_toks{$tok} = $self->tok_pack ($ts, $th, $atime); $kept++;
-      if (!defined($oldest) || $atime < $oldest) { $oldest = $atime; }
-      if ($ts + $th == 1) {
-	$num_hapaxes++;
-      } elsif ($ts < 8 && $th < 8) {
-	$num_lowfreq++;
-      }
-    }
-
-    if ((($kept + $deleted) % 1000) == 0) {
-      if ($showdots) { print STDERR "."; }
-      $self->set_running_expire_tok();
-    }
-  }
-
-  # and add the magic tokens.  don't add the expire_running token.
-  $new_toks{$DB_VERSION_MAGIC_TOKEN} = DB_VERSION;
-
-  # We haven't changed messages of each type seen, so just copy over.
-  $new_toks{$NSPAM_MAGIC_TOKEN} = $magic[1];
-  $new_toks{$NHAM_MAGIC_TOKEN} = $magic[2];
-
-  # We magically haven't removed the newest token, so just copy that value over.
-  $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $magic[10];
-
-  # The rest of these have been modified, so replace as necessary.
-  $new_toks{$NTOKENS_MAGIC_TOKEN} = $kept;
-  $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = time();
-  $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $oldest;
-  $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = $deleted;
-
-  # now untie so we can do renames
-  untie %{$self->{db_toks}};
-  untie %new_toks;
-
-  # This is the critical phase (moving files around), so don't allow
-  # it to be interrupted.  Scope the signal changes.
-  {
-    local $SIG{'INT'} = 'IGNORE';
-    local $SIG{'HUP'} = 'IGNORE';
-    local $SIG{'TERM'} = 'IGNORE';
-
-    # now rename in the new one.  Try several extensions
-    for my $ext (@DB_EXTENSIONS) {
-      my $newf = $tmpdbname.$ext;
-      my $oldf = $path.'_toks'.$ext;
-      next unless (-f $newf);
-      if (!rename ($newf, $oldf)) {
-	warn "rename $newf to $oldf failed: $!\n";
-      }
-    }
-  }
-
-  # Call untie_db() so we unlock correctly.
-  $self->untie_db();
+  my ($kept, $deleted, $num_hapaxes, $num_lowfreq) = $self->token_expiration($opts, $newdelta, @vars);
 
   my $done = time();
 
@@ -742,6 +359,7 @@
   my $msg2 = "$kept entries kept, $deleted deleted";
 
   if ($opts->{verbose}) {
+
     my $hapax_pc = ($num_hapaxes * 100) / $kept;
     my $lowfreq_pc = ($num_lowfreq * 100) / $kept;
     print "$msg\n$msg2\n";
@@ -751,53 +369,46 @@
     dbg ("$msg: $msg2");
   }
 
-  1;
+  return 1;
 }
 
-###########################################################################
+=head2 sync_due
 
-# Is a journal sync due?
-sub journal_sync_due {
-  my ($self) = @_;
+public instance (Boolean) sync_due ()
 
-  return 0 if ( $self->{db_version} < DB_VERSION ); # don't bother doing old db versions
+Description:
+This methods determines if a sync is due.
 
-  my $conf = $self->{bayes}->{main}->{conf};
-  return 0 if ( $conf->{bayes_journal_max_size} == 0 );
+=cut
 
-  my @magic = $self->get_magic_tokens();
-  dbg("Bayes DB journal sync: last sync: ".$magic[7],'bayes','-1');
+sub sync_due {
+  my ($self) = @_;
+  die "sync_due: not implemented\n";
+}
 
-  ## Ok, should we do a sync?
+=head2 expiry_due
 
-  # Not if the journal file doesn't exist, it's not a file, or it's 0 bytes long.
-  return 0 unless (stat($self->get_journal_filename()) && -f _);
+public instance (Boolean) expiry_due ()
 
-  # Yes if the file size is larger than the specified maximum size.
-  return 1 if (-s _ > $conf->{bayes_journal_max_size});
+Description:
+This methods determines if an expire is due.
 
-  # Yes if it's been at least a day since the last sync.
-  return 1 if (time - $magic[7] > 86400);
+=cut
 
-  # No, I guess not.
-  return 0;
-}
-
-# Is an expiry run due to occur?
 sub expiry_due {
   my ($self) = @_;
 
   $self->read_db_configs();	# make sure this has happened here
 
-  # is the database too small for expiry?  (Do *not* use "scalar keys",
-  # as this will iterate through the entire db counting them!)
-  my @magic = $self->get_magic_tokens();
-  my $ntoks = $magic[3];
-
   # If force expire was called, do the expire no matter what.
   return 1 if ($self->{bayes}->{main}->{learn_force_expire});
 
-  my $last_expire = time() - $magic[4];
+  # is the database too small for expiry?  (Do *not* use "scalar keys",
+  # as this will iterate through the entire db counting them!)
+  my @vars = $self->get_storage_variables();
+  my $ntoks = $vars[3];
+
+  my $last_expire = time() - $vars[4];
   if (!$self->{bayes}->{main}->{ignore_safety_expire_timeout}) {
     # if we're not ignoring the safety timeout, don't run an expire more
     # than once every 12 hours.
@@ -809,14 +420,14 @@
     return 0 if ($last_expire < 300);
   }
 
-  dbg("Bayes DB expiry: Tokens in DB: $ntoks, Expiry max size: ".$self->{expiry_max_db_size}.", Oldest atime: ".$magic[5].", Newest atime: ".$magic[10].", Last expire: ".$magic[4].", Current time: ".time(),'bayes','-1');
+  dbg("Bayes DB expiry: Tokens in DB: $ntoks, Expiry max size: ".$self->{expiry_max_db_size}.", Oldest atime: ".$vars[5].", Newest atime: ".$vars[10].", Last expire: ".$vars[4].", Current time: ".time(),'bayes','-1');
 
   my $conf = $self->{bayes}->{main}->{conf};
   if ($ntoks <= 100000 ||			# keep at least 100k tokens
       $conf->{bayes_auto_expire} == 0 ||	# config says don't expire
       $self->{expiry_max_db_size} > $ntoks ||	# not enough tokens to cause an expire
-      $magic[10]-$magic[5] < 43200 ||		# delta between oldest and newest < 12h
-      $self->{db_version} < DB_VERSION		# ignore old db formats
+      $vars[10]-$vars[5] < 43200 ||		# delta between oldest and newest < 12h
+      $self->{db_version} < $self->DB_VERSION # ignore old db formats
       ) {
     return 0;
   }
@@ -824,686 +435,315 @@
   return 1;
 }
 
-###########################################################################
-# db_seen reading APIs
+=head2 seen_get
 
-sub seen_get {
-  my ($self, $msgid) = @_;
-  $self->{db_seen}->{$msgid};
-}
+public instance (Char) seen_get (String $msgid)
 
-sub seen_put {
-  my ($self, $msgid, $seen) = @_;
+Description:
+This method retrieves the stored value, if any, for C<$msgid>.  The return
+value is the stored string ('s' for spam and 'h' for ham) or undef if
+C<$msgid> is not found.
 
-  if ($self->{bayes}->{main}->{learn_to_journal}) {
-    $self->defer_update ("m $seen $msgid");
-  }
-  else {
-    $self->{db_seen}->{$msgid} = $seen;
-  }
-}
+=cut
 
-sub seen_delete {
+sub seen_get {
   my ($self, $msgid) = @_;
-
-  if ($self->{bayes}->{main}->{learn_to_journal}) {
-    $self->defer_update ("m f $msgid");
-  }
-  else {
-    delete $self->{db_seen}->{$msgid};
-  }
+  die "seen_get: not implemented\n";
 }
 
-###########################################################################
-# db reading APIs
+=head2 seen_put
 
-sub tok_get {
-  my ($self, $tok) = @_;
-  $self->tok_unpack ($self->{db_toks}->{$tok});
-}
- 
-sub nspam_nham_get {
-  my ($self) = @_;
-  my @magic = $self->get_magic_tokens();
-  ($magic[1], $magic[2]);
-}
+public instance (Boolean) seen_put (String $msgid, Char $flag)
 
-# return the magic tokens in a specific order:
-# 0: scan count base
-# 1: number of spam
-# 2: number of ham
-# 3: number of tokens in db
-# 4: last expire atime
-# 5: oldest token in db atime
-# 6: db version value
-# 7: last journal sync
-# 8: last atime delta
-# 9: last expire reduction count
-# 10: newest token in db atime
-#
-sub get_magic_tokens {
-  my ($self) = @_;
-  my @values;
+Description:
+This method records C<$msgid> as the type given by C<$flag>.  C<$flag> is
+one of two values 's' for spam and 'h' for ham.
 
-  my $db_ver = $self->{db_toks}->{$DB_VERSION_MAGIC_TOKEN};
-  if ( !$db_ver || $db_ver =~ /\D/ ) { $db_ver = 0; }
+=cut
 
-  if ( $db_ver == 0 ) {
-    my $DB0_NSPAM_MAGIC_TOKEN = '**NSPAM';
-    my $DB0_NHAM_MAGIC_TOKEN = '**NHAM';
-    my $DB0_OLDEST_TOKEN_AGE_MAGIC_TOKEN = '**OLDESTAGE';
-    my $DB0_LAST_EXPIRE_MAGIC_TOKEN = '**LASTEXPIRE';
-    my $DB0_NTOKENS_MAGIC_TOKEN = '**NTOKENS';
-    my $DB0_SCANCOUNT_BASE_MAGIC_TOKEN = '**SCANBASE';
-
-    @values = (
-      $self->{db_toks}->{$DB0_SCANCOUNT_BASE_MAGIC_TOKEN},
-      $self->{db_toks}->{$DB0_NSPAM_MAGIC_TOKEN},
-      $self->{db_toks}->{$DB0_NHAM_MAGIC_TOKEN},
-      $self->{db_toks}->{$DB0_NTOKENS_MAGIC_TOKEN},
-      $self->{db_toks}->{$DB0_LAST_EXPIRE_MAGIC_TOKEN},
-      $self->{db_toks}->{$DB0_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
-      0,
-      0,
-      0,
-      0,
-      0,
-    );
-  }
-  elsif ( $db_ver == 1 ) {
-    my $DB1_NSPAM_MAGIC_TOKEN			= "\015\001\007\011\003NSPAM";
-    my $DB1_NHAM_MAGIC_TOKEN			= "\015\001\007\011\003NHAM";
-    my $DB1_OLDEST_TOKEN_AGE_MAGIC_TOKEN	= "\015\001\007\011\003OLDESTAGE";
-    my $DB1_LAST_EXPIRE_MAGIC_TOKEN		= "\015\001\007\011\003LASTEXPIRE";
-    my $DB1_NTOKENS_MAGIC_TOKEN			= "\015\001\007\011\003NTOKENS";
-    my $DB1_SCANCOUNT_BASE_MAGIC_TOKEN		= "\015\001\007\011\003SCANBASE";
-
-    @values = (
-      $self->{db_toks}->{$DB1_SCANCOUNT_BASE_MAGIC_TOKEN},
-      $self->{db_toks}->{$DB1_NSPAM_MAGIC_TOKEN},
-      $self->{db_toks}->{$DB1_NHAM_MAGIC_TOKEN},
-      $self->{db_toks}->{$DB1_NTOKENS_MAGIC_TOKEN},
-      $self->{db_toks}->{$DB1_LAST_EXPIRE_MAGIC_TOKEN},
-      $self->{db_toks}->{$DB1_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
-      1,
-      0,
-      0,
-      0,
-      0,
-    );
-  }
-  elsif ( $db_ver == 2 ) {
-    my $DB2_LAST_ATIME_DELTA_MAGIC_TOKEN	= "\015\001\007\011\003LASTATIMEDELTA";
-    my $DB2_LAST_EXPIRE_MAGIC_TOKEN		= "\015\001\007\011\003LASTEXPIRE";
-    my $DB2_LAST_EXPIRE_REDUCE_MAGIC_TOKEN	= "\015\001\007\011\003LASTEXPIREREDUCE";
-    my $DB2_LAST_JOURNAL_SYNC_MAGIC_TOKEN	= "\015\001\007\011\003LASTJOURNALSYNC";
-    my $DB2_NEWEST_TOKEN_AGE_MAGIC_TOKEN	= "\015\001\007\011\003NEWESTAGE";
-    my $DB2_NHAM_MAGIC_TOKEN			= "\015\001\007\011\003NHAM";
-    my $DB2_NSPAM_MAGIC_TOKEN			= "\015\001\007\011\003NSPAM";
-    my $DB2_NTOKENS_MAGIC_TOKEN			= "\015\001\007\011\003NTOKENS";
-    my $DB2_OLDEST_TOKEN_AGE_MAGIC_TOKEN	= "\015\001\007\011\003OLDESTAGE";
-    my $DB2_RUNNING_EXPIRE_MAGIC_TOKEN		= "\015\001\007\011\003RUNNINGEXPIRE";
-
-    @values = (
-      0,
-      $self->{db_toks}->{$DB2_NSPAM_MAGIC_TOKEN},
-      $self->{db_toks}->{$DB2_NHAM_MAGIC_TOKEN},
-      $self->{db_toks}->{$DB2_NTOKENS_MAGIC_TOKEN},
-      $self->{db_toks}->{$DB2_LAST_EXPIRE_MAGIC_TOKEN},
-      $self->{db_toks}->{$DB2_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
-      2,
-      $self->{db_toks}->{$DB2_LAST_JOURNAL_SYNC_MAGIC_TOKEN},
-      $self->{db_toks}->{$DB2_LAST_ATIME_DELTA_MAGIC_TOKEN},
-      $self->{db_toks}->{$DB2_LAST_EXPIRE_REDUCE_MAGIC_TOKEN},
-      $self->{db_toks}->{$DB2_NEWEST_TOKEN_AGE_MAGIC_TOKEN},
-    );
-  }
+sub seen_put {
+  my ($self, $msgid, $flag) = @_;
+  die "seen_put: not implemented\n";
+}
 
+=head2 seen_delete
 
-  foreach ( @values ) {
-    if ( !$_ || $_ =~ /\D/ ) { $_ = 0; }
-  }
+public instance (Boolean) seen_delete (String $msgid)
 
-  return @values;
-}
+Description:
+This method removes C<$msgid> from storage.
 
+=cut
 
-## Don't bother using get_magic_tokens here.  This token should only
-## ever exist when we're running expire, so we don't want to convert it if
-## it's there and we're not expiring ...
-sub get_running_expire_tok {
-  my ($self) = @_;
-  my $running = $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN};
-  if (!$running || $running =~ /\D/) { return undef; }
-  return $running;
+sub seen_delete {
+  my ($self, $msgid) = @_;
+  die "seen_delete: not implemented\n";
 }
 
-sub set_running_expire_tok {
-  my ($self) = @_;
-  $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN} = time();
-}
+=head2 get_storage_variables
 
-sub remove_running_expire_tok {
-  my ($self) = @_;
-  delete $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN};
-}
+public instance (@) get_storage_variables ()
 
-###########################################################################
+Description:
+This method retrieves the various administrative variables used by
+the Bayes storage implementation.
 
-# db abstraction: allow deferred writes, since we will be frequently
-# writing while checking.
+The values returned in the array are in the following order:
 
-sub tok_count_change {
-  my ($self, $ds, $dh, $tok, $atime) = @_;
+0: scan count base
 
-  $atime = 0 unless defined $atime;
+1: number of spam
 
-  if ($self->{bayes}->{main}->{learn_to_journal}) {
-    $self->defer_update ("c $ds $dh $atime $tok");
-  } else {
-    $self->tok_sync_counters ($ds, $dh, $atime, $tok);
-  }
-}
- 
-sub nspam_nham_change {
-  my ($self, $ds, $dh) = @_;
+2: number of ham
 
-  if ($self->{bayes}->{main}->{learn_to_journal}) {
-    $self->defer_update ("n $ds $dh");
-  } else {
-    $self->tok_sync_nspam_nham ($ds, $dh);
-  }
-}
+3: number of tokens in db
 
-sub tok_touch {
-  my ($self, $tok, $atime) = @_;
-  $self->defer_update ("t $atime $tok");
-}
+4: last expire atime
 
-sub defer_update {
-  my ($self, $str) = @_;
-  $self->{string_to_journal} .= "$str\n";
-}
+5: oldest token in db atime
 
-###########################################################################
+6: db version value
 
-sub add_touches_to_journal {
-  my ($self) = @_;
+7: last journal sync
 
-  my $nbytes = length ($self->{string_to_journal});
-  return if ($nbytes == 0);
+8: last atime delta
 
-  my $path = $self->get_journal_filename();
+9: last expire reduction count
 
-  # use append mode, write atomically, then close, so simultaneous updates are
-  # not lost
-  my $conf = $self->{bayes}->{main}->{conf};
-  my $umask = umask(0777 - (oct ($conf->{bayes_file_mode}) & 0666));
-  if (!open (OUT, ">>".$path)) {
-    warn "cannot write to $path, Bayes db update ignored\n";
-    umask $umask; # reset umask
-    return;
-  }
+10: newest token in db atime
 
-  # do not use print() here, it will break up the buffer if it's >8192 bytes,
-  # which could result in two sets of tokens getting mixed up and their
-  # touches missed.
-  my $writ = 0;
-  while ($writ < $nbytes) {
-    my $len = syswrite (OUT, $self->{string_to_journal}, $nbytes-$writ);
-
-    if (!defined $len || $len < 0) {
-      # argh, write failure, give up
-      $len = 0 unless ( defined $len );
-      warn "write failed to Bayes journal $path ($len of $nbytes)!\n";
-      last;
-    }
+=cut
 
-    $writ += $len;
-    if ($len < $nbytes) {
-      # this should not happen on filesystem writes!  Still, try to recover
-      # anyway, but be noisy about it so the admin knows
-      warn "partial write to Bayes journal $path ($len of $nbytes), recovering.\n";
-      $self->{string_to_journal} = substr ($self->{string_to_journal}, $len);
-    }
-  }
+sub get_storage_variables {
+  my ($self) = @_;
+  die "get_storage_variables: not implemented\n";
+}
 
-  if (!close OUT) {
-    warn "cannot write to $path, Bayes db update ignored\n";
-  }
-  umask $umask; # reset umask
+=head2 dump_db_toks
 
-  $self->{string_to_journal} = '';
-}
+public instance () dump_db_toks (String $template, String $regex, @ @vars)
 
-# Return a qr'd RE to match a token with the correct format's magic token
-sub get_magic_re {
-  my ($self, $db_ver) = @_;
+Description:
+This method loops over all tokens, computing the probability for the token
+and then printing it out according to the passed in template.
 
-  if ( $db_ver >= 1 ) {
-    return qr/^\015\001\007\011\003/;
-  }
+=cut
 
-  # When in doubt, assume v0
-  return qr/^\*\*[A-Z]+$/;
+sub dump_db_toks {
+  my ($self, $template, $regex, @vars) = @_;
+  die "dump_db_toks: not implemented\n";
 }
 
-###########################################################################
-# And this method reads the journal and applies the changes in one
-# (locked) transaction.
+=head2 set_last_expire
 
-sub sync_journal {
-  my ($self, $opts) = @_;
-  my $ret = 0;
+public instance (Boolean) _set_last_expire (Integer $time)
 
-  my $path = $self->get_journal_filename();
+Description:
+This method sets the last expire time.
 
-  # if $path doesn't exist, or it's not a file, or is 0 bytes in length, return
-  if ( !stat($path) || !-f _ || -z _ ) { return 0; }
+=cut
 
-  eval {
-    local $SIG{'__DIE__'};	# do not run user die() traps in here
-    if ($self->tie_db_writable()) {
-      $ret = $self->sync_journal_trapped($opts, $path);
-    }
-  };
-  my $err = $@;
-
-  # ok, untie from write-mode if we can
-  if (!$self->{bayes}->{main}->{learn_caller_will_untie}) {
-    $self->untie_db();
-  }
-
-  # handle any errors that may have occurred
-  if ($err) {
-    warn "bayes: $err\n";
-    return 0;
-  }
-
-  $ret;
+sub set_last_expire {
+  my ($self, $time) = @_;
+  die "set_last_expire: not implemented\n";
 }
 
-sub sync_journal_trapped {
-  my ($self, $opts, $path) = @_;
+=head2 get_running_expire_tok
 
-  # Flag that we're doing work
-  $self->set_running_expire_tok();
+public instance (Time) get_running_expire_tok ()
 
-  my $started = time();
-  my $count = 0;
-  my $total_count = 0;
-  my %tokens = ();
-  my $showdots = $opts->{showdots};
-  my $retirepath = $path.".old";
-
-  # if $path doesn't exist, or it's not a file, or is 0 bytes in length, return
-  # we have to check again since the file may have been removed by a recent bayes db upgrade ...
-  if ( !stat($path) || !-f _ || -z _ ) { return 0; }
+Description:
+This method determines if an expire is currently running and returns the time
+the expire started.
 
-  if (!-r $path) { # will we be able to read the file?
-    warn "bayes: bad permissions on journal, can't read: $path\n";
-    return 0;
-  }
+=cut
 
-  # This is the critical phase (moving files around), so don't allow
-  # it to be interrupted.
-  {
-    local $SIG{'INT'} = 'IGNORE';
-    local $SIG{'HUP'} = 'IGNORE';
-    local $SIG{'TERM'} = 'IGNORE';
-
-    # retire the journal, so we can update the db files from it in peace.
-    # TODO: use locking here
-    if (!rename ($path, $retirepath)) {
-      warn "bayes: failed rename $path to $retirepath\n";
-      return 0;
-    }
+sub get_running_expire_tok {
+  my ($self) = @_;
+  die "get_running_expire_tok: not implemented\n";
+}
 
-    # now read the retired journal
-    if (!open (JOURNAL, "<$retirepath")) {
-      warn "bayes: cannot open read $retirepath\n";
-      return 0;
-    }
+=head2 set_running_expire_tok
 
+public instance (Time) set_running_expire_tok ()
 
-    # Read the journal
-    while (<JOURNAL>) {
-      $total_count++;
-
-      if (/^t (\d+) (.*)$/) { # Token timestamp update, cache resultant entries
-	$tokens{$2} = $1+0 if ( !exists $tokens{$2} || $1+0 > $tokens{$2} );
-      } elsif (/^c (-?\d+) (-?\d+) (\d+) (.*)$/) { # Add/full token update
-	$self->tok_sync_counters ($1+0, $2+0, $3+0, $4);
-	$count++;
-      } elsif (/^n (-?\d+) (-?\d+)$/) { # update ham/spam count
-	$self->tok_sync_nspam_nham ($1+0, $2+0);
-	$count++;
-      } elsif (/^m ([hsf]) (.+)$/) { # update msgid seen database
-	if ( $1 eq "f" ) {
-	  $self->seen_delete($2);
-	}
-	else {
-	  $self->seen_put($2,$1);
-	}
-	$count++;
-      } else {
-	warn "Bayes journal: gibberish entry found: $_";
-      }
-    }
-    close JOURNAL;
+Description:
+This method sets the running expire time to the current time.
 
-    # Now that we've determined what tokens we need to update and their
-    # final values, update the DB.  Should be much smaller than the full
-    # journal entries.
-    while( my($k,$v) = each %tokens ) {
-      $self->tok_touch_token ($v, $k);
-
-      if ((++$count % 1000) == 0) {
-	if ($showdots) { print STDERR "."; }
-	$self->set_running_expire_tok();
-      }
-    }
+=cut
 
-    if ($showdots) { print STDERR "\n"; }
+sub set_running_expire_tok {
+  my ($self) = @_;
+  die "set_running_expire_tok: not implemented\n";
+}
 
-    # we're all done, so unlink the old journal file
-    unlink ($retirepath) || warn "bayes: can't unlink $retirepath: $!\n";
+=head2 remove_running_expire_tok
 
-    $self->{db_toks}->{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $started;
+public instance (Boolean) remove_running_expire_tok ()
 
-    my $done = time();
-    my $msg = ("synced Bayes databases from journal in ".($done - $started).
-	  " seconds: $count unique entries ($total_count total entries)");
+Description:
+This method removes a currently set running expire time.
 
-    if ($opts->{verbose}) {
-      print $msg,"\n";
-    } else {
-      dbg ($msg);
-    }
-  }
+=cut
 
-  # else, that's the lot, we're synced.  return
-  1;
+sub remove_running_expire_tok {
+  my ($self) = @_;
+  die "remove_running_expire_tok: not implemented\n";
 }
 
-sub tok_touch_token {
-  my ($self, $atime, $tok) = @_;
-  my ($ts, $th, $oldatime) = $self->tok_get ($tok);
+=head2 tok_get
+
+public instance (Integer, Integer, Time) tok_get (String $token)
+
+Description:
+This method retrieves the specified token (C<$token>) from storage and returns
+it's spam count, ham acount and last access time.
 
-  # If the new atime is < the old atime, ignore the update
-  # We figure that we'll never want to lower a token atime, so abort if
-  # we try.  (journal out of sync, etc.)
-  return if ( $oldatime >= $atime );
+=cut
 
-  $self->tok_put ($tok, $ts, $th, $atime);
+sub tok_get {
+  my ($self, $token) = @_;
+  die "tok_get: not implemented\n";
 }
 
-sub tok_sync_counters {
-  my ($self, $ds, $dh, $atime, $tok) = @_;
-  my ($ts, $th, $oldatime) = $self->tok_get ($tok);
-  $ts += $ds; if ($ts < 0) { $ts = 0; }
-  $th += $dh; if ($th < 0) { $th = 0; }
+=head2 tok_count_change
 
-  # Don't roll the atime of tokens backwards ...
-  $atime = $oldatime if ( $oldatime > $atime );
+public instance (Boolean) tok_count_change (Integer $spam_count,
+                                            Integer $ham_count,
+                                            String $token,
+                                            Time $atime)
 
-  $self->tok_put ($tok, $ts, $th, $atime);
-}
+Description:
+This method takes a C<$spam_count> and C<$ham_count> and adds it to
+C<$token> along with updating C<$token>s atime with C<$atime>.
 
-sub tok_put {
-  my ($self, $tok, $ts, $th, $atime) = @_;
-  $ts ||= 0;
-  $th ||= 0;
+=cut
 
-  if ( $tok =~ /^\015\001\007\011\003/ ) { # magic token?  Ignore it!
-    return;
-  }
+sub tok_count_change {
+  my ($self, $spam_count, $ham_count, $token, $atime) = @_;
+  die "tok_count_change: not implemented\n";
+}
 
-  # use defined() rather than exists(); the latter is not supported
-  # by NDBM_File, believe it or not.  Using defined() did not
-  # indicate any noticeable speed hit in my testing. (Mar 31 2003 jm)
-  my $exists_already = defined $self->{db_toks}->{$tok};
-
-  if ($ts == 0 && $th == 0) {
-    return if (!$exists_already); # If the token doesn't exist, just return
-    $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN}--;
-    delete $self->{db_toks}->{$tok};
-  } else {
-    if (!$exists_already) { # If the token doesn't exist, raise the token count
-      $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN}++;
-    }
+=head2 nspam_nham_get
 
-    $self->{db_toks}->{$tok} = $self->tok_pack ($ts, $th, $atime);
+public instance (Integer, Integer) nspam_nham_get ()
 
-    my $newmagic = $self->{db_toks}->{$NEWEST_TOKEN_AGE_MAGIC_TOKEN};
-    if (!defined ($newmagic) || $atime > $newmagic) {
-      $self->{db_toks}->{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $atime;
-    }
+Description:
+This method retrieves the total number of spam and the total number of spam
+currently under storage.
 
-    # Make sure to check for either !defined or "" ...  Apparently
-    # sometimes the DB module doesn't return the value correctly. :(
-    my $oldmagic = $self->{db_toks}->{$OLDEST_TOKEN_AGE_MAGIC_TOKEN};
-    if (!defined ($oldmagic) || $oldmagic eq "" || $atime < $oldmagic) {
-      $self->{db_toks}->{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $atime;
-    }
-  }
-}
+=cut
 
-sub tok_sync_nspam_nham {
-  my ($self, $ds, $dh) = @_;
-  my ($ns, $nh) = ($self->get_magic_tokens())[1,2];
-  if ($ds) { $ns += $ds; } if ($ns < 0) { $ns = 0; }
-  if ($dh) { $nh += $dh; } if ($nh < 0) { $nh = 0; }
-  $self->{db_toks}->{$NSPAM_MAGIC_TOKEN} = $ns;
-  $self->{db_toks}->{$NHAM_MAGIC_TOKEN} = $nh;
+sub nspam_nham_get {
+  my ($self) = @_;
+  die "nspam_nham_get: not implemented\n";
 }
 
-###########################################################################
+=head2 nspam_nham_change
 
-sub get_journal_filename {
-  my ($self) = @_;
+public instance (Boolean) nspam_nham_change (Integer $num_spam,
+                                             Integer $num_ham)
 
-  if (defined $self->{journal_live_path}) {
-    return $self->{journal_live_path};
-  }
+Description:
+This method updates the number of spam and the number of ham in the database.
 
-  my $main = $self->{bayes}->{main};
-  my $fname = $main->sed_path ($main->{conf}->{bayes_path}."_journal");
+=cut
 
-  $self->{journal_live_path} = $fname;
-  return $self->{journal_live_path};
+sub nspam_nham_change {
+  my ($self, $num_spam, $num_ham) = @_;
+  die "nspam_nham_change: not implemented\n";
 }
 
-###########################################################################
+=head2 tok_touch
 
-sub scan_count_get {
-  my ($self) = @_;
+public instance (Boolean) tok_touch (String $token,
+                                     Time $atime)
 
-  if ( $self->{db_version} < 2 ) {
-    my ($count) = $self->get_magic_tokens();
-    my $path = $self->{scan_count_little_file};
-    $count += (defined $path && -e $path ? -s _ : 0);
-    return $count;
-  }
+Description:
+This method updates the given tokens (C<$token>) access time.
 
-  0;
+=cut
+
+sub tok_touch {
+  my ($self, $token, $atime) = @_;
+  die "tok_touch: not implemanted\n";
 }
 
-###########################################################################
+=head2 cleanup
 
-# this is called directly from sa-learn(1).
-sub upgrade_old_dbm_files {
-  my ($self, $opts) = @_;
-  my $ret = 0;
+public instance (Boolean) cleanup ()
 
-  eval {
-    local $SIG{'__DIE__'};	# do not run user die() traps in here
+Description:
+This method performs any cleanup necessary before moving onto the next
+operation.
 
-    use File::Basename;
-    use File::Copy;
+=cut
 
-    # bayes directory
-    my $main = $self->{bayes}->{main};
-    my $path = $main->sed_path($main->{conf}->{bayes_path});
-    my $dir = dirname($path);
-
-    # make temporary copy since old dbm and new dbm may have same name
-    opendir(DIR, $dir) || die "can't opendir $dir: $!";
-    my @files = grep { /^bayes_(?:seen|toks)(?:\.\w+)?$/ } readdir(DIR);
-    closedir(DIR);
-    if (@files < 2 || !grep(/bayes_seen/,@files) || !grep(/bayes_toks/,@files))
-    {
-      die "unable to find bayes_toks and bayes_seen, stopping\n";
-    }
-    # untaint @files (already safe after grep)
-    @files = map { /(.*)/, $1 } @files;
+sub cleanup {
+  my ($self) = @_;
+  die "touches_cleanup: not implemented\n";
+}
 
-    for (@files) {
-      my $src = "$dir/$_";
-      my $dst = "$dir/old_$_";
-      copy($src, $dst) || die "can't copy $src to $dst: $!\n";
-    }
+=head2 is_magic_token
 
-    # delete previous to make way for import
-    for (@files) { unlink("$dir/$_"); }
+public instance (Boolean) is_magic_token (string $token)
 
-    # import
-    if ($self->tie_db_writable()) {
-      $ret += $self->upgrade_old_dbm_files_trapped("$dir/old_bayes_seen",
-						   $self->{db_seen});
-      $ret += $self->upgrade_old_dbm_files_trapped("$dir/old_bayes_toks",
-						   $self->{db_toks});
-    }
+Description:
+This method determines if a given token is "magic" or special to the
+implementation.
 
-    if ($ret == 2) {
-      print "import successful, original files saved with \"old\" prefix\n";
-    }
-    else {
-      print "import failed, original files saved with \"old\" prefix\n";
-    }
-  };
-  my $err = $@;
-
-  $self->untie_db();
+=cut
 
-  # if we died, untie the dbm files
-  if ($err) {
-    warn "bayes upgrade_old_dbm_files: $err\n";
-    return 0;
-  }
-  $ret;
+sub is_magic_token {
+  my ($self, $token) = @_;
+  die "is_magic_token: not implemented\n";
 }
 
-sub upgrade_old_dbm_files_trapped {
-  my ($self, $filename, $output) = @_;
+=head2 sync
 
-  my $count;
-  my %in;
+public instance (Boolean) sync (\% $opts)
 
-  print "upgrading to DB_File, please be patient: $filename\n";
+Description:
+This method performs a sync of the database.
 
-  # try each type of file until we find one with > 0 entries
-  for my $dbm ('DB_File', 'GDBM_File', 'NDBM_File', 'SDBM_File') {
-    $count = 0;
-    # wrap in eval so it doesn't run in general use.  This accesses db
-    # modules directly.
-    # Note: (bug 2390), the 'use' needs to be on the same line as the eval
-    # for RPM dependency checks to work properly.  It's lame, but...
-    eval 'use ' . $dbm . ';
-      tie %in, "' . $dbm . '", $filename, O_RDONLY, 0600;
-      %{ $output } = %in;
-      $count = scalar keys %{ $output };
-      untie %in;
-    ';
-    if ($@) {
-      print "$dbm: $dbm module not installed, nothing copied.\n";
-      dbg("error was: $@");
-    }
-    elsif ($count == 0) {
-      print "$dbm: no database of that kind found, nothing copied.\n";
-    }
-    else {
-      print "$dbm: copied $count entries.\n";
-      return 1;
-    }
-  }
+=cut
 
-  return 0;
+sub sync {
+  my ($self, $opts) = @_;
+  die "sync: not implemented\n";
 }
 
-###########################################################################
+=head2 scan_count_get
 
-# token marshalling format for db_toks.
+public instance (Integer) scan_count_get ()
 
-# Since we may have many entries with few hits, especially thousands of hapaxes
-# (1-occurrence entries), use a flexible entry format, instead of simply "2
-# packed ints", to keep the memory and disk space usage down.  In my
-# 18k-message test corpus, only 8.9% have >= 8 hits in either counter, so we
-# can use a 1-byte representation for the other 91% of low-hitting entries
-# and save masses of space.
+Description:
+This method gets the current scan count, if used by the implementation.
 
-# This looks like: XXSSSHHH (XX = format bits, SSS = 3 spam-count bits, HHH = 3
-# ham-count bits).  If XX in the first byte is 11, it's packed as this 1-byte
-# representation; otherwise, if XX in the first byte is 00, it's packed as
-# "CLL", ie. 1 byte and 2 32-bit "longs" in perl pack format.
+=cut
 
-# Savings: roughly halves size of toks db, at the cost of a ~10% slowdown.
+sub scan_count_get {
+  my ($self) = @_;
+  die "scan_count_get: not implemented\n";
+}
 
-use constant FORMAT_FLAG	=> 0xc0;	# 11000000
-use constant ONE_BYTE_FORMAT	=> 0xc0;	# 11000000
-use constant TWO_LONGS_FORMAT	=> 0x00;	# 00000000
+=head2 perform_upgrade
 
-use constant ONE_BYTE_SSS_BITS	=> 0x38;	# 00111000
-use constant ONE_BYTE_HHH_BITS	=> 0x07;	# 00000111
+public instance (Boolean) perform_upgrade (\% $opts)
 
-sub tok_unpack {
-  my ($self, $value) = @_;
-  $value ||= 0;
+Description:
+This method is a utility method that performs any necessary upgrades
+between versions.  It should know how to handle previous versions and
+what needs to happen to upgrade them.
 
-  my ($packed, $atime);
-  if ( $self->{db_version} == 0 ) {
-    ($packed, $atime) = unpack("CS", $value);
-  }
-  elsif ( $self->{db_version} == 1 || $self->{db_version} == 2 ) {
-    ($packed, $atime) = unpack("CV", $value);
-  }
+A true return value indicates success.
 
-  if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) {
-    return (($packed & ONE_BYTE_SSS_BITS) >> 3,
-		$packed & ONE_BYTE_HHH_BITS,
-		$atime || 0);
-  }
-  elsif (($packed & FORMAT_FLAG) == TWO_LONGS_FORMAT) {
-    my ($packed, $ts, $th, $atime);
-    if ( $self->{db_version} == 0 ) {
-      ($packed, $ts, $th, $atime) = unpack("CLLS", $value);
-    }
-    elsif ( $self->{db_version} == 1 ) {
-      ($packed, $ts, $th, $atime) = unpack("CVVV", $value);
-    }
-    elsif ( $self->{db_version} == 2 ) {
-      ($packed, $ts, $th, $atime) = unpack("CVVV", $value);
-    }
-    return ($ts || 0, $th || 0, $atime || 0);
-  }
-  # other formats would go here...
-  else {
-    warn "unknown packing format for Bayes db, please re-learn: $packed";
-    return (0, 0, 0);
-  }
-}
+=cut
 
-sub tok_pack {
-  my ($self, $ts, $th, $atime) = @_;
-  $ts ||= 0; $th ||= 0; $atime ||= 0;
-  if ($ts < 8 && $th < 8) {
-    return pack ("CV", ONE_BYTE_FORMAT | ($ts << 3) | $th, $atime);
-  } else {
-    return pack ("CVVV", TWO_LONGS_FORMAT, $ts, $th, $atime);
-  }
+sub perform_upgrade {
+  my ($self, $opts) = @_;
+  die "perform_upgrade: not implemented\n";
 }
-
-###########################################################################
 
 sub dbg { Mail::SpamAssassin::dbg (@_); }
 sub sa_die { Mail::SpamAssassin::sa_die (@_); }

Added: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStoreDBM.pm
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStoreDBM.pm	Thu Jan 29 18:54:33 2004
@@ -0,0 +1,1385 @@
+# <@LICENSE>
+# ====================================================================
+# The Apache Software License, Version 1.1
+# 
+# Copyright (c) 2000 The Apache Software Foundation.  All rights
+# reserved.
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in
+#    the documentation and/or other materials provided with the
+#    distribution.
+# 
+# 3. The end-user documentation included with the redistribution,
+#    if any, must include the following acknowledgment:
+#       "This product includes software developed by the
+#        Apache Software Foundation (http://www.apache.org/)."
+#    Alternately, this acknowledgment may appear in the software itself,
+#    if and wherever such third-party acknowledgments normally appear.
+# 
+# 4. The names "Apache" and "Apache Software Foundation" must
+#    not be used to endorse or promote products derived from this
+#    software without prior written permission. For written
+#    permission, please contact apache@apache.org.
+# 
+# 5. Products derived from this software may not be called "Apache",
+#    nor may "Apache" appear in their name, without prior written
+#    permission of the Apache Software Foundation.
+# 
+# THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+# ====================================================================
+# 
+# This software consists of voluntary contributions made by many
+# individuals on behalf of the Apache Software Foundation.  For more
+# information on the Apache Software Foundation, please see
+# <http://www.apache.org/>.
+# 
+# Portions of this software are based upon public domain software
+# originally written at the National Center for Supercomputing Applications,
+# University of Illinois, Urbana-Champaign.
+# </...@LICENSE>
+
+package Mail::SpamAssassin::BayesStoreDBM;
+
+use strict;
+use bytes;
+use Fcntl;
+
+use Mail::SpamAssassin;
+use Mail::SpamAssassin::Util;
+use Mail::SpamAssassin::BayesStore;
+use File::Basename;
+use File::Spec;
+use File::Path;
+
+use constant HAS_DB_FILE => eval { require DB_File; };
+
+use vars qw{
+  @ISA
+  @DBNAMES @DB_EXTENSIONS
+  $NSPAM_MAGIC_TOKEN $NHAM_MAGIC_TOKEN $LAST_EXPIRE_MAGIC_TOKEN $LAST_JOURNAL_SYNC_MAGIC_TOKEN
+  $NTOKENS_MAGIC_TOKEN $OLDEST_TOKEN_AGE_MAGIC_TOKEN $LAST_EXPIRE_REDUCE_MAGIC_TOKEN
+  $RUNNING_EXPIRE_MAGIC_TOKEN $DB_VERSION_MAGIC_TOKEN $LAST_ATIME_DELTA_MAGIC_TOKEN
+  $NEWEST_TOKEN_AGE_MAGIC_TOKEN
+};
+
+@ISA = qw( Mail::SpamAssassin::BayesStore );
+
+# db layout (quoting Matt):
+#
+# > need five db files though to make it real fast:
+# [probs] 1. ngood and nbad (two entries, so could be a flat file rather 
+# than a db file).	(now 2 entries in db_toks)
+# [toks]  2. good token -> number seen
+# [toks]  3. bad token -> number seen (both are packed into 1 entry in 1 db)
+# [probs]  4. Consolidated good token -> probability
+# [probs]  5. Consolidated bad token -> probability
+# > As you add new mails, you update the entry in 2 or 3, then regenerate
+# > the entry for that token in 4 or 5.
+# > Then as you test a new mail, you just need to pull the probability
+# > direct from 4 and 5, and generate the overall probability. A simple and
+# > very fast operation. 
+#
+# jm: we use probs as overall probability. <0.5 = ham, >0.5 = spam
+#
+# update: probs is no longer maintained as a db, to keep on-disk and in-core
+# usage down.
+#
+# also, added a new one to support forgetting, auto-learning, and
+# auto-forgetting for refiled mails:
+# [seen]  6. a list of Message-IDs of messages already learnt from. values
+# are 's' for learnt-as-spam, 'h' for learnt-as-ham.
+#
+# and another, called [scancount] to model the scan-count for expiry.
+# This is not a database.  Instead it increases by one byte for each
+# message scanned (note: scanned, not learned).
+
+@DBNAMES = qw(toks seen);
+
+# Possible file extensions used by the kinds of database files DB_File
+# might create.  We need these so we can create a new file and rename
+# it into place.
+@DB_EXTENSIONS = ('', '.db');
+
+# These are the magic tokens we use to track stuff in the DB.
+# The format is '^M^A^G^I^C' followed by any string you want.
+# None of the control chars will be in a real token.
+$DB_VERSION_MAGIC_TOKEN		= "\015\001\007\011\003DBVERSION";
+$LAST_ATIME_DELTA_MAGIC_TOKEN	= "\015\001\007\011\003LASTATIMEDELTA";
+$LAST_EXPIRE_MAGIC_TOKEN	= "\015\001\007\011\003LASTEXPIRE";
+$LAST_EXPIRE_REDUCE_MAGIC_TOKEN	= "\015\001\007\011\003LASTEXPIREREDUCE";
+$LAST_JOURNAL_SYNC_MAGIC_TOKEN	= "\015\001\007\011\003LASTJOURNALSYNC";
+$NEWEST_TOKEN_AGE_MAGIC_TOKEN	= "\015\001\007\011\003NEWESTAGE";
+$NHAM_MAGIC_TOKEN		= "\015\001\007\011\003NHAM";
+$NSPAM_MAGIC_TOKEN		= "\015\001\007\011\003NSPAM";
+$NTOKENS_MAGIC_TOKEN		= "\015\001\007\011\003NTOKENS";
+$OLDEST_TOKEN_AGE_MAGIC_TOKEN	= "\015\001\007\011\003OLDESTAGE";
+$RUNNING_EXPIRE_MAGIC_TOKEN	= "\015\001\007\011\003RUNNINGEXPIRE";
+
+###########################################################################
+
+sub new {
+  my $class = shift;
+  $class = ref($class) || $class;
+
+  my $self = $class->SUPER::new(@_);
+
+  $self->{supported_db_version} = 2;
+
+  $self->{already_ties} = 0;
+  $self->{is_locked} = 0;
+  $self->{string_to_journal} = '';
+
+  $self;
+}
+
+###########################################################################
+
+sub tie_db_readonly {
+  my ($self) = @_;
+
+  if (!HAS_DB_FILE) {
+    dbg ("bayes: DB_File module not installed, cannot use Bayes");
+    return 0;
+  }
+
+  # return if we've already tied to the db's, using the same mode
+  # (locked/unlocked) as before.
+  return 1 if ($self->{already_tied} && $self->{is_locked} == 0);
+
+  my $main = $self->{bayes}->{main};
+  if (!defined($main->{conf}->{bayes_path})) {
+    dbg ("bayes_path not defined");
+    return 0;
+  }
+
+  $self->read_db_configs();
+
+  my $path = $main->sed_path ($main->{conf}->{bayes_path});
+
+  my $found=0;
+  for my $ext (@DB_EXTENSIONS) { if (-f $path.'_toks'.$ext) { $found=1; last; } }
+
+  if (!$found) {
+    dbg ("bayes: no dbs present, cannot scan: ${path}_toks");
+    return 0;
+  }
+
+  foreach my $dbname (@DBNAMES) {
+    my $name = $path.'_'.$dbname;
+    my $db_var = 'db_'.$dbname;
+    dbg("bayes: $$ tie-ing to DB file R/O $name");
+    # untie %{$self->{$db_var}} if (tied %{$self->{$db_var}});
+    tie %{$self->{$db_var}},"DB_File",$name, O_RDONLY,
+		 (oct ($main->{conf}->{bayes_file_mode}) & 0666)
+       or goto failed_to_tie;
+  }
+
+  $self->{db_version} = ($self->get_storage_variables())[6];
+  dbg("bayes: found bayes db version ".$self->{db_version});
+
+  # If the DB version is one we don't understand, abort!
+  if ( $self->_check_db_version() ) {
+    dbg("bayes: bayes db version ".$self->{db_version}." is newer than we understand, aborting!");
+    $self->untie_db();
+    return 0;
+  }
+
+  if ( $self->{db_version} < 2 ) { # older versions use scancount
+    $self->{scan_count_little_file} = $path.'_msgcount';
+  }
+
+  $self->{already_tied} = 1;
+  return 1;
+
+failed_to_tie:
+  warn "Cannot open bayes databases ${path}_* R/O: tie failed: $!\n";
+  return 0;
+}
+
+# tie() to the databases, read-write and locked.  Any callers of
+# this should ensure they call untie_db() afterwards!
+#
+sub tie_db_writable {
+  my ($self) = @_;
+
+  if (!HAS_DB_FILE) {
+    dbg ("bayes: DB_File module not installed, cannot use Bayes");
+    return 0;
+  }
+
+  # return if we've already tied to the db's, using the same mode
+  # (locked/unlocked) as before.
+  return 1 if ($self->{already_tied} && $self->{is_locked} == 1);
+
+  my $main = $self->{bayes}->{main};
+  if (!defined($main->{conf}->{bayes_path})) {
+    dbg ("bayes_path not defined");
+    return 0;
+  }
+
+  $self->read_db_configs();
+
+  my $path = $main->sed_path ($main->{conf}->{bayes_path});
+
+  my $found=0;
+  for my $ext (@DB_EXTENSIONS) { if (-f $path.'_toks'.$ext) { $found=1; last; } }
+
+  my $parentdir = dirname ($path);
+  if (!-d $parentdir) {
+    # run in an eval(); if mkpath has no perms, it calls die()
+    eval {
+      mkpath ($parentdir, 0, (oct ($main->{conf}->{bayes_file_mode}) & 0777));
+    };
+  }
+
+  my $tout;
+  if ($main->{learn_wait_for_lock}) {
+    $tout = 300;       # TODO: Dan to write better lock code
+  } else {
+    $tout = 10;
+  }
+  if ($main->{locker}->safe_lock ($path, $tout)) {
+    $self->{locked_file} = $path;
+    $self->{is_locked} = 1;
+  } else {
+    warn "Cannot open bayes databases ${path}_* R/W: lock failed: $!\n";
+    return 0;
+  }
+
+  my $umask = umask 0;
+  foreach my $dbname (@DBNAMES) {
+    my $name = $path.'_'.$dbname;
+    my $db_var = 'db_'.$dbname;
+    dbg("bayes: $$ tie-ing to DB file R/W $name");
+    tie %{$self->{$db_var}},"DB_File",$name, O_RDWR|O_CREAT,
+		 (oct ($main->{conf}->{bayes_file_mode}) & 0666)
+       or goto failed_to_tie;
+  }
+  umask $umask;
+
+  # set our cache to what version DB we're using
+  $self->{db_version} = ($self->get_storage_variables())[6];
+  dbg("bayes: found bayes db version ".$self->{db_version});
+
+  # figure out if we can read the current DB and if we need to do a
+  # DB version update and do it if necessary if either has a problem,
+  # fail immediately
+  #
+  if ( $found && $self->_upgrade_db() ) {
+    $self->untie_db();
+    return 0;
+  }
+  elsif ( !$found ) { # new DB, make sure we know that ...
+    $self->{db_version} = $self->{db_toks}->{$DB_VERSION_MAGIC_TOKEN} = $self->DB_VERSION;
+    $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN} = 0; # no tokens in the db ...
+    dbg("bayes: new db, set db version ".$self->{db_version}." and 0 tokens");
+  }
+
+  $self->{already_tied} = 1;
+  return 1;
+
+failed_to_tie:
+  my $err = $!;
+  umask $umask;
+  if ($self->{is_locked}) {
+    $self->{bayes}->{main}->{locker}->safe_unlock ($self->{locked_file});
+    $self->{is_locked} = 0;
+  }
+  warn "Cannot open bayes databases ${path}_* R/W: tie failed: $err\n";
+  return 0;
+}
+
+# Do we understand how to deal with this DB version?
+sub _check_db_version {
+  my ($self) = @_;
+  my $db_ver = ($self->get_storage_variables())[6];
+
+  if ( $db_ver > $self->DB_VERSION ) { # current DB is newer, ignore the DB!
+    warn "bayes: Found DB Version $db_ver, but can only handle up to version ".$self->DB_VERSION."\n";
+    return 1;
+  }
+
+  return 0;
+}
+
+# Check to see if we need to upgrade the DB, and do so if necessary
+sub _upgrade_db {
+  my ($self) = @_;
+
+  return 0 if ( $self->{db_version} == $self->DB_VERSION );
+  if ( $self->_check_db_version() ) {
+    dbg("bayes: bayes db version ".$self->{db_version}." is newer than we understand, aborting!");
+    return 1;
+  }
+
+  # If the current DB version is lower than the new version, upgrade!
+  # Do conversions in order so we can go 1 -> 3, make sure to update $self->{db_version}
+
+  dbg("bayes: detected bayes db format ".$self->{db_version}.", upgrading");
+
+  # since DB_File will not shrink a database (!!), we need to *create*
+  # a new one instead.
+  my $main = $self->{bayes}->{main};
+  my $path = $main->sed_path ($main->{conf}->{bayes_path});
+  my $name = $path.'_toks';
+
+  # older version's journal files are likely not in the same format as the new ones, so remove it.
+  my $jpath = $self->_get_journal_filename();
+  if ( -f $jpath ) {
+    dbg("bayes: old journal file found, removing.");
+    warn "Couldn't remove $jpath: $!" if ( !unlink $jpath );
+  }
+
+  if ( $self->{db_version} < 2 ) {
+    dbg ("bayes: upgrading database format from v".$self->{db_version}." to v2");
+
+    my($DB_NSPAM_MAGIC_TOKEN, $DB_NHAM_MAGIC_TOKEN, $DB_NTOKENS_MAGIC_TOKEN);
+    my($DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN, $DB_LAST_EXPIRE_MAGIC_TOKEN);
+
+    # Magic tokens for version 0, defined as '**[A-Z]+'
+    if ( $self->{db_version} == 0 ) {
+      $DB_NSPAM_MAGIC_TOKEN			= '**NSPAM';
+      $DB_NHAM_MAGIC_TOKEN			= '**NHAM';
+      $DB_NTOKENS_MAGIC_TOKEN			= '**NTOKENS';
+      #$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN		= '**OLDESTAGE';
+      #$DB_LAST_EXPIRE_MAGIC_TOKEN		= '**LASTEXPIRE';
+      #$DB_SCANCOUNT_BASE_MAGIC_TOKEN		= '**SCANBASE';
+      #$DB_RUNNING_EXPIRE_MAGIC_TOKEN		= '**RUNNINGEXPIRE';
+    }
+    else {
+      $DB_NSPAM_MAGIC_TOKEN			= "\015\001\007\011\003NSPAM";
+      $DB_NHAM_MAGIC_TOKEN			= "\015\001\007\011\003NHAM";
+      $DB_NTOKENS_MAGIC_TOKEN			= "\015\001\007\011\003NTOKENS";
+      #$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN		= "\015\001\007\011\003OLDESTAGE";
+      #$DB_LAST_EXPIRE_MAGIC_TOKEN		= "\015\001\007\011\003LASTEXPIRE";
+      #$DB_SCANCOUNT_BASE_MAGIC_TOKEN		= "\015\001\007\011\003SCANBASE";
+      #$DB_RUNNING_EXPIRE_MAGIC_TOKEN		= "\015\001\007\011\003RUNNINGEXPIRE";
+    }
+
+    # remember when we started ...
+    my $started = time;
+    my $newatime = $started;
+
+    # use O_EXCL to avoid races (bonus paranoia, since we should be locked
+    # anyway)
+    my %new_toks;
+    my $umask = umask 0;
+    tie %new_toks, "DB_File", "${name}.new", O_RDWR|O_CREAT|O_EXCL,
+          (oct ($main->{conf}->{bayes_file_mode}) & 0666) or return 1;
+    umask $umask;
+
+    # add the magic tokens to the new db.
+    $new_toks{$NSPAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NSPAM_MAGIC_TOKEN};
+    $new_toks{$NHAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NHAM_MAGIC_TOKEN};
+    $new_toks{$NTOKENS_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NTOKENS_MAGIC_TOKEN};
+    $new_toks{$DB_VERSION_MAGIC_TOKEN} = 2; # we're now a DB version 2 file
+    $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $newatime;
+    $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = $newatime;
+    $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $newatime;
+    $new_toks{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $newatime;
+    $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = 0;
+    $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = 0;
+
+    my $magic_re = $self->get_magic_re($self->{db_version});
+
+    # deal with the data tokens
+    my ($tok, $packed);
+    while (($tok, $packed) = each %{$self->{db_toks}}) {
+      next if ($tok =~ /$magic_re/); # skip magic tokens
+
+      my ($ts, $th, $atime) = $self->tok_unpack ($packed);
+      $new_toks{$tok} = $self->tok_pack ($ts, $th, $newatime);
+    }
+
+
+    # now untie so we can do renames
+    untie %{$self->{db_toks}};
+    untie %new_toks;
+
+    # This is the critical phase (moving files around), so don't allow
+    # it to be interrupted.
+    local $SIG{'INT'} = 'IGNORE';
+    local $SIG{'HUP'} = 'IGNORE';
+    local $SIG{'TERM'} = 'IGNORE';
+
+    # older versions used scancount, so kill the stupid little file ...
+    my $msgc = $path.'_msgcount';
+    if ( -f $msgc ) {
+      dbg("bayes: old msgcount file found, removing.");
+      if ( !unlink $msgc ) {
+        warn "Couldn't remove $msgc: $!";
+      }
+    }
+
+    # now rename in the new one.  Try several extensions
+    for my $ext (@DB_EXTENSIONS) {
+      my $newf = $name.'.new'.$ext;
+      my $oldf = $name.$ext;
+      next unless (-f $newf);
+      if (!rename ($newf, $oldf)) {
+        warn "rename $newf to $oldf failed: $!\n";
+        return 1;
+      }
+    }
+
+    # re-tie to the new db in read-write mode ...
+    tie %{$self->{db_toks}},"DB_File", $name, O_RDWR|O_CREAT,
+	 (oct ($main->{conf}->{bayes_file_mode}) & 0666) or return 1;
+
+    dbg ("bayes: upgraded database format from v".$self->{db_version}." to v2 in ".(time - $started)." seconds");
+    $self->{db_version} = 2; # need this for other functions which check
+  }
+
+  # if ( $self->{db_version} == 2 ) {
+  #   ...
+  #   $self->{db_version} = 3; # need this for other functions which check
+  # }
+  # ... and so on.
+
+  return 0;
+}
+
+###########################################################################
+
+sub untie_db {
+  my $self = shift;
+  dbg("bayes: $$ untie-ing");
+
+  foreach my $dbname (@DBNAMES) {
+    my $db_var = 'db_'.$dbname;
+
+    if (exists $self->{$db_var}) {
+      dbg ("bayes: $$ untie-ing $db_var");
+      untie %{$self->{$db_var}};
+      delete $self->{$db_var};
+    }
+  }
+
+  if ($self->{is_locked}) {
+    dbg ("bayes: files locked, now unlocking lock");
+    $self->{bayes}->{main}->{locker}->safe_unlock ($self->{locked_file});
+    $self->{is_locked} = 0;
+  }
+
+  $self->{already_tied} = 0;
+  $self->{db_version} = undef;
+}
+
+###########################################################################
+
+sub calculate_expire_delta {
+  my ($self, $newest_atime, $start, $max_expire_mult) = @_;
+
+  my %delta = (); # use a hash since an array is going to be very sparse
+
+  my $magic_re = $self->get_magic_re($self->DB_VERSION);
+
+  # do the first pass, figure out atime delta
+  my ($tok, $packed);
+  while (($tok, $packed) = each %{$self->{db_toks}}) {
+    next if ($tok =~ /$magic_re/); # skip magic tokens
+    
+    my ($ts, $th, $atime) = $self->tok_unpack ($packed);
+
+    # Go through from $start * 1 to $start * 512, mark how many tokens we would expire
+    my $token_age = $newest_atime - $atime;
+    for( my $i = 1; $i <= $max_expire_mult; $i<<=1 ) {
+      if ( $token_age >= $start * $i ) {
+        $delta{$i}++;
+      }
+      else {
+        # If the token age is less than the expire delta, it'll be
+        # less for all upcoming checks too, so abort early.
+        last;
+      }
+    }
+  }
+  return %delta;
+}
+
+###########################################################################
+
+sub token_expiration {
+  my ($self, $opts, $newdelta, @vars) = @_;
+
+  my $deleted = 0;
+  my $kept = 0;
+  my $num_hapaxes = 0;
+  my $num_lowfreq = 0;
+
+  # since DB_File will not shrink a database (!!), we need to *create*
+  # a new one instead.
+  my $main = $self->{bayes}->{main};
+  my $path = $main->sed_path ($main->{conf}->{bayes_path});
+
+  # use a temporary PID-based suffix just in case another one was
+  # created previously by an interrupted expire
+  my $tmpsuffix = "expire$$";
+  my $tmpdbname = $path.'_toks.'.$tmpsuffix;
+
+  # clean out any leftover db copies from previous runs
+  for my $ext (@DB_EXTENSIONS) { unlink ($tmpdbname.$ext); }
+
+  # use O_EXCL to avoid races (bonus paranoia, since we should be locked
+  # anyway)
+  my %new_toks;
+  my $umask = umask 0;
+  tie %new_toks, "DB_File", $tmpdbname, O_RDWR|O_CREAT|O_EXCL,
+              (oct ($main->{conf}->{bayes_file_mode}) & 0666);
+  umask $umask;
+  my $oldest;
+
+  my $showdots = $opts->{showdots};
+  if ($showdots) { print STDERR "\n"; }
+
+  # We've chosen a new atime delta if we've gotten here, so record it for posterity.
+  $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = $newdelta;
+
+  # Figure out how old is too old...
+  my $too_old = $vars[10] - $newdelta; # tooold = newest - delta
+
+  my $magic_re = $self->get_magic_re($self->DB_VERSION);
+
+  # Go ahead and do the move to new db/expire run now ...
+  my ($tok, $packed);
+  while (($tok, $packed) = each %{$self->{db_toks}}) {
+    next if ($tok =~ /$magic_re/); # skip magic tokens
+
+    my ($ts, $th, $atime) = $self->tok_unpack ($packed);
+
+    if ($atime < $too_old) {
+      $deleted++;
+    } else {
+      $new_toks{$tok} = $self->tok_pack ($ts, $th, $atime); $kept++;
+      if (!defined($oldest) || $atime < $oldest) { $oldest = $atime; }
+      if ($ts + $th == 1) {
+	$num_hapaxes++;
+      } elsif ($ts < 8 && $th < 8) {
+	$num_lowfreq++;
+      }
+    }
+
+    if ((($kept + $deleted) % 1000) == 0) {
+      if ($showdots) { print STDERR "."; }
+      $self->set_running_expire_tok();
+    }
+  }
+
+  # and add the magic tokens.  don't add the expire_running token.
+  $new_toks{$DB_VERSION_MAGIC_TOKEN} = $self->DB_VERSION;
+
+  # We haven't changed messages of each type seen, so just copy over.
+  $new_toks{$NSPAM_MAGIC_TOKEN} = $vars[1];
+  $new_toks{$NHAM_MAGIC_TOKEN} = $vars[2];
+
+  # We magically haven't removed the newest token, so just copy that value over.
+  $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $vars[10];
+
+  # The rest of these have been modified, so replace as necessary.
+  $new_toks{$NTOKENS_MAGIC_TOKEN} = $kept;
+  $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = time();
+  $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $oldest;
+  $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = $deleted;
+
+  # now untie so we can do renames
+  untie %{$self->{db_toks}};
+  untie %new_toks;
+
+  # This is the critical phase (moving files around), so don't allow
+  # it to be interrupted.  Scope the signal changes.
+  {
+    local $SIG{'INT'} = 'IGNORE';
+    local $SIG{'HUP'} = 'IGNORE';
+    local $SIG{'TERM'} = 'IGNORE';
+
+    # now rename in the new one.  Try several extensions
+    for my $ext (@DB_EXTENSIONS) {
+      my $newf = $tmpdbname.$ext;
+      my $oldf = $path.'_toks'.$ext;
+      next unless (-f $newf);
+      if (!rename ($newf, $oldf)) {
+	warn "rename $newf to $oldf failed: $!\n";
+      }
+    }
+  }
+
+  # Call untie_db() so we unlock correctly.
+  $self->untie_db();
+
+  return ($kept, $deleted, $num_hapaxes, $num_lowfreq);
+}
+
+###########################################################################
+
+# Is a sync due?
+sub sync_due {
+  my ($self) = @_;
+
+  return 0 if ( $self->{db_version} < $self->DB_VERSION ); # don't bother doing old db versions
+
+  my $conf = $self->{bayes}->{main}->{conf};
+  return 0 if ( $conf->{bayes_journal_max_size} == 0 );
+
+  my @vars = $self->get_storage_variables();
+  dbg("Bayes DB journal sync: last sync: ".$vars[7],'bayes','-1');
+
+  ## Ok, should we do a sync?
+
+  # Not if the journal file doesn't exist, it's not a file, or it's 0 bytes long.
+  return 0 unless (stat($self->_get_journal_filename()) && -f _);
+
+  # Yes if the file size is larger than the specified maximum size.
+  return 1 if (-s _ > $conf->{bayes_journal_max_size});
+
+  # Yes if it's been at least a day since the last sync.
+  return 1 if (time - $vars[7] > 86400);
+
+  # No, I guess not.
+  return 0;
+}
+
+###########################################################################
+# db_seen reading APIs
+
+sub seen_get {
+  my ($self, $msgid) = @_;
+  $self->{db_seen}->{$msgid};
+}
+
+sub seen_put {
+  my ($self, $msgid, $seen) = @_;
+
+  if ($self->{bayes}->{main}->{learn_to_journal}) {
+    $self->defer_update ("m $seen $msgid");
+  }
+  else {
+    $self->{db_seen}->{$msgid} = $seen;
+  }
+}
+
+sub seen_delete {
+  my ($self, $msgid) = @_;
+
+  if ($self->{bayes}->{main}->{learn_to_journal}) {
+    $self->defer_update ("m f $msgid");
+  }
+  else {
+    delete $self->{db_seen}->{$msgid};
+  }
+}
+
+###########################################################################
+# db reading APIs
+
+sub tok_get {
+  my ($self, $tok) = @_;
+  $self->tok_unpack ($self->{db_toks}->{$tok});
+}
+ 
+# return the magic tokens in a specific order:
+# 0: scan count base
+# 1: number of spam
+# 2: number of ham
+# 3: number of tokens in db
+# 4: last expire atime
+# 5: oldest token in db atime
+# 6: db version value
+# 7: last journal sync
+# 8: last atime delta
+# 9: last expire reduction count
+# 10: newest token in db atime
+#
+sub get_storage_variables {
+  my ($self) = @_;
+  my @values;
+
+  my $db_ver = $self->{db_toks}->{$DB_VERSION_MAGIC_TOKEN};
+  if ( !$db_ver || $db_ver =~ /\D/ ) { $db_ver = 0; }
+
+  if ( $db_ver == 0 ) {
+    my $DB0_NSPAM_MAGIC_TOKEN = '**NSPAM';
+    my $DB0_NHAM_MAGIC_TOKEN = '**NHAM';
+    my $DB0_OLDEST_TOKEN_AGE_MAGIC_TOKEN = '**OLDESTAGE';
+    my $DB0_LAST_EXPIRE_MAGIC_TOKEN = '**LASTEXPIRE';
+    my $DB0_NTOKENS_MAGIC_TOKEN = '**NTOKENS';
+    my $DB0_SCANCOUNT_BASE_MAGIC_TOKEN = '**SCANBASE';
+
+    @values = (
+      $self->{db_toks}->{$DB0_SCANCOUNT_BASE_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB0_NSPAM_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB0_NHAM_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB0_NTOKENS_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB0_LAST_EXPIRE_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB0_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
+      0,
+      0,
+      0,
+      0,
+      0,
+    );
+  }
+  elsif ( $db_ver == 1 ) {
+    my $DB1_NSPAM_MAGIC_TOKEN			= "\015\001\007\011\003NSPAM";
+    my $DB1_NHAM_MAGIC_TOKEN			= "\015\001\007\011\003NHAM";
+    my $DB1_OLDEST_TOKEN_AGE_MAGIC_TOKEN	= "\015\001\007\011\003OLDESTAGE";
+    my $DB1_LAST_EXPIRE_MAGIC_TOKEN		= "\015\001\007\011\003LASTEXPIRE";
+    my $DB1_NTOKENS_MAGIC_TOKEN			= "\015\001\007\011\003NTOKENS";
+    my $DB1_SCANCOUNT_BASE_MAGIC_TOKEN		= "\015\001\007\011\003SCANBASE";
+
+    @values = (
+      $self->{db_toks}->{$DB1_SCANCOUNT_BASE_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB1_NSPAM_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB1_NHAM_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB1_NTOKENS_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB1_LAST_EXPIRE_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB1_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
+      1,
+      0,
+      0,
+      0,
+      0,
+    );
+  }
+  elsif ( $db_ver == 2 ) {
+    my $DB2_LAST_ATIME_DELTA_MAGIC_TOKEN	= "\015\001\007\011\003LASTATIMEDELTA";
+    my $DB2_LAST_EXPIRE_MAGIC_TOKEN		= "\015\001\007\011\003LASTEXPIRE";
+    my $DB2_LAST_EXPIRE_REDUCE_MAGIC_TOKEN	= "\015\001\007\011\003LASTEXPIREREDUCE";
+    my $DB2_LAST_JOURNAL_SYNC_MAGIC_TOKEN	= "\015\001\007\011\003LASTJOURNALSYNC";
+    my $DB2_NEWEST_TOKEN_AGE_MAGIC_TOKEN	= "\015\001\007\011\003NEWESTAGE";
+    my $DB2_NHAM_MAGIC_TOKEN			= "\015\001\007\011\003NHAM";
+    my $DB2_NSPAM_MAGIC_TOKEN			= "\015\001\007\011\003NSPAM";
+    my $DB2_NTOKENS_MAGIC_TOKEN			= "\015\001\007\011\003NTOKENS";
+    my $DB2_OLDEST_TOKEN_AGE_MAGIC_TOKEN	= "\015\001\007\011\003OLDESTAGE";
+    my $DB2_RUNNING_EXPIRE_MAGIC_TOKEN		= "\015\001\007\011\003RUNNINGEXPIRE";
+
+    @values = (
+      0,
+      $self->{db_toks}->{$DB2_NSPAM_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB2_NHAM_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB2_NTOKENS_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB2_LAST_EXPIRE_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB2_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
+      2,
+      $self->{db_toks}->{$DB2_LAST_JOURNAL_SYNC_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB2_LAST_ATIME_DELTA_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB2_LAST_EXPIRE_REDUCE_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB2_NEWEST_TOKEN_AGE_MAGIC_TOKEN},
+    );
+  }
+
+
+  foreach ( @values ) {
+    if ( !$_ || $_ =~ /\D/ ) { $_ = 0; }
+  }
+
+  return @values;
+}
+
+sub dump_db_toks {
+  my ($self, $template, $regex, @vars) = @_;
+
+  my $magic_re = $self->get_magic_re($self->{db_version});
+
+  foreach my $tok (keys %{$self->{db_toks}}) {
+    next if ($tok =~ /$magic_re/); # skip magic tokens
+    next if (defined $regex && ($tok !~ /$regex/o));
+
+    my ($ts, $th, $atime) = $self->tok_get ($tok);
+    
+    my $prob = $self->{bayes}->compute_prob_for_token($tok, $vars[1], $vars[2],
+						      $ts, $th, $atime);
+    $prob ||= 0.5;
+    
+    printf $template,$prob,$ts,$th,$atime,$tok;
+  }
+}
+
+sub set_last_expire {
+  my ($self, $time) = @_;
+  $self->{db_toks}->{$LAST_EXPIRE_MAGIC_TOKEN} = time();
+}
+
+## Don't bother using get_magic_tokens here.  This token should only
+## ever exist when we're running expire, so we don't want to convert it if
+## it's there and we're not expiring ...
+sub get_running_expire_tok {
+  my ($self) = @_;
+  my $running = $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN};
+  if (!$running || $running =~ /\D/) { return undef; }
+  return $running;
+}
+
+sub set_running_expire_tok {
+  my ($self) = @_;
+  $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN} = time();
+}
+
+sub remove_running_expire_tok {
+  my ($self) = @_;
+  delete $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN};
+}
+
+###########################################################################
+
+# db abstraction: allow deferred writes, since we will be frequently
+# writing while checking.
+
+sub tok_count_change {
+  my ($self, $ds, $dh, $tok, $atime) = @_;
+
+  $atime = 0 unless defined $atime;
+
+  if ($self->{bayes}->{main}->{learn_to_journal}) {
+    $self->defer_update ("c $ds $dh $atime $tok");
+  } else {
+    $self->tok_sync_counters ($ds, $dh, $atime, $tok);
+  }
+}
+
+sub nspam_nham_get {
+  my ($self) = @_;
+  my @vars = $self->get_storage_variables();
+  ($vars[1], $vars[2]);
+}
+
+sub nspam_nham_change {
+  my ($self, $ds, $dh) = @_;
+
+  if ($self->{bayes}->{main}->{learn_to_journal}) {
+    $self->defer_update ("n $ds $dh");
+  } else {
+    $self->tok_sync_nspam_nham ($ds, $dh);
+  }
+}
+
+sub tok_touch {
+  my ($self, $tok, $atime) = @_;
+  $self->defer_update ("t $atime $tok");
+}
+
+sub defer_update {
+  my ($self, $str) = @_;
+  $self->{string_to_journal} .= "$str\n";
+}
+
+###########################################################################
+
+sub cleanup {
+  my ($self) = @_;
+
+  my $nbytes = length ($self->{string_to_journal});
+  return if ($nbytes == 0);
+
+  my $path = $self->_get_journal_filename();
+
+  # use append mode, write atomically, then close, so simultaneous updates are
+  # not lost
+  my $conf = $self->{bayes}->{main}->{conf};
+  my $umask = umask(0777 - (oct ($conf->{bayes_file_mode}) & 0666));
+  if (!open (OUT, ">>".$path)) {
+    warn "cannot write to $path, Bayes db update ignored\n";
+    umask $umask; # reset umask
+    return;
+  }
+
+  # do not use print() here, it will break up the buffer if it's >8192 bytes,
+  # which could result in two sets of tokens getting mixed up and their
+  # touches missed.
+  my $writ = 0;
+  while ($writ < $nbytes) {
+    my $len = syswrite (OUT, $self->{string_to_journal}, $nbytes-$writ);
+
+    if (!defined $len || $len < 0) {
+      # argh, write failure, give up
+      $len = 0 unless ( defined $len );
+      warn "write failed to Bayes journal $path ($len of $nbytes)!\n";
+      last;
+    }
+
+    $writ += $len;
+    if ($len < $nbytes) {
+      # this should not happen on filesystem writes!  Still, try to recover
+      # anyway, but be noisy about it so the admin knows
+      warn "partial write to Bayes journal $path ($len of $nbytes), recovering.\n";
+      $self->{string_to_journal} = substr ($self->{string_to_journal}, $len);
+    }
+  }
+
+  if (!close OUT) {
+    warn "cannot write to $path, Bayes db update ignored\n";
+  }
+  umask $umask; # reset umask
+
+  $self->{string_to_journal} = '';
+}
+
+# Return a qr'd RE to match a token with the correct format's magic token
+sub get_magic_re {
+  my ($self, $db_ver) = @_;
+
+  $db_ver = $self->DB_VERSION if (!$db_ver); # XXX - not sure how good of a thing this is
+
+  if ( $db_ver >= 1 ) {
+    return qr/^\015\001\007\011\003/;
+  }
+
+  # When in doubt, assume v0
+  return qr/^\*\*[A-Z]+$/;
+}
+
+sub is_magic_token {
+  my ($self, $token) = @_;
+
+  my $magic_re = $self->get_magic_re($self->{db_version});
+
+  return ($token =~ /$magic_re/);
+}
+
+# provide a more generalized public insterface into the journal sync
+
+sub sync {
+  my ($self, $opts) = @_;
+
+  return $self->_sync_journal($opts);
+}
+
+###########################################################################
+# And this method reads the journal and applies the changes in one
+# (locked) transaction.
+
+sub _sync_journal {
+  my ($self, $opts) = @_;
+  my $ret = 0;
+
+  my $path = $self->_get_journal_filename();
+
+  # if $path doesn't exist, or it's not a file, or is 0 bytes in length, return
+  if ( !stat($path) || !-f _ || -z _ ) { return 0; }
+
+  eval {
+    local $SIG{'__DIE__'};	# do not run user die() traps in here
+    if ($self->tie_db_writable()) {
+      $ret = $self->_sync_journal_trapped($opts, $path);
+    }
+  };
+  my $err = $@;
+
+  # ok, untie from write-mode if we can
+  if (!$self->{bayes}->{main}->{learn_caller_will_untie}) {
+    $self->untie_db();
+  }
+
+  # handle any errors that may have occurred
+  if ($err) {
+    warn "bayes: $err\n";
+    return 0;
+  }
+
+  $ret;
+}
+
+sub _sync_journal_trapped {
+  my ($self, $opts, $path) = @_;
+
+  # Flag that we're doing work
+  $self->set_running_expire_tok();
+
+  my $started = time();
+  my $count = 0;
+  my $total_count = 0;
+  my %tokens = ();
+  my $showdots = $opts->{showdots};
+  my $retirepath = $path.".old";
+
+  # if $path doesn't exist, or it's not a file, or is 0 bytes in length, return
+  # we have to check again since the file may have been removed by a recent bayes db upgrade ...
+  if ( !stat($path) || !-f _ || -z _ ) { return 0; }
+
+  if (!-r $path) { # will we be able to read the file?
+    warn "bayes: bad permissions on journal, can't read: $path\n";
+    return 0;
+  }
+
+  # This is the critical phase (moving files around), so don't allow
+  # it to be interrupted.
+  {
+    local $SIG{'INT'} = 'IGNORE';
+    local $SIG{'HUP'} = 'IGNORE';
+    local $SIG{'TERM'} = 'IGNORE';
+
+    # retire the journal, so we can update the db files from it in peace.
+    # TODO: use locking here
+    if (!rename ($path, $retirepath)) {
+      warn "bayes: failed rename $path to $retirepath\n";
+      return 0;
+    }
+
+    # now read the retired journal
+    if (!open (JOURNAL, "<$retirepath")) {
+      warn "bayes: cannot open read $retirepath\n";
+      return 0;
+    }
+
+
+    # Read the journal
+    while (<JOURNAL>) {
+      $total_count++;
+
+      if (/^t (\d+) (.*)$/) { # Token timestamp update, cache resultant entries
+	$tokens{$2} = $1+0 if ( !exists $tokens{$2} || $1+0 > $tokens{$2} );
+      } elsif (/^c (-?\d+) (-?\d+) (\d+) (.*)$/) { # Add/full token update
+	$self->tok_sync_counters ($1+0, $2+0, $3+0, $4);
+	$count++;
+      } elsif (/^n (-?\d+) (-?\d+)$/) { # update ham/spam count
+	$self->tok_sync_nspam_nham ($1+0, $2+0);
+	$count++;
+      } elsif (/^m ([hsf]) (.+)$/) { # update msgid seen database
+	if ( $1 eq "f" ) {
+	  $self->seen_delete($2);
+	}
+	else {
+	  $self->seen_put($2,$1);
+	}
+	$count++;
+      } else {
+	warn "Bayes journal: gibberish entry found: $_";
+      }
+    }
+    close JOURNAL;
+
+    # Now that we've determined what tokens we need to update and their
+    # final values, update the DB.  Should be much smaller than the full
+    # journal entries.
+    while( my($k,$v) = each %tokens ) {
+      $self->tok_touch_token ($v, $k);
+
+      if ((++$count % 1000) == 0) {
+	if ($showdots) { print STDERR "."; }
+	$self->set_running_expire_tok();
+      }
+    }
+
+    if ($showdots) { print STDERR "\n"; }
+
+    # we're all done, so unlink the old journal file
+    unlink ($retirepath) || warn "bayes: can't unlink $retirepath: $!\n";
+
+    $self->{db_toks}->{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $started;
+
+    my $done = time();
+    my $msg = ("synced Bayes databases from journal in ".($done - $started).
+	  " seconds: $count unique entries ($total_count total entries)");
+
+    if ($opts->{verbose}) {
+      print $msg,"\n";
+    } else {
+      dbg ($msg);
+    }
+  }
+
+  # else, that's the lot, we're synced.  return
+  1;
+}
+
+sub tok_touch_token {
+  my ($self, $atime, $tok) = @_;
+  my ($ts, $th, $oldatime) = $self->tok_get ($tok);
+
+  # If the new atime is < the old atime, ignore the update
+  # We figure that we'll never want to lower a token atime, so abort if
+  # we try.  (journal out of sync, etc.)
+  return if ( $oldatime >= $atime );
+
+  $self->tok_put ($tok, $ts, $th, $atime);
+}
+
+sub tok_sync_counters {
+  my ($self, $ds, $dh, $atime, $tok) = @_;
+  my ($ts, $th, $oldatime) = $self->tok_get ($tok);
+  $ts += $ds; if ($ts < 0) { $ts = 0; }
+  $th += $dh; if ($th < 0) { $th = 0; }
+
+  # Don't roll the atime of tokens backwards ...
+  $atime = $oldatime if ( $oldatime > $atime );
+
+  $self->tok_put ($tok, $ts, $th, $atime);
+}
+
+sub tok_put {
+  my ($self, $tok, $ts, $th, $atime) = @_;
+  $ts ||= 0;
+  $th ||= 0;
+
+  if ( $tok =~ /^\015\001\007\011\003/ ) { # magic token?  Ignore it!
+    return;
+  }
+
+  # use defined() rather than exists(); the latter is not supported
+  # by NDBM_File, believe it or not.  Using defined() did not
+  # indicate any noticeable speed hit in my testing. (Mar 31 2003 jm)
+  my $exists_already = defined $self->{db_toks}->{$tok};
+
+  if ($ts == 0 && $th == 0) {
+    return if (!$exists_already); # If the token doesn't exist, just return
+    $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN}--;
+    delete $self->{db_toks}->{$tok};
+  } else {
+    if (!$exists_already) { # If the token doesn't exist, raise the token count
+      $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN}++;
+    }
+
+    $self->{db_toks}->{$tok} = $self->tok_pack ($ts, $th, $atime);
+
+    my $newmagic = $self->{db_toks}->{$NEWEST_TOKEN_AGE_MAGIC_TOKEN};
+    if (!defined ($newmagic) || $atime > $newmagic) {
+      $self->{db_toks}->{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $atime;
+    }
+
+    # Make sure to check for either !defined or "" ...  Apparently
+    # sometimes the DB module doesn't return the value correctly. :(
+    my $oldmagic = $self->{db_toks}->{$OLDEST_TOKEN_AGE_MAGIC_TOKEN};
+    if (!defined ($oldmagic) || $oldmagic eq "" || $atime < $oldmagic) {
+      $self->{db_toks}->{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $atime;
+    }
+  }
+}
+
+sub tok_sync_nspam_nham {
+  my ($self, $ds, $dh) = @_;
+  my ($ns, $nh) = ($self->get_storage_variables())[1,2];
+  if ($ds) { $ns += $ds; } if ($ns < 0) { $ns = 0; }
+  if ($dh) { $nh += $dh; } if ($nh < 0) { $nh = 0; }
+  $self->{db_toks}->{$NSPAM_MAGIC_TOKEN} = $ns;
+  $self->{db_toks}->{$NHAM_MAGIC_TOKEN} = $nh;
+}
+
+###########################################################################
+
+sub _get_journal_filename {
+  my ($self) = @_;
+
+  if (defined $self->{journal_live_path}) {
+    return $self->{journal_live_path};
+  }
+
+  my $main = $self->{bayes}->{main};
+  my $fname = $main->sed_path ($main->{conf}->{bayes_path}."_journal");
+
+  $self->{journal_live_path} = $fname;
+  return $self->{journal_live_path};
+}
+
+###########################################################################
+
+sub scan_count_get {
+  my ($self) = @_;
+
+  if ( $self->{db_version} < 2 ) {
+    my ($count) = $self->get_storage_variables();
+    my $path = $self->{scan_count_little_file};
+    $count += (defined $path && -e $path ? -s _ : 0);
+    return $count;
+  }
+
+  0;
+}
+
+###########################################################################
+
+# this is called directly from sa-learn(1).
+sub perform_upgrade {
+  my ($self, $opts) = @_;
+  my $ret = 0;
+
+  eval {
+    local $SIG{'__DIE__'};	# do not run user die() traps in here
+
+    use File::Basename;
+    use File::Copy;
+
+    # bayes directory
+    my $main = $self->{bayes}->{main};
+    my $path = $main->sed_path($main->{conf}->{bayes_path});
+    my $dir = dirname($path);
+
+    # make temporary copy since old dbm and new dbm may have same name
+    opendir(DIR, $dir) || die "can't opendir $dir: $!";
+    my @files = grep { /^bayes_(?:seen|toks)(?:\.\w+)?$/ } readdir(DIR);
+    closedir(DIR);
+    if (@files < 2 || !grep(/bayes_seen/,@files) || !grep(/bayes_toks/,@files))
+    {
+      die "unable to find bayes_toks and bayes_seen, stopping\n";
+    }
+    # untaint @files (already safe after grep)
+    @files = map { /(.*)/, $1 } @files;
+ 	 
+    for (@files) {
+      my $src = "$dir/$_";
+      my $dst = "$dir/old_$_";
+      copy($src, $dst) || die "can't copy $src to $dst: $!\n";
+    }
+
+    # delete previous to make way for import
+    for (@files) { unlink("$dir/$_"); }
+
+    # import
+    if ($self->tie_db_writable()) {
+      $ret += $self->upgrade_old_dbm_files_trapped("$dir/old_bayes_seen",
+						   $self->{db_seen});
+      $ret += $self->upgrade_old_dbm_files_trapped("$dir/old_bayes_toks",
+						   $self->{db_toks});
+    }
+
+    if ($ret == 2) {
+      print "import successful, original files saved with \"old\" prefix\n";
+    }
+    else {
+      print "import failed, original files saved with \"old\" prefix\n";
+    }
+  };
+  my $err = $@;
+
+  $self->untie_db();
+
+  # if we died, untie the dbm files
+  if ($err) {
+    warn "bayes perform_upgrade: $err\n";
+    return 0;
+  }
+  $ret;
+}
+
+sub upgrade_old_dbm_files_trapped {
+  my ($self, $filename, $output) = @_;
+
+  my $count;
+  my %in;
+
+  print "upgrading to DB_File, please be patient: $filename\n";
+
+  # try each type of file until we find one with > 0 entries
+  for my $dbm ('DB_File', 'GDBM_File', 'NDBM_File', 'SDBM_File') {
+    $count = 0;
+    # wrap in eval so it doesn't run in general use.  This accesses db
+    # modules directly.
+    # Note: (bug 2390), the 'use' needs to be on the same line as the eval
+    # for RPM dependency checks to work properly.  It's lame, but...
+    eval 'use ' . $dbm . ';
+      tie %in, "' . $dbm . '", $filename, O_RDONLY, 0600;
+      %{ $output } = %in;
+      $count = scalar keys %{ $output };
+      untie %in;
+    ';
+    if ($@) {
+      print "$dbm: $dbm module not installed, nothing copied.\n";
+      dbg("error was: $@");
+    }
+    elsif ($count == 0) {
+      print "$dbm: no database of that kind found, nothing copied.\n";
+    }
+    else {
+      print "$dbm: copied $count entries.\n";
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+###########################################################################
+
+# token marshalling format for db_toks.
+
+# Since we may have many entries with few hits, especially thousands of hapaxes
+# (1-occurrence entries), use a flexible entry format, instead of simply "2
+# packed ints", to keep the memory and disk space usage down.  In my
+# 18k-message test corpus, only 8.9% have >= 8 hits in either counter, so we
+# can use a 1-byte representation for the other 91% of low-hitting entries
+# and save masses of space.
+
+# This looks like: XXSSSHHH (XX = format bits, SSS = 3 spam-count bits, HHH = 3
+# ham-count bits).  If XX in the first byte is 11, it's packed as this 1-byte
+# representation; otherwise, if XX in the first byte is 00, it's packed as
+# "CLL", ie. 1 byte and 2 32-bit "longs" in perl pack format.
+
+# Savings: roughly halves size of toks db, at the cost of a ~10% slowdown.
+
+use constant FORMAT_FLAG	=> 0xc0;	# 11000000
+use constant ONE_BYTE_FORMAT	=> 0xc0;	# 11000000
+use constant TWO_LONGS_FORMAT	=> 0x00;	# 00000000
+
+use constant ONE_BYTE_SSS_BITS	=> 0x38;	# 00111000
+use constant ONE_BYTE_HHH_BITS	=> 0x07;	# 00000111
+
+sub tok_unpack {
+  my ($self, $value) = @_;
+  $value ||= 0;
+
+  my ($packed, $atime);
+  if ( $self->{db_version} == 0 ) {
+    ($packed, $atime) = unpack("CS", $value);
+  }
+  elsif ( $self->{db_version} == 1 || $self->{db_version} == 2 ) {
+    ($packed, $atime) = unpack("CV", $value);
+  }
+
+  if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) {
+    return (($packed & ONE_BYTE_SSS_BITS) >> 3,
+		$packed & ONE_BYTE_HHH_BITS,
+		$atime || 0);
+  }
+  elsif (($packed & FORMAT_FLAG) == TWO_LONGS_FORMAT) {
+    my ($packed, $ts, $th, $atime);
+    if ( $self->{db_version} == 0 ) {
+      ($packed, $ts, $th, $atime) = unpack("CLLS", $value);
+    }
+    elsif ( $self->{db_version} == 1 ) {
+      ($packed, $ts, $th, $atime) = unpack("CVVV", $value);
+    }
+    elsif ( $self->{db_version} == 2 ) {
+      ($packed, $ts, $th, $atime) = unpack("CVVV", $value);
+    }
+    return ($ts || 0, $th || 0, $atime || 0);
+  }
+  # other formats would go here...
+  else {
+    warn "unknown packing format for Bayes db, please re-learn: $packed";
+    return (0, 0, 0);
+  }
+}
+
+sub tok_pack {
+  my ($self, $ts, $th, $atime) = @_;
+  $ts ||= 0; $th ||= 0; $atime ||= 0;
+  if ($ts < 8 && $th < 8) {
+    return pack ("CV", ONE_BYTE_FORMAT | ($ts << 3) | $th, $atime);
+  } else {
+    return pack ("CVVV", TWO_LONGS_FORMAT, $ts, $th, $atime);
+  }
+}
+
+###########################################################################
+
+sub dbg { Mail::SpamAssassin::dbg (@_); }
+sub sa_die { Mail::SpamAssassin::sa_die (@_); }
+
+1;

Added: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStoreSQL.pm
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStoreSQL.pm	Thu Jan 29 18:54:33 2004
@@ -0,0 +1,1505 @@
+# <@LICENSE>
+# ====================================================================
+# The Apache Software License, Version 1.1
+# 
+# Copyright (c) 2000 The Apache Software Foundation.  All rights
+# reserved.
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in
+#    the documentation and/or other materials provided with the
+#    distribution.
+# 
+# 3. The end-user documentation included with the redistribution,
+#    if any, must include the following acknowledgment:
+#       "This product includes software developed by the
+#        Apache Software Foundation (http://www.apache.org/)."
+#    Alternately, this acknowledgment may appear in the software itself,
+#    if and wherever such third-party acknowledgments normally appear.
+# 
+# 4. The names "Apache" and "Apache Software Foundation" must
+#    not be used to endorse or promote products derived from this
+#    software without prior written permission. For written
+#    permission, please contact apache@apache.org.
+# 
+# 5. Products derived from this software may not be called "Apache",
+#    nor may "Apache" appear in their name, without prior written
+#    permission of the Apache Software Foundation.
+# 
+# THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+# ====================================================================
+# 
+# This software consists of voluntary contributions made by many
+# individuals on behalf of the Apache Software Foundation.  For more
+# information on the Apache Software Foundation, please see
+# <http://www.apache.org/>.
+# 
+# Portions of this software are based upon public domain software
+# originally written at the National Center for Supercomputing Applications,
+# University of Illinois, Urbana-Champaign.
+# </...@LICENSE>
+
+=head1 NAME
+
+Mail::SpamAssassin::BayesStoreSQL - SQL Bayesian Storage Module Implementation
+
+=head1 SYNOPSIS
+
+=head1 DESCRIPTION
+
+This module implementes a SQL based bayesian storage module.
+
+=cut
+
+package Mail::SpamAssassin::BayesStoreSQL;
+
+use strict;
+use bytes;
+
+use DBI;
+
+use Mail::SpamAssassin::BayesStore;
+
+use vars qw( @ISA );
+
+@ISA = qw( Mail::SpamAssassin::BayesStore );
+
+=head1 METHODS
+
+=head2 new
+
+public class (Mail::SpamAssassin::BayesStoreSQL) new (Mail::Spamassassin::Bayes $bayes)
+
+Description:
+This methods creates a new instance of the Mail::SpamAssassin::BayesStoreSQL
+object.  It expects to be passed an instance of the Mail::SpamAssassin:Bayes
+object which is passed into the Mail::SpamAssassin::BayesStore parent object.
+
+This method sets up the database connection and determines the username to
+use in queries.
+
+=cut
+
+sub new {
+  my $class = shift;
+  $class = ref($class) || $class;
+
+  my $self = $class->SUPER::new(@_);
+
+  $self->{supported_db_version} = 2;
+
+  if (!$self->{bayes}->{conf}->{bayes_sql_dsn}) {
+    dbg("bayes: invalid config, must set bayes_sql_dsn config variable.\n");
+    return undef;
+  }
+
+  my $dsn = $self->{bayes}->{conf}->{bayes_sql_dsn};
+  my $dbuser = $self->{bayes}->{conf}->{bayes_sql_username};
+  my $dbpass = $self->{bayes}->{conf}->{bayes_sql_password};
+
+  my $dbh = DBI->connect($dsn, $dbuser, $dbpass, {'PrintError' => 1});
+
+  if (!$dbh) {
+    dbg("bayes: Unable to connect to database: ".DBI->errstr());
+
+    ## TODO!  This is not appropriate -- $bayes->{store} must alwasy
+    ## be a valid object.  returning undef from a constructor is bad
+    ## news.
+    return undef;
+  }
+
+  $self->{_dbh} = $dbh;
+
+  dbg("bayes: Database connection established");
+
+  if ($self->{bayes}->{conf}->{bayes_sql_override_username}) {
+    $self->{_username} = $self->{bayes}->{conf}->{bayes_sql_override_username};
+  }
+  else {
+    $self->{_username} = $self->{bayes}->{main}->{username};
+
+    # Need to make sure that a username is set, so just in case there is
+    # no username set in main, set one here.
+    unless ($self->{_username}) {
+      $self->{_username} = "GLOBALBAYES";
+    }
+  }
+  dbg("bayes: Using username: ".$self->{_username});
+  return $self;
+}
+
+=head2 tie_db_readonly
+
+public instance (Boolean) tie_db_readonly ();
+
+Description:
+This method ensures that the database connection is properly setup
+and working.  If necessary it will initialize a user's bayes variables
+so that they can begin using the database immediately.
+
+=cut
+
+sub tie_db_readonly {
+  my ($self) = @_;
+
+  my $ret = $self->tie_db_writable();
+
+  return $ret;
+}
+
+=head2 tie_db_writable
+
+public instance (Boolean) tie_db_writable ()
+
+Description:
+This method ensures that the database connetion is properly setup
+and working. If necessary it will initialize a users bayes variables
+so that they can begin using the database immediately.
+
+=cut
+
+sub tie_db_writable {
+  my ($self) = @_;
+  my $main = $self->{bayes}->{main};
+
+  $self->read_db_configs();
+
+  # If the DB version is one we don't understand, abort!
+  my $db_ver = $self->_get_db_version();
+  $self->{db_version} = $db_ver;
+  dbg("bayes: found bayes db version ".$self->{db_version});
+
+  if ( $db_ver != $self->DB_VERSION ) {
+    dbg("bayes: Database version $db_ver is different than we understand (".$self->DB_VERSION."), aborting!");
+    $self->untie_db();
+    return 0;
+  }
+
+  unless ($self->_initialize_db()) {
+    dbg("bayes: unable to initialize database for ".$self->{_username}." user, aborting!");
+    $self->untie_db();
+    return 0;
+  }
+
+  return 1;
+}
+
+
+=head2 untie_db
+
+public instance () untie_db ()
+
+Description:
+This method is unused for the SQL based implementation.
+
+=cut
+
+sub untie_db {
+  my ($self) = @_;
+  # not used for SQL based implementation
+}
+
+=head2 calculate_expire_delta
+
+public instance (\%) calculate_expire_delta (Integer $newest_atime,
+                                             Integer $start,
+                                             Integer $max_expire_mult)
+
+Description:
+This method performs a calculation on the data to determine the optimum
+atime for token expiration.
+
+=cut
+
+sub calculate_expire_delta {
+  my ($self, $newest_atime, $start, $max_expire_mult) = @_;
+
+  my %delta = (); # use a hash since an array is going to be very sparse
+  
+  my $sql = "SELECT count(*)
+               FROM bayes_token
+              WHERE username = ?
+                AND (? - atime) > ?";
+
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+    
+  for (my $i = 1; $i <= $max_expire_mult; $i<<=1) {
+    my $rc = $sth->execute($self->{_username}, $newest_atime, $start * $i);
+
+    unless ($rc) {
+      dbg("bayes: calculate_expire_delta: SQL Error: ".$self->{_dbh}->errstr());
+      return undef;
+    }
+
+    my ($count) = $sth->fetchrow_array();
+
+    $delta{$i} = $count;
+  }
+  $sth->finish();
+
+  return %delta;
+}
+
+=head2 token_expiration
+
+public instance (Integer, Integer,
+                 Integer, Integer) token_expiration(\% $opts,
+                                                    Integer $newdelta,
+                                                    @ @vars)
+
+Description:
+This method performs the database specific expiration of tokens based on
+the passed in C<$newdelta> and C<@vars>.
+
+=cut
+
+sub token_expiration {
+  my ($self, $opts, $newdelta, @vars) = @_;
+
+  my $num_hapaxes;
+  my $num_lowfreq;
+
+  # Figure out how old is too old...
+  my $too_old = $vars[10] - $newdelta; # tooold = newest - delta
+
+  my $sql = "DELETE from bayes_token WHERE username = ? and atime < ?";
+
+  my $rows = $self->{_dbh}->do($sql, undef, $self->{_username}, $too_old);
+
+  if (!defined($rows)) {
+    dbg("bayes: actual_expire: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my $deleted = $rows;
+
+  # We've chosen a new atime delta if we've gotten here, so record it for posterity.
+  $self->_set_last_atime_delta($newdelta);
+
+  # The rest of these have been modified, so replace as necessary.
+  $self->set_last_expire(time());
+  $self->_set_last_expire_reduce($deleted);
+
+  # Call untie_db() first so we unlock correctly etc. first
+  $self->untie_db();
+
+  my $kept = $self->_get_token_count();
+
+  $num_hapaxes = $self->_get_num_hapaxes() if ($opts->{verbose});
+  $num_lowfreq = $self->_get_num_lowfreq() if ($opts->{verbose});
+
+  return ($kept, $deleted, $num_hapaxes, $num_lowfreq);
+}
+
+=head2 sync_due
+
+public instance (Boolean) sync_due ()
+
+Description:
+This method determines if a database sync is currently required.
+
+Unused for SQL based implementation.
+
+=cut
+
+sub sync_due {
+  my ($self) = @_;
+
+  return 0;
+}
+
+=head2 seen_get
+
+public instance (String) seen_get (string $msgid)
+
+Description:
+This method retrieves the stored value, if any, for C<$msgid>.  The return value
+is the stored string ('s' for spam and 'h' for ham) or undef if C<$msgid> is not
+found.
+
+=cut
+
+sub seen_get {
+  my ($self, $msgid) = @_;
+ 
+  my $sql = "SELECT flag FROM bayes_seen WHERE username = ? AND msgid = ?";
+
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: seen_get: SQL Error: ".$self->{_dbh}->errstr());
+    return undef;
+  }
+
+  my $rc = $sth->execute($self->{_username}, $msgid);
+  
+  unless ($rc) {
+    dbg("bayes: seen_get: SQL Error: ".$self->{_dbh}->errstr());
+    return undef;
+  }
+
+  my ($flag) = $sth->fetchrow_array();
+
+  $sth->finish();
+  
+  return $flag;
+}
+
+=head2 seen_put
+
+public (Boolean) seen_put (string $msgid, char $flag)
+
+Description:
+This method records C<$msgid> as the type given by C<$flag>.  C<$flag> is one of
+two values 's' for spam and 'h' for ham.
+
+=cut
+
+sub seen_put {
+  my ($self, $msgid, $flag) = @_;
+
+  return 0 if (!$msgid);
+  return 0 if (!$flag);
+  
+  my $sql = "INSERT INTO bayes_seen (username, msgid, flag) VALUES (?,?,?)";
+  
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+  
+  unless (defined($sth)) {
+      dbg("bayes: seen_put: SQL Error: ".$self->{_dbh}->errstr());
+      return 0;
+  }
+
+  my $rc = $sth->execute($self->{_username}, $msgid, $flag);
+  
+  unless ($rc) {
+      dbg("bayes: seen_put: SQL Error: ".$self->{_dbh}->errstr());
+      return 0;
+  }
+  
+  $sth->finish();
+
+  dbg("bayes: seen ($msgid) put");
+  return 1;
+}
+
+=head2 seen_delete
+
+public instance (Boolean) seen_delete (string $msgid)
+
+Description:
+This method removes C<$msgid> from the database.
+
+=cut
+
+sub seen_delete {
+  my ($self, $msgid) = @_;
+
+  return 0 if (!$msgid);
+
+  my $sql = "DELETE FROM bayes_seen WHERE username = ? AND msgid = ?";
+  
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+      dbg("bayes: seen_delete: SQL Error: ".$self->{_dbh}->errstr());
+      return 0;
+  }
+
+  my $rc = $sth->execute($self->{_username}, $msgid);
+
+  unless ($rc) {
+      dbg("bayes: seen_delete: SQL Error: ".$self->{_dbh}->errstr());
+      return 0;
+  }
+
+  $sth->finish();
+
+  return 1;
+}
+
+=head2 get_storage_variables
+
+public instance (@) get_storage_variables ()
+
+Description:
+This method retrieves the various administrative variables used by
+the Bayes process and database.
+
+The values returned in the array are in the following order:
+
+0: scan count base
+
+1: number of spam
+
+2: number of ham
+
+3: number of tokens in db
+
+4: last expire atime
+
+5: oldest token in db atime
+
+6: db version value
+
+7: last journal sync
+
+8: last atime delta
+
+9: last expire reduction count
+
+10: newest token in db atime
+
+=cut
+
+sub get_storage_variables {
+  my ($self) = @_;
+  my @values;
+
+  my $sql = "SELECT spam_count, ham_count, last_expire,
+                    last_atime_delta, last_expire_reduce
+               FROM bayes_vars
+              WHERE username = ?";
+
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: get_storage_variables: SQL Error: ".$self->{_dbh}->errstr());
+    return (0,0,0,0,0,0,0,0,0,0,0);
+  }
+
+  my $rc = $sth->execute($self->{_username});
+
+  unless ($rc) {
+    dbg("bayes: get_storage_variables: SQL Error: ".$self->{_dbh}->errstr());
+    return (0,0,0,0,0,0,0,0,0,0,0);
+  }
+
+  my ($spam_count, $ham_count, $last_expire,
+      $last_atime_delta, $last_expire_reduce) = $sth->fetchrow_array();
+
+  $sth->finish();
+
+  my $token_count = $self->_get_token_count();
+  my $oldest_token_age = $self->_get_oldest_token_age();
+  my $newest_token_age = $self->_get_newest_token_age();
+  my $db_ver = $self->DB_VERSION;
+
+  @values = (
+             0,
+             $spam_count,
+             $ham_count,
+             $token_count,
+             $last_expire,
+             $oldest_token_age,
+             $db_ver,
+             0, # we do not do journal syncs
+             $last_atime_delta,
+             $last_expire_reduce,
+             $newest_token_age
+             );
+
+  foreach ( @values ) {
+    if ( !$_ || $_ =~ /\D/ ) { $_ = 0; }
+  }
+
+  return @values;
+}
+
+=head2 dump_db_toks
+
+public instance () dump_db_toks (String $template, String $regex, Array @vars)
+
+Description:
+This method loops over all tokens, computing the probability for the token and then
+printing it out according to the passed in token.
+
+=cut
+
+sub dump_db_toks {
+  my ($self, $template, $regex, @vars) = @_;
+
+  # 0/0 tokens don't count
+  # since ordering is check here, order the tokens
+  my $sql = "SELECT token, spam_count, ham_count, atime
+               FROM bayes_token
+              WHERE username = ?
+                AND (spam_count > 0 OR ham_count > 0)
+             ORDER BY token";
+
+  my $sth = $self->{_dbh}->prepare($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: dump_db_toks: SQL Error: ".$self->{_dbh}->errstr());
+    return;
+  }
+
+  my $rc = $sth->execute($self->{_username});
+
+  unless ($rc) {
+    dbg("bayes: dump_db_toks: SQL Error: ".$self->{_dbh}->errstr());
+    return;
+  }  
+
+  while (my ($token, $spam_count, $ham_count, $atime) = $sth->fetchrow_array()) {
+    my $prob = $self->{bayes}->compute_prob_for_token($token, $vars[1], $vars[2],
+						      $spam_count, $ham_count,
+						      $atime);
+    $prob ||= 0.5;
+    
+    printf $template,$prob,$spam_count,$ham_count,$atime,$token;
+  }
+
+  $sth->finish();
+
+  return;
+}
+
+=head2 set_last_expire
+
+public instance (Boolean) set_last_expire (Integer $time)
+
+Description:
+This method sets the last expire time.
+
+=cut
+
+sub set_last_expire {
+  my ($self, $time) = @_;
+
+  return 0 unless (defined($time));
+
+  my $sql = "UPDATE bayes_vars SET last_expire = ? WHERE username = ?";
+ 
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: set_last_expire: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my $rc = $sth->execute($time, $self->{_username});
+
+  unless ($rc) {
+    dbg("bayes: set_last_expire: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  $sth->finish();
+
+  return 1;
+}
+
+=head2 get_running_expire_tok
+
+public instance (String $time) get_running_expire_tok ()
+
+Description:
+This method determines if an expire is currently running and returns
+the last time set.
+
+There can be multiple times, so we just pull the greatest (most recent)
+value.
+
+=cut
+
+sub get_running_expire_tok {
+  my ($self) = @_;
+
+  my $sql = "SELECT max(runtime) from bayes_expire WHERE username = ?";
+
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: get_running_expire_tok: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my $rc = $sth->execute($self->{_username});
+
+  unless ($rc) {
+    dbg("bayes: get_running_expire_tok: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my ($runtime) = $sth->fetchrow_array();
+
+  $sth->finish();
+
+  return $runtime;
+}
+
+=head2 set_running_expire_tok
+
+public instance (String $time) set_running_expire_tok ()
+
+Description:
+This method sets the time that an expire starts running.
+
+=cut
+
+sub set_running_expire_tok {
+  my ($self) = @_;
+
+  my $sql = "INSERT INTO bayes_expire (username,runtime) VALUES (?,?)";
+
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  my $time = time();
+
+  my $rc = $sth->execute($self->{_username}, $time);
+
+  unless ($rc) {
+      dbg("bayes: set_running_expire_tok: SQL Error: ".$self->{_dbh}->errstr());
+      return undef;
+  }
+  $sth->finish();
+  return $time;
+}
+
+=head2 remove_running_expire_tok
+
+public instance (Boolean) remove_running_expire_tok ()
+
+Description:
+This method removes the row in the database that indicates that
+and expire is currently running.
+
+=cut
+
+sub remove_running_expire_tok {
+  my ($self) = @_;
+
+  my $sql = "DELETE from bayes_expire WHERE username = ?";
+
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: remove_running_expire_tok: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my $rows = $self->{_dbh}->do($sql, undef, $self->{_username});
+
+  if (!defined($rows)) {
+    dbg("bayes: remove_running_expire_tok: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  return 1;
+}
+=head2 tok_get
+
+public instance (Integer, Integer, Integer) tok_get (String $token)
+
+Description:
+This method retrieves a specificed token (C<$token>) from the database
+and returns it's spam_count, ham_count and last access time.
+
+=cut
+
+sub tok_get {
+  my ($self, $token) = @_;
+
+  my $sql = "SELECT spam_count, ham_count, atime
+               FROM bayes_token
+              WHERE username = ?
+                AND token = ?";
+
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: tok_get: SQL Error: ".$self->{_dbh}->errstr());
+    return (0,0,0);
+  }
+
+  my $rc = $sth->execute($self->{_username}, $token);
+
+  unless ($rc) {
+    dbg("bayes: tok_get: SQL Error: ".$self->{_dbh}->errstr());
+    return (0,0,0);
+  }
+
+  my ($spam_count, $ham_count, $atime) = $sth->fetchrow_array();
+
+  $sth->finish();
+
+  $spam_count = 0 if (!$spam_count || $spam_count < 0);
+  $ham_count = 0 if (!$ham_count || $ham_count < 0);
+  $atime = 0 if (!$atime);
+
+  return ($spam_count, $ham_count, $atime)
+}
+
+=head2 tok_count_change
+
+public instance (Boolean) tok_count_change (Integer $spam_count,
+					    Integer $ham_count,
+					    String $token,
+					    String $atime)
+
+Description:
+This method takes a C<$spam_count> and C<$ham_count> and adds it to
+C<$tok> along with updating C<$tok>s atime with C<$atime>.
+
+=cut
+
+sub tok_count_change {
+  my ($self, $spam_count, $ham_count, $token, $atime) = @_;
+
+  $atime = 0 unless defined $atime;
+
+  $self->_put_token ($token, $spam_count, $ham_count, $atime);
+}
+
+=head2 nspam_nham_get
+
+public instance ($spam_count, $ham_count) nspam_nham_get ()
+
+Description:
+This method retrieves the total number of spam and the total number of
+ham learned.
+
+=cut
+ 
+sub nspam_nham_get {
+  my ($self) = @_;
+
+  my $sql = "SELECT ham_count, spam_count FROM bayes_vars WHERE username = ?";
+
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: nspam_nham_get: SQL Error: ".$self->{_dbh}->errstr());
+    return (0,0);
+  }
+
+  my $rc = $sth->execute($self->{_username});
+
+  unless ($rc) {
+    dbg("bayes: nspam_nham_get: SQL Error: ".$self->{_dbh}->errstr());
+    return (0,0);
+  }
+
+  my ($ham_count, $spam_count) = $sth->fetchrow_array();
+
+  $sth->finish();
+  
+  return ($spam_count || 0, $ham_count || 0);
+}
+
+=head2 nspam_nham_change
+
+public instance (Boolean) nspam_nham_change (Integer $num_spam,
+                                             Integer $num_ham)
+
+Description:
+This method updates the number of spam and the number of ham in the database.
+
+=cut
+
+sub nspam_nham_change {
+  my ($self, $num_spam, $num_ham) = @_;
+
+  my $sql = "UPDATE bayes_vars
+                SET spam_count = spam_count + ?,
+                    ham_count = ham_count + ?
+              WHERE username = ?";
+
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: nspam_nham_change: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my $rc = $sth->execute($num_spam, $num_ham, $self->{_username});
+
+  unless ($rc) {
+    dbg("bayes: nspam_nham_change: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  $sth->finish();
+
+  return 1;
+}
+
+=head2 tok_touch
+
+public instance (Boolean) tok_touch (String $token,
+                                     String $atime)
+
+Description:
+This method updates the given tokens (C<$token>) atime.
+
+The assumption is that the token already exists in the database.
+
+=cut
+
+sub tok_touch {
+  my ($self, $token, $atime) = @_;
+
+  # shortcut, will only update atime for the token if the atime is less than
+  # what we are updating to
+  my $sql = "UPDATE bayes_token
+                SET atime = ?
+              WHERE username = ?
+                AND token = ?
+                AND atime < ?";
+
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: tok_touch: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my $rc = $sth->execute($atime, $self->{_username}, $token, $atime);
+
+  unless ($rc) {
+    dbg("bayes: tok_touch: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  $sth->finish();
+
+  return 1;
+}
+
+=head2 cleanup
+
+public instance (Boolean) cleanup ()
+
+Description:
+This method peroms any cleanup necessary before moving onto the next
+operation.
+
+=cut
+
+sub cleanup {
+  my ($self) = @_;
+
+  # Not used for this implementation
+	       
+  return 1;
+}
+
+=head2 is_magic_token
+
+public instance (Boolean) is_magic_token (string $token)
+
+Description:
+This method determines if a given token is "magic" or special to the
+implementation.
+
+=cut
+
+sub is_magic_token {
+  my ($self, $token) = @_;
+
+  return 0; # nothing is magic
+}
+
+=head2 sync
+
+public instance (Boolean) sync (\% $opts)
+
+Description:
+This method performs a sync of the database
+
+=cut
+
+sub sync {
+  my ($self, $opts) = @_;
+
+  # Not used for this implementation
+
+  return 1;
+}
+
+=head2 scan_count_get
+
+public instance (Integer) scan_count_get ()
+
+Description:
+Return the current scan count.
+
+Unused for SQL implementation.
+
+=cut
+
+sub scan_count_get {
+  my ($self) = @_;
+
+  return 0;
+}
+
+=head2 perform_upgrade
+
+public instance (Boolean) perform_upgrade (\% $opts);
+
+Description:
+Performs an upgrade of the database from one version to another, not
+currently used in this implementation.
+
+=cut
+
+sub perform_upgrade {
+  my ($self) = @_;
+
+  return 1;
+}
+
+=head1 Private Methods
+
+=head2 _get_db_version
+
+private instance (Integer) _get_db_version ()
+
+Description:
+Gets the current version of the database from the special global vars
+tables.
+
+=cut
+
+sub _get_db_version {
+  my ($self) = @_;
+
+  my $sql = "SELECT value FROM bayes_global_vars WHERE variable = 'VERSION'";
+
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: _get_db_version: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my $rc = $sth->execute();
+
+  unless ($rc) {
+    dbg("bayes: _get_db_version: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my ($version) = $sth->fetchrow_array();
+
+  $sth->finish();
+
+  return $version;
+}
+ 
+=head2 _initialize_db
+
+private instance (Boolean) _initialize_db ()
+
+Description:
+This method will check to see if a user has had their bayes variables
+initialized. If not then it will perform this initialization.
+
+=cut
+
+sub _initialize_db {
+  my ($self) = @_;
+
+  return 0 if (!$self->{_username});
+
+  my $sql = "SELECT count(*) FROM bayes_vars WHERE username = ?";
+
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: _initialize_db: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my $rc = $sth->execute($self->{_username});
+
+  unless ($rc) {
+    dbg("bayes: _initialize_db: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my ($count) = $sth->fetchrow_array();
+
+  $sth->finish();
+
+  if ($count) {
+    return 1;
+  }
+
+  # For now let the database setup the other variables as defaults
+  $sql = "INSERT INTO bayes_vars (username) VALUES (?)";
+
+  $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: _initialize_db: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  $rc = $sth->execute($self->{_username});
+
+  unless ($rc) {
+    dbg("bayes: _initialize_db: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  $sth->finish();
+
+  return 1;
+}
+
+=head2 _token_atime
+
+private instance (Integer) _token_atime (String $token)
+
+Description:
+This method returns a given tokens atime, it also serves to tell us
+if the token exists or not since the atime will be undefined if it
+does not exist.
+
+=cut
+
+sub _token_atime {
+  my ($self, $token) = @_;
+
+  return undef unless (defined($token));
+
+  my $sql = "SELECT atime
+               FROM bayes_token
+              WHERE username = ?
+                AND token = ?";
+
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: _token_atime: SQL Error: ".$self->{_dbh}->errstr());
+    return undef;
+  }
+
+  my $rc = $sth->execute($self->{_username}, $token);
+
+  unless ($rc) {
+    dbg("bayes: _token_atime: SQL Error: ".$self->{_dbh}->errstr());
+    return undef;
+  }
+
+  my ($token_atime) = $sth->fetchrow_array();
+
+  $sth->finish();
+
+  return $token_atime;
+}
+
+=head2 _delete_token
+
+private instance (Boolean) _delete_token (String $token)
+
+Description:
+This method deletes the given token from the database.
+
+=cut
+
+sub _delete_token {
+  my ($self, $token) = @_;
+
+  return 0 unless (defined($token));
+
+  my $sql = "DELETE FROM bayes_token WHERE username = ? AND token = ?";
+
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: _delete_token: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my $rc = $sth->execute($self->{_username}, $token);
+
+  unless ($rc) {
+    dbg("bayes: _delete_token: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  $sth->finish();
+
+  return 1;
+}
+
+=head2 _put_token
+
+private instance (Boolean) _put_token (string $token,
+                                       integer $spam_count,
+                                       integer $ham_count,
+				       string $atime)
+
+Description:
+This method performs the work of either inserting or updating a token in
+the database.
+
+=cut
+
+sub _put_token {
+  my ($self, $token, $spam_count, $ham_count, $atime) = @_;
+
+  $spam_count ||= 0;
+  $ham_count ||= 0;
+
+  my $existing_atime = $self->_token_atime($token);
+
+  my $sql;
+
+  if ($spam_count == 0 && $ham_count == 0) {
+    return 1;
+  }
+
+  if (!defined($existing_atime)) {
+
+    # You can't create a new entry for a token with a negative count, so just return
+    # if we are unable to find an entry.
+    return 1 if ($spam_count < 0 || $ham_count < 0);
+
+    $sql = "INSERT INTO bayes_token
+             (username, token, spam_count, ham_count, atime)
+            VALUES (?,?,?,?,?)";
+
+    my $sth = $self->{_dbh}->prepare_cached($sql);
+
+    unless (defined($sth)) {
+      dbg("bayes: _put_token: SQL Error: ".$self->{_dbh}->errstr());
+      return 0;
+    }
+
+    my $rc = $sth->execute($self->{_username},
+			   $token,
+			   $spam_count,
+			   $ham_count,
+			   $atime);
+    
+    unless ($rc) {
+      dbg("bayes: _put_token: SQL Error: ".$self->{_dbh}->errstr());
+      return 0;
+    }
+
+    $sth->finish();
+    dbg("bayes: new token ($token) inserted");
+  }
+  else {
+    my $sql = "UPDATE bayes_token
+                  SET spam_count = spam_count + ?,
+                      ham_count = ham_count + ?,
+                      atime = ?
+                WHERE username = ?
+                  AND token = ?";
+
+    # If the existing atime is already greater then keep it.
+    # XXX - A future enhancement might be to just omit the update
+    #       of atime in this case, but that would give us one extra
+    #       SQL statement to cache, so I'm not sure if the trade off
+    #       is worth it.
+    $atime = $existing_atime if ($existing_atime > $atime);
+
+    my $sth = $self->{_dbh}->prepare_cached($sql);
+
+    unless (defined($sth)) {
+      dbg("bayes: _put_token: SQL Error: ".$self->{_dbh}->errstr());
+      return 0;
+    }
+
+    my $rc = $sth->execute($spam_count, $ham_count, $atime,
+			   $self->{_username}, $token);
+    
+    unless ($rc) {
+      dbg("bayes: _put_token: SQL Error: ".$self->{_dbh}->errstr());
+      return 0;
+    }
+    
+    $sth->finish();
+    dbg("bayes: token ($token) updated");
+  }
+  return 1;
+}
+
+=head2 _get_token_count
+
+private instance (Integer) _get_token_count ()
+
+Description:
+This method returns the total number of tokens present in the token database
+for a user.
+
+=cut
+
+sub _get_token_count {
+  my ($self) = @_;
+
+  my $sql = "SELECT count(*)
+               FROM bayes_token
+              WHERE username = ?
+                AND (spam_count > 0 OR ham_count > 0)";
+
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: _get_token_count: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my $rc = $sth->execute($self->{_username});
+
+  unless (defined($sth)) {
+    dbg("bayes: _get_token_count: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my ($token_count) = $sth->fetchrow_array();
+
+  $sth->finish();
+
+  return $token_count
+}
+
+=head2 _get_oldest_token_age
+
+private instance (Integer) _get_oldest_token_age ()
+
+Description:
+This method finds the atime of the oldest token in the database.
+
+=cut
+
+sub _get_oldest_token_age {
+  my ($self) = @_;
+
+  my $sql = "SELECT min(atime) FROM bayes_token WHERE username = ?";
+
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: _get_oldest_token_age: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my $rc = $sth->execute($self->{_username});
+
+  unless ($rc) {
+    dbg("bayes: _get_oldest_token_age: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my ($atime) = $sth->fetchrow_array();
+
+  $sth->finish();
+
+  return $atime;
+}
+
+=head2 _get_newest_token_age
+
+private instance (Integer) _get_newest_token_age ()
+
+Description:
+This method finds the atime of the newest token in the database.
+
+=cut
+
+sub _get_newest_token_age {
+  my ($self) = @_;
+
+  my $sql = "SELECT max(atime) FROM bayes_token WHERE username = ?";
+
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: _get_newest_token_age: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my $rc = $sth->execute($self->{_username});
+
+  unless ($rc) {
+    dbg("bayes: _get_newest_token_age: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my ($atime) = $sth->fetchrow_array();
+
+  $sth->finish();
+
+  return $atime;
+}
+
+=head2 _set_last_atime_delta
+
+private instance (Boolean) _set_last_atime_delta (Integer $newdelta)
+
+Description:
+This method sets the last_atime_delta variable in the variable table.
+
+=cut
+
+sub _set_last_atime_delta {
+  my ($self, $newdelta) = @_;
+
+  return 0 unless (defined($newdelta));
+
+  my $sql = "UPDATE bayes_vars SET last_atime_delta = ? WHERE username = ?";
+
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: _set_last_atime_delta: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my $rc = $sth->execute($newdelta, $self->{_username});
+
+  unless ($rc) {
+    dbg("bayes: _set_last_atime_delta: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  $sth->finish();
+
+  return 1;
+}
+
+=head2 _set_last_expire_reduce
+
+private instance (Boolean) _set_last_expire_reduce (Integer $deleted)
+
+Description:
+This method sets the last_expire_reduce values in the variable table.
+
+=cut
+
+sub _set_last_expire_reduce {
+  my ($self, $deleted) = @_;
+
+  return 0 unless (defined($deleted));
+
+  my $sql = "UPDATE bayes_vars SET last_expire_reduce = ? WHERE username = ?";
+
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: _set_last_expire_reduce: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my $rc = $sth->execute($deleted, $self->{_username});
+
+  unless ($rc) {
+    dbg("bayes: _set_last_expire_reduce: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  $sth->finish();
+
+  return 1;
+}
+
+=head2 _get_num_hapaxes
+
+private instance (Integer) _get_num_hapaxes ()
+
+Description:
+This method gets the total number of hapaxes (spam_count + ham_count == 1) in
+the token database for a user.
+
+=cut
+
+sub _get_num_hapaxes {
+  my ($self) = @_;
+
+  my $sql = "SELECT count(*)
+               FROM bayes_token
+              WHERE username = ?
+                AND spam_count + ham_count = 1";
+
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: _get_num_hapaxes: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my $rc = $sth->execute($self->{_username});
+
+  unless ($rc) {
+    dbg("bayes: _get_num_hapaxes: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  
+  my ($num_hapaxes) = $sth->fetchrow_array();
+
+  $sth->finish();
+
+  return $num_hapaxes;
+}
+
+=head2 _get_num_lowfreq
+
+private instance (Integer) _get_num_lowfreq ()
+
+Description:
+This method gets the total number of lowfreq tokens (spam_count < 8 and
+ham_count < 8) in the token database for a user
+
+=cut
+
+sub _get_num_lowfreq {
+  my ($self) = @_;
+
+  my $sql = "SELECT count(*)
+               FROM bayes_token
+              WHERE username = ? 
+                AND (spam_count >= 0 AND spam_count < 8)
+                AND (ham_count >= 0 AND ham_count < 8)
+                AND spam_count + ham_count != 1";
+
+  my $sth = $self->{_dbh}->prepare_cached($sql);
+
+  unless (defined($sth)) {
+    dbg("bayes: _get_num_lowfreq: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my $rc = $sth->execute($self->{_username});
+
+  unless ($rc) {
+    dbg("bayes: _get_num_lowfreq: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my ($num_lowfreq) = $sth->fetchrow_array();
+
+  $sth->finish();
+
+  return $num_lowfreq;
+}
+
+sub dbg { Mail::SpamAssassin::dbg (@_); }
+sub sa_die { Mail::SpamAssassin::sa_die (@_); }
+
+1;

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/CmdLearn.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/CmdLearn.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/CmdLearn.pm	Thu Jan 29 18:54:33 2004
@@ -170,9 +170,9 @@
       $spamtest->{conf}->{bayes_path} = $bayes_override_path;
     }
 
-    my $ret = $spamtest->{bayes_scanner}->{store}->upgrade_old_dbm_files();
+    my $ret = $spamtest->{bayes_scanner}->{store}->perform_upgrade();
     $spamtest->finish_learner();
-    return (!(defined $ret && $ret == 2));
+    return (!$ret);
   }
 
   $spamtest->init_learner({

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm	Thu Jan 29 18:54:33 2004
@@ -236,6 +236,15 @@
   $self->{bayes_min_spam_num} = 200;
   $self->{bayes_learn_during_report} = 1;
 
+  # Allow alternate bayes storage implementation
+  $self->{bayes_store_module} = '';
+
+  # Used for SQL based Bayes implementation
+  $self->{bayes_sql_dsn} = '';
+  $self->{bayes_sql_username} = '';
+  $self->{bayes_sql_password} = '';
+  $self->{bayes_sql_override_username} = '';
+
   $self->{use_hashcash} = 1;
   $self->{hashcash_accept} = { };
   $self->{hashcash_doublespend_path} = '__userstate__/hashcash_seen';
@@ -263,6 +272,9 @@
   # "...scope", and finally, 'user_scores_sql_table'.  Defaults are "username",
   # "preference", "value", "spamassassin" and "userpref".
 
+  # defaults for SQL based auto-whitelist
+  $self->{user_awl_sql_table} = 'awl';
+
   # for backwards compatibility, we need to set the default headers
   # remove this except for X-Spam-Checker-Version in 2.70
   $self->add_default_spam_headers();	# always run this first
@@ -1759,7 +1771,18 @@
       $self->{bayes_learn_during_report} = $value+0; next;
     }
 
-=back
+=item bayes_sql_override_username
+
+Used by BayesStoreSQL storage implementation.
+
+If this options is set the BayesStoreSQL module will override the set username with
+the value given.  This could be useful for implementing global or group bayes databases.
+
+=cut
+
+    if (/^bayes_sql_override_username\s+(.*)$/) {
+      $self->{bayes_sql_override_username} = $1; next;
+    }
 
 ##############
 
@@ -2413,6 +2436,57 @@
       $self->{bayes_learn_to_journal} = $value+0; next;
     }
 
+=item bayes_store_module
+
+If this option is set, the module given will be used as an alternate to the default
+bayes storage mechanism.  It must conform to the published storage specification
+(see Mail::SpamAssassin::BayesStore).
+
+=cut
+
+    if (/^bayes_store_module\s+(.*)$/) {
+      my $module = $1;
+      $module =~ /^([_A-Za-z0-9:]+)$/;
+      $self->{bayes_store_module} = $1;
+      next;
+    }
+
+=item bayes_sql_dsn DBI::databasetype:databasename:hostname:port
+
+Used for BayesStoreSQL storage implementation.
+
+This option give the connect string used to connect to the SQL based Bayes storage.
+
+=cut
+
+    if (/^bayes_sql_dsn\s+(\S+)$/) {
+      $self->{bayes_sql_dsn} = $1; next;
+    }
+
+=item bayes_sql_username
+
+Used by BayesStoreSQL storage implementation.
+
+This option gives the username used by the above DSN.
+
+=cut
+
+    if (/^bayes_sql_username\s+(\S+)$/) {
+      $self->{bayes_sql_username} = $1; next;
+    }
+
+=item bayes_sql_password
+
+Used by BayesStoreSQL storage implementation.
+
+This option gives the password used by the above DSN.
+
+=cut
+
+    if (/^bayes_sql_password\s+(\S+)$/) {
+      $self->{bayes_sql_password} = $1; next;
+    }
+
 =item user_scores_dsn DBI:databasetype:databasename:hostname:port
 
 If you load user scores from an SQL database, this will set the DSN
@@ -2462,6 +2536,43 @@
     # leave as RE right now
     if (/^loadplugin\s+(\S+)\s+(\S+)$/) {
       $self->load_plugin ($1, $2); next;
+    }
+
+=item user_awl_dsn DBI:databasetype:databasename:hostname:port
+
+If you load user auto-whitelists from an SQL database, this will set the DSN
+used to connect.  Example: C<DBI:mysql:spamassassin:localhost>
+
+=cut
+    if (/^user_awl_dsn\s+(\S+)$/) {
+      $self->{user_awl_dsn} = $1; next;
+    }
+
+=item user_awl_sql_username username
+
+The authorized username to connect to the above DSN.
+
+=cut
+    if(/^user_awl_sql_username\s+(\S+)$/) {
+      $self->{user_awl_sql_username} = $1; next;
+    }
+
+=item user_awl_sql_password password
+
+The password for the database username, for the above DSN.
+
+=cut
+    if(/^user_awl_sql_password\s+(\S+)$/) {
+      $self->{user_awl_sql_password} = $1; next;
+    }
+
+=item user_awl_sql_table tablename
+
+The table user auto-whitelists are stored in, for the above DSN.
+
+=cut
+    if(/^user_awl_sql_table\s+(\S+)$/) {
+      $self->{user_awl_sql_table} = $1; next;
     }
 
 ###########################################################################

Added: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/SQLBasedAddrList.pm
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/SQLBasedAddrList.pm	Thu Jan 29 18:54:33 2004
@@ -0,0 +1,402 @@
+# <@LICENSE>
+# ====================================================================
+# The Apache Software License, Version 1.1
+# 
+# Copyright (c) 2000 The Apache Software Foundation.  All rights
+# reserved.
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in
+#    the documentation and/or other materials provided with the
+#    distribution.
+# 
+# 3. The end-user documentation included with the redistribution,
+#    if any, must include the following acknowledgment:
+#       "This product includes software developed by the
+#        Apache Software Foundation (http://www.apache.org/)."
+#    Alternately, this acknowledgment may appear in the software itself,
+#    if and wherever such third-party acknowledgments normally appear.
+# 
+# 4. The names "Apache" and "Apache Software Foundation" must
+#    not be used to endorse or promote products derived from this
+#    software without prior written permission. For written
+#    permission, please contact apache@apache.org.
+# 
+# 5. Products derived from this software may not be called "Apache",
+#    nor may "Apache" appear in their name, without prior written
+#    permission of the Apache Software Foundation.
+# 
+# THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+# ====================================================================
+# 
+# This software consists of voluntary contributions made by many
+# individuals on behalf of the Apache Software Foundation.  For more
+# information on the Apache Software Foundation, please see
+# <http://www.apache.org/>.
+# 
+# Portions of this software are based upon public domain software
+# originally written at the National Center for Supercomputing Applications,
+# University of Illinois, Urbana-Champaign.
+# </...@LICENSE>
+
+=head1 NAME
+
+Mail::SpamAssassin::SQLBasedAddrList - SpamAssassin SQL Based Auto Whitelist
+
+=head1 SYNOPSIS
+
+    my $factory = Mail::SpamAssassin::SQLBasedAddrList->new()
+    $spamtest->set_persistent_addr_list_factory ($factory);
+  ... call into SpamAssassin classes...
+
+SpamAssassin will call:
+
+    my $addrlist = $factory->new_checker($spamtest);
+    $entry = $addrlist->get_addr_entry ($addr, $origip);
+  ...
+
+=head1 DESCRIPTION
+
+A SQL based persistent address list implementation.
+
+See C<Mail::SpamAssassin::PersistentAddrList> for more information.
+
+Uses DBI::DBD module access to your favorite database (tested with
+MySQL, SQLite and PostgreSQL) to store user auto-whitelists.
+
+The default table structure looks like this:
+CREATE TABLE awl (
+  username VARCHAR NOT NULL,
+  email VARCHAR NOT NULL,
+  ip VARCHAR NOT NULL,
+  count INT NOT NULL,
+  totscore FLOAT NOT NULL,
+  PRIMARY KEY (username, email, ip)
+)
+
+You're table definition may change depending on which database driver
+you choose.  There is a config option to override the table name.
+
+This module introduces several new config variables:
+
+user_awl_dsn
+
+user_awl_sql_username
+
+user_awl_sql_password
+
+user_awl_sql_table
+
+see C<Mail::SpamAssassin::Conf> for more information.
+
+
+=cut
+
+package Mail::SpamAssassin::SQLBasedAddrList;
+
+use strict;
+use bytes;
+
+use DBI;
+
+use Mail::SpamAssassin::PersistentAddrList;
+
+use vars qw(@ISA);
+
+@ISA = qw(Mail::SpamAssassin::PersistentAddrList);
+
+=head2 new
+
+public class (Mail::SpamAssassin::SQLBasedAddrList) new ()
+
+Description:
+This method creates a new instance of the SQLBasedAddrList factory and calls
+the parent's (PersistentAddrList) new method.
+
+=cut
+
+sub new {
+  my ($proto) = @_;
+  my $class = ref($proto) || $proto;
+  my $self = $class->SUPER::new(@_);
+  $self->{class} = $class;
+  bless ($self, $class);
+  $self;
+}
+
+=head2 new_checker
+
+public instance (Mail::SpamAssassin::SQLBasedAddrList) new_checker (\% $main)
+
+Description:
+This method is called to setup a new checker interface and return a blessed
+copy of itself.  Here is where we setup the SQL database connection based
+on the config values.
+
+=cut
+
+sub new_checker {
+  my ($self, $main) = @_;
+
+  my $class = $self->{class};
+
+  if (!$main->{conf}->{user_awl_dsn} ||
+      !$main->{conf}->{user_awl_sql_table}) {
+    dbg("auto-whitelist (sql-based): invalid config");
+    return undef;
+  }
+
+  my $dsn    = $main->{conf}->{user_awl_dsn};
+  my $dbuser = $main->{conf}->{user_awl_sql_username};
+  my $dbpass = $main->{conf}->{user_awl_sql_password};
+
+  my $dbh = DBI->connect($dsn, $dbuser, $dbpass, {'PrintError' => 0});
+
+  if(!$dbh) {
+    dbg("auto-whitelist (sql-based): Unable to Connect to DB");
+    return undef;
+  }
+
+  $self = { 'main'      => $main,
+            'dsn'       => $dsn,
+            'dbh'       => $dbh,
+            'tablename' => $main->{conf}->{user_awl_sql_table},
+          };
+
+  dbg("SQL Based AWL: Connected to $dsn");
+
+  return bless ($self, $class);
+}
+
+=head2 get_addr_entry
+
+public instance (\%) get_addr_entry (String $addr)
+
+Description:
+This method takes a given C<$addr> and splits it between the email address
+component and the ip component and performs a lookup in the database. If
+nothing is found in the database then a blank entry hash is created and
+returned, otherwise an entry containing the found information is returned.
+
+A key, C<exists_p>, is set to 1 if an entry already exists in the database,
+otherwise it is set to 0.
+
+=cut
+
+sub get_addr_entry {
+  my ($self, $addr) = @_;
+
+  my $entry = { addr     => $addr,
+                exists_p => 0,
+                count    => 0,
+                totscore => 0,
+              };
+
+  my ($email, $ip) = $self->_unpack_addr($addr);
+
+  return $entry unless ($email && $ip);
+
+  my $username = $self->{main}->{username};
+
+  my $sql = "SELECT count, totscore FROM $self->{tablename}
+              WHERE username = ? AND email = ? AND ip = ?";
+  my $sth = $self->{dbh}->prepare($sql);
+  my $rc = $sth->execute($username, $email, $ip);
+
+  if (!$rc) { # there was an error, but try to go on
+    my $err = $self->{dbh}->errstr;
+    dbg("auto-whitelist (sql-based) get_addr_entry: SQL Error: $err");
+    $entry->{count} = 0;
+    $entry->{totscore} = 0;
+  }
+  else {
+    my $aryref = $sth->fetchrow_arrayref();
+
+    if (defined($aryref)) { # we got some data back
+      $entry->{count} = $aryref->[0] || 0;
+      $entry->{totscore} = $aryref->[1] || 0;
+      $entry->{exists_p} = 1;
+      dbg("auto-whitelist (sql-based) get_addr_entry: Found existing entry for $addr");
+    }
+    else {
+      dbg("auto-whitelist (sql-based) get_addr_entry: No entry found for $addr");
+    }
+  }
+  $sth->finish();
+
+  dbg ("auto-whitelist (sql-based): $addr scores ".$entry->{count}.'/'.$entry->{totscore});
+
+  return $entry;
+}
+
+=head2 add_score
+
+public instance (\%) add_score (\% $entry, Integer $score)
+
+Description:
+This method adds a given C<$score> to a given C<$entry>.  If the entry was
+marked as not existing in the database then an entry will be inserted,
+otherwise a simple update will be performed.
+
+NOTE: This code uses a self referential SQL call (ie set foo = foo + 1) which
+is supported by most modern database backends, but not everything calling
+itself a SQL database.
+
+=cut
+
+sub add_score {
+  my($self, $entry, $score) = @_;
+
+  return if (!$entry->{addr});
+  
+  my ($email, $ip) = $self->_unpack_addr($entry->{addr});
+
+  $entry->{count} += 1;
+  $entry->{totscore} += $score;
+  
+  return $entry unless ($email && $ip);
+
+  my $username = $self->{main}->{username};
+  
+  if ($entry->{exists_p}) { # entry already exists, so just update
+    my $sql = "UPDATE $self->{tablename} SET count = count + 1,
+                                             totscore = totscore + ?
+                WHERE username = ? AND email = ? AND ip = ?";
+    
+    my $sth = $self->{dbh}->prepare($sql);
+    my $rc = $sth->execute($score, $username, $email, $ip);
+    
+    if (!$rc) {
+      my $err = $self->{dbh}->errstr;
+      dbg("auto-whitelist (sql-based) add_score: SQL Error: $err");
+    }
+    else {
+      dbg("auto-whitelist (sql-based) add_score: New count: ". $entry->{count} .", new totscore: ".$entry->{totscore}." for ".$entry->{addr});
+    }
+    $sth->finish();
+  }
+  else { # no entry yet, so insert a new entry
+    my $sql = "INSERT INTO $self->{tablename} (username,email,ip,count,totscore) VALUES (?,?,?,?,?)";
+    my $sth = $self->{dbh}->prepare($sql);
+    my $rc = $sth->execute($username,$email,$ip,1,$score);
+    if (!$rc) {
+      my $err = $self->{dbh}->errstr;
+      dbg("auto-whitelist (sql-based) add_score: SQL Error: $err");
+    }
+    $entry->{exists_p} = 1;
+    dbg("auto-whitelist (sql-based) add_score: Created new entry for ".$entry->{addr}." with totscore: $score");
+    $sth->finish();
+  }
+  
+  return $entry;
+}
+
+=head2 remove_entry
+
+public instance () remove_entry (\% $entry)
+
+Description:
+This method removes a given C<$entry> from the database.  If the
+ip portion of the entry address is equal to "none" then remove any
+perl-IP entries for this address as well.
+
+=cut
+
+sub remove_entry {
+  my ($self, $entry) = @_;
+
+  my ($email, $ip) = $self->_unpack_addr($entry->{addr});
+
+  return unless ($email && $ip);
+
+  my $username = $self->{main}->{username};
+
+  my $sql;
+  my @args;
+
+  # when $ip is equal to none then attempt to delete all entries
+  # associated with address
+  if ($ip eq 'none') {
+    $sql = "DELETE FROM $self->{tablename} WHERE username = ? AND email = ?";
+    @args = ($username, $email);
+    dbg("auto-whitelist (sql-based) remove_entry: Removing all entries matching $email");
+  }
+  else {
+    $sql = "DELETE FROM $self->{tablename}
+             WHERE username = ? AND email = ? AND ip = ?";
+    @args = ($username, $email, $ip);
+    dbg("auto-whitelist (sql-based) remove_entry: Removing single entry matching ".$entry->{addr});
+  }
+
+  my $sth = $self->{dbh}->prepare($sql);
+  my $rc = $sth->execute(@args);
+
+  if (!$rc) {
+    my $err = $self->{dbh}->errstr;
+    dbg("auto-whitelist (sql-based) remove_entry: SQL Error: $err");
+  }
+  else {
+    # We might normally have a dbg saying we removed the address
+    # but the common codepath already provides this in SpamAssassin.pm
+  }
+  $entry = undef; # slight cleanup since it is now gone
+}
+
+=head2 finish
+
+public instance () finish ()
+
+Description:
+This method provides the necessary cleanup for the address list.
+
+=cut
+
+sub finish {
+  my ($self) = @_;
+  dbg("auto-whitelist (sql-based) finish: Disconnected from " . $self->{dsn});
+  $self->{dbh}->disconnect();
+}
+
+=head2 _unpack_addr
+
+private instance (String, String) _unpack_addr(string $addr)
+
+Description:
+This method splits an autowhitelist address into it's two components,
+email and ip address.
+
+=cut
+
+sub _unpack_addr {
+  my ($self, $addr) = @_;
+
+  my ($email, $ip) = split(/\|ip=/, $addr);
+
+  unless ($email && $ip) {
+    dbg("auto-whitelist (sql-based): _unpack_addr: Unable to decode $addr");
+  }
+
+  return ($email, $ip);
+}
+
+sub dbg { Mail::SpamAssassin::dbg (@_); }
+
+1;

Modified: incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf
==============================================================================
--- incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf	(original)
+++ incubator/spamassassin/trunk/rules/70_cvs_rules_under_test.cf	Thu Jan 29 18:54:33 2004
@@ -442,3 +442,7 @@
 body     T_RM_BPT_LONGWORDS_99 /\b(?:\w{9,}\s+){9}/
 describe T_RM_BPT_LONGWORDS_99 Long string of long words
 
+# "www" hidden as "%77%77%77", "ww%77", etc.
+rawbody	 T_HTTP_77	/http:\/\/.{0,2}[\%77]/
+describe T_HTTP_77	Contains a URL-encoded hostname (HTTP77)
+

Modified: incubator/spamassassin/trunk/sql/README
==============================================================================
--- incubator/spamassassin/trunk/sql/README	(original)
+++ incubator/spamassassin/trunk/sql/README	Thu Jan 29 18:54:33 2004
@@ -1,6 +1,6 @@
 
-Using SpamAssassin With An SQL Database
----------------------------------------
+Loading SpamAssassin User Preferences From An SQL Database
+----------------------------------------------------------
 
 SpamAssassin can now load users' score files from an SQL database.  The concept
 here is to have a web application (PHP/perl/ASP/etc.) that will allow users to
@@ -77,12 +77,18 @@
 you can use the entire recipient's email address, e.g. "user@example.com", and
 use the full varchar(100).
 
-Included is a default table that can be safely used in your own setup.  To
-use the default table, you must first create a database, and a username/password
-that can access that database.  To install the table, use the following 
-command:
+Included is a default table that can be safely used in your own setup.  To use
+the default table, you must first create a database, and a username/password
+that can access that database.
 
-mysql -h <hostname> -u <username> -p <password> databasename < spamassasin.sql
+To create a database, if one does not already exist, see "Creating A Database
+In MySQL" below.
+
+
+To install the table, use the following command:
+
+mysql -h <hostname> -u <adminusername> -p <databasename> < spamassasin.sql
+Enter password: <adminpassword>
 
 This will create the following table:
 
@@ -115,6 +121,21 @@
 
 Also note that spamd may need the "-q" switch so it knows to look up users in
 the SQL table instead of /etc/passwd.  See "man spamd".
+
+
+Creating A Database In MySQL
+----------------------------
+
+Here's the command to create a database, and user/password pair to access
+it, for MySQL:
+
+mysql -h <hostname> -u <adminusername> -p
+Enter password: <adminpassword>
+mysql> use mysql;
+mysql> insert into user (Host, User, Password) values('localhost','<username>', password('<password>'));
+mysql> insert into db (Host, Db, User, Select_priv) values('localhost','<databasename>','<username>','Y');
+mysql> create database <databasename>;
+mysql> quit
 
 
 Testing SpamAssassin/SQL

Added: incubator/spamassassin/trunk/sql/README.awl
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/sql/README.awl	Thu Jan 29 18:54:33 2004
@@ -0,0 +1,139 @@
+
+Using SpamAssassin Auto-Whitelists With An SQL Database
+-------------------------------------------------------
+
+SpamAssassin can now load users' auto-whitelists from a SQL database.
+The most common use for a system like this would be for users to be
+able to have per user auto-whitelists on systems where users may not
+have a home directory to store the whitelist DB files.
+
+In order to activate the SQL based auto-whitelist you have to
+configure spamassassin and spamd to use a different whitelist factory.
+This is  done with the auto_whitelist_factory config variable, like
+so:
+
+auto_whitelist_factory Mail::SpamAssassin::SQLBasedAddrList
+
+SpamAssassin will check the global configuration file (ie. any file
+matching /etc/mail/spamassassin/*.cf) for the following settings:
+
+user_awl_dsn                 DBI:driver:database:hostname[:port]
+user_awl_sql_username        dbusername
+user_awl_sql_password        dbpassword
+
+The first option, user_awl_dsn, describes the data source name that
+will be used to create the connection to your SQL server.  It MUST be
+in the format as listed above.  <driver> should be the DBD driver that
+you have installed to access your database (initially tested with
+MySQL, PostgreSQL and SQLite).  <database> must be the name of the
+database that you created to store the auto-whitelist table.
+<hostname> is the name of the host that contains the SQL database
+server.  <port> is the optional port number where your database server
+is listening.
+
+user_awl_dsn                DBI:mysql:spamassassin:localhost
+
+Would tell SpamAssassin to connect to the database named spamassassin using
+MySQL on the local server, and since <port> is omitted, the driver will use the
+default port number.  The other two required options tells SpamAssassin to use 
+the defined username and password to establish the connection.
+
+If the user_awl_dsn option does not exist, SpamAssassin will not attempt
+to use SQL for the auto-whitelist.
+
+One additional configuration option exists that allows you to set the
+table name for the auto-whitelist table.
+
+user_awl_sql_table           awl
+
+Requirements
+------------
+
+In order for SpamAssassin to work with your SQL database, you must have
+the perl DBI module installed, AS WELL AS the DBD driver/module for your
+specific database.  For example, if using MySQL as your RDBMS, you must have
+the Msql-Mysql module installed.  Check CPAN for the latest versions of DBI 
+and your database driver/module. 
+
+We are currently using:
+
+DBI-1.20
+Msql-Mysql-modules-1.2219
+perl v5.6.1
+
+But older versions should work fine.
+
+
+Database Schema
+---------------
+
+The database must contain a table named by 'user_awl_sql_table' (default
+setting: "awl") with at least three fields:
+
+  username varchar(100)	  # this is the username whose e-mail is being filtered
+  email varchar(200)      # this is the address key
+  ip    varchar(10)       # this is the ip key
+  count int(11)           # this is the message counter
+  totscore float          # this is the total calculated score
+
+You can add as many other fields you wish as long as the above fields
+are contained in the table.
+
+Included is a default table that can be safely used in your own setup.  To use
+the default table, you must first create a database, and a username/password
+that can access that database.  (See "Creating A Database In MySQL", in
+"sql/README", if you don't have a suitable database ready.)
+
+To install the table, use the following command:
+
+mysql -h <hostname> -u <adminusername> -p <databasename> < awl_mysql.sql
+Enter password: <adminpassword>
+
+This will create the following table:
+
+CREATE TABLE awl (
+  username varchar(100) NOT NULL default '',
+  email varchar(200) NOT NULL default '',
+  ip varchar(10) NOT NULL default '',
+  count int(11) default '0',
+  totscore float default '0',
+  PRIMARY KEY  (username,email,ip)
+) TYPE=MyISAM;
+
+
+Once you have created the database and added the table, just add the required
+lines to your global configuration file (local.cf).  Note that you
+must specify the proper whitelist factory in the config file in order
+for this to work and the current username must be passed to spamd.
+
+Testing SpamAssassin/SQL
+------------------------
+
+To test your SQL setup, and debug any possible problems, you should start
+spamd with the -D option, which will keep spamd in the foreground, and will
+output debug message to the terminal. You should then test spamd with a
+message by calling spamc.  You can use the sample-spam.txt file with the
+following command:
+
+cat sample-spam.txt | spamc
+
+Watch the debug output from spamd and look for the following debug line:
+
+SQL Based AWL: Connected to <your dsn>
+
+If you do not see the above text, then the SQL query was not successful, and
+you should consult any error messages reported.
+
+This code has been tested using MySQL as the RDBMS, with basic tests
+against PostgreSQL and SQLite.  It has been written with the utmost
+simplicity using DBI, and any database driver that conforms to the DBI
+interface and allows you to refer to a column on the right hand side
+of an expression (ie update foo set bar = bar + 1) should work with
+little or no problems.  If you find a driver that has issues, please
+report them to the SADev list.
+
+******
+NB:  This should be considered BETA, and the interface, schema, or overall
+operation of SQL support may change at any time with future releases of SA.
+******
+

Added: incubator/spamassassin/trunk/sql/README.bayes
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/sql/README.bayes	Thu Jan 29 18:54:33 2004
@@ -0,0 +1,125 @@
+
+Using A SQL Database for Bayesian Storage Module
+-------------------------------------------------------
+
+SpamAssassin can now store users' bayesian filter data in a SQL
+database. The  most common use for a system like this would be for
+users to be able to have per user bayesian filter data on systems
+where users may not have a home directory to store the data.
+
+In order to activate the SQL based bayesian storage you have to
+configure spamassassin and spamd to use a different bayes storage
+module.  This can be done via a setting in the global configuration
+file.
+
+The directives required to turn on the SQL based bayesian storage are:
+
+bayes_store_module		   Mail::SpamAssassin::BayesStoreSQL
+
+This directive is used by the Bayes module to determine which storage
+module should be used.  If not set it will default to:
+Mail::SpamAssassin::BayesStoreDBM
+
+bayes_sql_dsn			   DBI:driver:database:hostname[:port]
+bayes_sql_username		   dbusername
+bayes_sql_password		   dbpassword
+
+The bayes_sql_dsn directive describes the data source name that will
+be used to create the connection to your SQL server.  It MUST be in
+the format as listed above.  <driver> should be the DBD driver that
+you have installed to access your database (initially tested with
+MySQL, PostgreSQL, SQLite, and DBD::CSV).  <database> must be the name
+of the database that you created to store the bayes data
+tables. <hostname> is the name of the host that contains the SQL
+database  server.  <port> is the optional port number where your
+database server is listening.
+
+In addition to the global configuration directives there is a user
+preference:
+
+bayes_sql_override_username	   someusername
+
+This directive, if used, will override the username used for storing
+data in the database.  This could be used to group users together to
+share bayesian filter data.
+
+Requirements
+------------
+
+In order for SpamAssassin to work with your SQL database, you must
+have the perl DBI module installed, AS WELL AS the DBD driver/module
+for your specific database.  For example, if using MySQL as your
+RDBMS, you must have the DBD::mysql module installed.  Check CPAN for
+the latest versions of DBI and your database driver/module. 
+
+The BayesStoreSQL module was tested with:
+
+DBI-1.38
+DBD-mysql-2.9002
+perl v5.8.0
+
+But older versions should work fine as the SQL code in SpamAssassin is as 
+simple as could be.
+
+Database Schema
+---------------
+
+The database schema for storage of the bayesian filter data contains
+several different tables.  Several sample SQL schemas have been
+included in to help in setting up your database.  The schemas contain
+the minimum tables and columns necessary to work with the code as
+written.  You are free to add other columns as needed for your local
+implementation.  Presently there is no way to override the table and
+column names used by the BayesStoreSQL code, this feature may be added
+in the future.
+
+Example setup of bayes tables for MySQL:
+
+This assumes that you have already created a database for use with
+spamassassin and setup a username/password that can access that database.
+(See "Creating A Database In MySQL", in "sql/README", if you don't have a
+suitable database ready.)
+
+To install the tables using the included example, use the following command:
+
+mysql -h <hostname> -u <adminusername> -p databasename < bayes_mysql.sql
+Enter password: <adminpassword>
+
+Once you have created the database and added the tables, just add the
+required lines to your global configuration file (local.cf).
+
+Testing SpamAssassin/SQL
+------------------------
+
+To test your SQL setup, and debug any possible problems, you should
+start spamd with the -D option, which will keep spamd in the
+foreground, and will output debug message to the terminal. You should
+then test spamd with a message by calling spamc.  You can use the
+sample-spam.txt file with the following command:
+
+cat sample-spam.txt | spamc
+
+Watch the debug output from spamd and look for the following debug
+line:
+
+debug: bayes: Database connection established
+debug: bayes: Using username: <username>
+
+If you do not see the above text, then the SQL query was not
+successful, and you should see any error messages reported.
+
+This code has been tested using MySQL as the RDMS, with basic tests
+against PostgreSQL and SQLite.  It does require a database that allows
+you to refer to a column on the right hand side of an expression (ie
+update foo set bar = bar + 1).  Any database driver that allows for
+that usage should work with the BayesStoreSQL code.  NOTE: You may
+find that some implementations do not provide a significant advantage
+over using the default DBM implementation.  If you find a driver that
+should work and has issues, please report them to the SADev list.
+
+******
+NB:  This should be considered BETA, and the interface, schema, or
+overall operation of SQL support may change at any time with future
+releases of SA.
+******
+

Added: incubator/spamassassin/trunk/sql/awl_mysql.sql
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/sql/awl_mysql.sql	Thu Jan 29 18:54:33 2004
@@ -0,0 +1,8 @@
+CREATE TABLE awl (
+  username varchar(100) NOT NULL default '',
+  email varchar(200) NOT NULL default '',
+  ip varchar(10) NOT NULL default '',
+  count int(11) default '0',
+  totscore float default '0',
+  PRIMARY KEY  (username,email,ip)
+) TYPE=MyISAM;

Added: incubator/spamassassin/trunk/sql/bayes_mysql.sql
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/sql/bayes_mysql.sql	Thu Jan 29 18:54:33 2004
@@ -0,0 +1,41 @@
+
+CREATE TABLE bayes_expire (
+  username varchar(200) NOT NULL default '',
+  runtime int(11) NOT NULL default '0',
+  KEY bayes_expire_idx1 (username)
+) TYPE=MyISAM;
+
+CREATE TABLE bayes_global_vars (
+  variable varchar(30) NOT NULL default '',
+  value varchar(200) NOT NULL default '',
+  PRIMARY KEY  (variable)
+) TYPE=MyISAM;
+
+INSERT INTO bayes_global_vars VALUES ('VERSION','2');
+
+CREATE TABLE bayes_seen (
+  username varchar(200) NOT NULL default '',
+  msgid varchar(200) binary NOT NULL default '',
+  flag char(1) NOT NULL default '',
+  PRIMARY KEY  (username,msgid),
+  KEY bayes_seen_idx1 (username,flag)
+) TYPE=MyISAM;
+
+CREATE TABLE bayes_token (
+  username varchar(200) NOT NULL default '',
+  token varchar(200) binary NOT NULL default '',
+  spam_count int(11) NOT NULL default '0',
+  ham_count int(11) NOT NULL default '0',
+  atime int(11) NOT NULL default '0',
+  PRIMARY KEY  (username,token)
+) TYPE=MyISAM;
+
+CREATE TABLE bayes_vars (
+  username varchar(200) NOT NULL default '',
+  spam_count int(11) NOT NULL default '0',
+  ham_count int(11) NOT NULL default '0',
+  last_expire int(11) NOT NULL default '0',
+  last_atime_delta int(11) NOT NULL default '0',
+  last_expire_reduce int(11) NOT NULL default '0',
+  PRIMARY KEY  (username)
+) TYPE=MyISAM;

Added: incubator/spamassassin/trunk/sql/bayes_pg.sql
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/sql/bayes_pg.sql	Thu Jan 29 18:54:33 2004
@@ -0,0 +1,43 @@
+
+CREATE TABLE bayes_expire (
+  username varchar(200) NOT NULL default '',
+  runtime integer NOT NULL default '0'
+);
+
+CREATE INDEX bayes_expire_idx1 ON bayes_expire (username);
+
+CREATE TABLE bayes_global_vars (
+  variable varchar(30) NOT NULL default '',
+  value varchar(200) NOT NULL default '',
+  PRIMARY KEY  (variable)
+);
+
+INSERT INTO bayes_global_vars VALUES ('VERSION','2');
+
+CREATE TABLE bayes_seen (
+  username varchar(200) NOT NULL default '',
+  msgid varchar(200) NOT NULL default '',
+  flag character(1) NOT NULL default '',
+  PRIMARY KEY  (username,msgid)
+);
+
+CREATE INDEX bayes_seen_idx1 ON bayes_seen (username, flag);
+
+CREATE TABLE bayes_token (
+  username varchar(200) NOT NULL default '',
+  token varchar(200) NOT NULL default '',
+  spam_count integer NOT NULL default '0',
+  ham_count integer NOT NULL default '0',
+  atime integer NOT NULL default '0',
+  PRIMARY KEY  (username,token)
+);
+
+CREATE TABLE bayes_vars (
+  username varchar(200) NOT NULL default '',
+  spam_count integer NOT NULL default '0',
+  ham_count integer NOT NULL default '0',
+  last_expire integer NOT NULL default '0',
+  last_atime_delta integer NOT NULL default '0',
+  last_expire_reduce integer NOT NULL default '0',
+  PRIMARY KEY  (username)
+);

Added: incubator/spamassassin/trunk/sql/bayes_sqlite.sql
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/sql/bayes_sqlite.sql	Thu Jan 29 18:54:33 2004
@@ -0,0 +1,43 @@
+
+CREATE TABLE bayes_expire (
+  username varchar(200) NOT NULL default '',
+  runtime int(11) NOT NULL default '0',
+);
+
+CREATE INDEX bayes_expire_idx1 ON bayes_expire (username);
+
+CREATE TABLE bayes_global_vars (
+  variable varchar(30) NOT NULL default '',
+  value varchar(200) NOT NULL default '',
+  PRIMARY KEY  (variable)
+);
+
+INSERT INTO bayes_global_vars VALUES ('VERSION','2');
+
+CREATE TABLE bayes_seen (
+  username varchar(200) NOT NULL default '',
+  msgid varchar(200) NOT NULL default '',
+  flag varchar(10) NOT NULL default '',
+  PRIMARY KEY  (username,msgid)
+);
+
+CREATE INDEX bayes_seen_idx1 ON bayes_seen (username, flag);
+
+CREATE TABLE bayes_token (
+  username varchar(200) NOT NULL default '',
+  token varchar(200) NOT NULL default '',
+  spam_count int(11) NOT NULL default '0',
+  ham_count int(11) NOT NULL default '0',
+  atime int(11) NOT NULL default '0',
+  PRIMARY KEY  (username,token)
+);
+
+CREATE TABLE bayes_vars (
+  username varchar(200) NOT NULL default '',
+  spam_count int(11) NOT NULL default '0',
+  ham_count int(11) NOT NULL default '0',
+  last_expire int(11) NOT NULL default '0',
+  last_atime_delta int(11) NOT NULL default '0',
+  last_expire_reduce int(11) NOT NULL default '0',
+  PRIMARY KEY  (username)
+);

Modified: incubator/spamassassin/trunk/t/SATest.pm
==============================================================================
--- incubator/spamassassin/trunk/t/SATest.pm	(original)
+++ incubator/spamassassin/trunk/t/SATest.pm	Thu Jan 29 18:54:33 2004
@@ -50,15 +50,22 @@
   $spamc = $ENV{'SPAMC_SCRIPT'};
   $spamc ||= "../spamc/spamc";
 
+  $salearn = $ENV{'SALEARN_SCRIPT'};
+  $salearn ||= "$perl_cmd ../sa-learn";
+
   $spamdport = $ENV{'SPAMD_PORT'};
   $spamdport ||= 48373;		# whatever
   $spamd_cf_args = "-C log/test_rules_copy";
   $spamd_localrules_args = " --siteconfigpath log/localrules.tmp";
   $scr_localrules_args =   " --siteconfigpath log/localrules.tmp";
+  $salearn_localrules_args =   " --siteconfigpath log/localrules.tmp";
 
   $scr_cf_args = "-C log/test_rules_copy";
   $scr_pref_args = "-p log/test_default.cf";
+  $salearn_cf_args = "-C log/test_rules_copy";
+  $salearn_pref_args = "-p log/test_default.cf";
   $scr_test_args = "";
+  $salearn_test_args = "";
   $set_test_prefs = 0;
   $default_cf_lines = "
     bayes_path ./log/user_state/bayes
@@ -164,6 +171,40 @@
   1;
 }
 
+# Run salearn. Calls back with the output.
+# in $args: arguments to run with
+# in $read_sub: callback for the output (should read from <IN>).
+# This is called with no args.
+#
+# out: $salearn_exitcode global: exitcode from sitescooper
+# ret: undef if sitescooper fails, 1 for exit 0
+#
+sub salearnrun {
+  my $args = shift;
+  my $read_sub = shift;
+
+  rmtree ("log/outputdir.tmp"); # some tests use this
+  mkdir ("log/outputdir.tmp", 0755);
+
+  %found = ();
+  %found_anti = ();
+
+  if (defined $ENV{'SA_ARGS'}) {
+    $args = $ENV{'SA_ARGS'} . " ". $args;
+  }
+  $args = "$salearn_cf_args $salearn_localrules_args $salearn_pref_args $salearn_test_args $args";
+
+  # added fix for Windows tests from Rudif
+  my $salearnargs = "$salearn $args";
+  $salearnargs =~ s!/!\\!g if ($^O =~ /^MS(DOS|Win)/i);
+  print ("\t$salearnargs\n");
+  system ("$salearnargs > log/$testname.${Test::ntest}");
+  $salearn_exitcode = ($?>>8);
+  if ($salearn_exitcode != 0) { return undef; }
+  &checkfile ("$testname.${Test::ntest}", $read_sub);
+  1;
+}
+
 sub scrun {
   $spamd_never_started = 1;
   spamcrun (@_);
@@ -351,6 +392,31 @@
     return $killed;
   }
 }
+
+sub create_saobj {
+  my ($args) = shift; # lets you override/add arguments
+
+  # YUCK, these file/dir names should be some sort of variable, at
+  # least we keep their definition in the same file for the moment.
+  my %setup_args = ( rules_filename => 'log/test_rules_copy',
+		     site_rules_filename => 'log/localrules.tmp',
+		     userprefs_filename => 'log/test_default.cf',
+		     userstate_dir => 'log/user_state',
+		   );
+
+  # override default args
+  foreach my $arg (keys %$args) {
+    $setup_args{$arg} = $args->{$arg};
+  }
+
+  # We'll assume that the test has setup INC correctly
+  require Mail::SpamAssassin;
+
+  my $sa = Mail::SpamAssassin->new(\%setup_args);
+
+  return $sa;
+}
+  
 
 # ---------------------------------------------------------------------------
 

Added: incubator/spamassassin/trunk/t/bayesdbm.t
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/t/bayesdbm.t	Thu Jan 29 18:54:33 2004
@@ -0,0 +1,275 @@
+#!/usr/bin/perl
+
+use Data::Dumper;
+use lib '.'; use lib 't';
+use SATest; sa_t_init("bayes");
+use Test;
+
+BEGIN { 
+  if (-e 't/test_dir') {
+    chdir 't';
+  }
+
+  if (-e 'test_dir') {
+    unshift(@INC, '../blib/lib');
+  }
+
+  plan tests => 43
+};
+
+tstlocalrules ("
+        bayes_learn_to_journal 0
+");
+
+use Mail::SpamAssassin;
+use Mail::SpamAssassin::MsgParser;
+
+my $sa = create_saobj();
+
+$sa->init();
+
+ok($sa);
+
+ok($sa->{bayes_scanner});
+
+ok(!$sa->{bayes_scanner}->is_scan_available());
+
+open(MAIL,"< data/spam/001");
+
+my $raw_message = do {
+  local $/;
+  <MAIL>;
+};
+
+close(MAIL);
+ok($raw_message);
+
+my @msg;
+foreach my $line (split(/^/m,$raw_message)) {
+  $line =~ s/\r$//;
+  push(@msg, $line);
+}
+
+my $mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+
+ok($mail);
+
+my $body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+
+ok($body);
+
+my ($wc, @toks) = $sa->{bayes_scanner}->tokenize($mail, $body);
+
+ok($wc > 0);
+
+ok(scalar(@toks) > 0);
+
+my $msgid = $sa->{bayes_scanner}->get_msgid($mail);
+
+ok($msgid eq '9PS291LhupY');
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid));
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->learn(1, $mail));
+
+ok(!$sa->{bayes_scanner}->learn(1, $mail));
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 's');
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+my $tokerror = 0;
+foreach my $tok (@toks) {
+  my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
+  if ($spam == 0 || $ham > 0) {
+    $tokerror = 1;
+  }
+}
+ok(!$tokerror);
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->learn(0, $mail));
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 'h');
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+$tokerror = 0;
+foreach my $tok (@toks) {
+  my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
+  if ($spam  > 0 || $ham == 0) {
+    $tokerror = 1;
+  }
+}
+ok(!$tokerror);
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->forget($mail));
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid));
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+undef $sa;
+
+sa_t_init('bayes'); # this wipes out what is there and begins anew
+
+# make sure we learn to a journal
+tstlocalrules ("
+        bayes_learn_to_journal 1
+");
+
+$sa = create_saobj();
+
+$sa->init();
+
+# Slight cheat here, because when you learn only to journal it fails
+# to actually create the bayes_toks and bayes_seen files because we
+# are tieing read only, this will create the files for us and allow
+# things to continue
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok(!-e 'log/user_state/bayes_journal');
+
+ok($sa->{bayes_scanner}->learn(1, $mail));
+
+ok(-e 'log/user_state/bayes_journal');
+
+$sa->{bayes_scanner}->sync(1); # always returns 0, so no need to check return
+
+ok(!-e 'log/user_state/bayes_journal');
+
+ok(-e 'log/user_state/bayes_seen');
+
+ok(-e 'log/user_state/bayes_toks');
+
+undef $sa;
+
+sa_t_init('bayes'); # this wipes out what is there and begins anew
+
+# make sure we learn to a journal
+tstlocalrules ("
+bayes_learn_to_journal 0
+bayes_min_spam_num 10
+bayes_min_ham_num 10
+");
+
+# we get to bastardize the existing pattern matching code here.  It lets us provide
+# our own checking callback and keep using the existing ok_all_patterns call
+%patterns = ( 1 => 'Learned from message' );
+
+ok(salearnrun("--spam data/spam", \&check_examined));
+ok_all_patterns();
+
+ok(salearnrun("--ham data/nice", \&check_examined));
+ok_all_patterns();
+
+ok(salearnrun("--ham data/whitelists", \&check_examined));
+ok_all_patterns();
+
+%patterns = ( 'non-token data: bayes db version' => 'db version' );
+ok(salearnrun("--dump magic", \&patterns_run_cb));
+ok_all_patterns();
+
+use constant SCAN_USING_PERL_CODE_TEST => 1;
+# jm: off! not working for some reason.   Mind you, this is
+# not a supported way to call these APIs!  so no biggie
+
+if (SCAN_USING_PERL_CODE_TEST) {
+$sa = create_saobj();
+
+$sa->init();
+
+open(MAIL,"< ../sample-nonspam.txt");
+
+$raw_message = do {
+  local $/;
+  <MAIL>;
+};
+
+close(MAIL);
+
+@msg = ();
+foreach my $line (split(/^/m,$raw_message)) {
+  $line =~ s/\r$//;
+  push(@msg, $line);
+}
+
+$mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+
+$body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+
+my $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
+
+ok($msgstatus);
+
+my $score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body);
+
+# Pretty much we can't count on the data returned with such little training
+# so just make sure that the score wasn't equal to .5 which is the default
+# return value.
+print "\treturned score: $score\n";
+ok($score != .5);
+
+open(MAIL,"< ../sample-spam.txt");
+
+$raw_message = do {
+  local $/;
+  <MAIL>;
+};
+
+close(MAIL);
+
+@msg = ();
+foreach my $line (split(/^/m,$raw_message)) {
+  $line =~ s/\r$//;
+  push(@msg, $line);
+}
+
+$mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+
+$body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+
+$msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
+
+$score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body);
+
+# Pretty much we can't count on the data returned with such little training
+# so just make sure that the score wasn't equal to .5 which is the default
+# return value.
+ok($score != .5);
+
+}
+
+sub check_examined {
+  local ($_);
+  my $string = shift;
+
+  if (defined $string) {
+    $_ = $string;
+  } else {
+    $_ = join ('', <IN>);
+  }
+
+  if ($_ =~ /Learned from \d+ message\(s\) \(\d+ message\(s\) examined\)/) {
+    $found{'Learned from message'}++;
+  }
+}
+
+

Added: incubator/spamassassin/trunk/t/bayessql.t
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/t/bayessql.t	Thu Jan 29 18:54:33 2004
@@ -0,0 +1,310 @@
+#!/usr/bin/perl
+
+use lib '.'; use lib 't';
+use SATest;
+use Test;
+use DBI; # for our cleanup stuff
+
+use constant TEST_ENABLED => (-e 'bayessql.cf' || -e 't/bayessql.cf');
+
+BEGIN { 
+  if (-e 't/test_dir') {
+    chdir 't';
+  }
+
+  if (-e 'test_dir') {
+    unshift(@INC, '../blib/lib');
+  }
+
+  plan tests => (TEST_ENABLED ? 38 : 0);
+
+  onfail => sub {
+    warn "\n\nNote: Failure may be due to an incorrect config.";
+  }
+};
+
+exit unless TEST_ENABLED;
+
+my $dbconfig;
+my $dbdsn;
+my $dbusername;
+my $dbpassword;
+
+open(CONFIG,"<bayessql.cf");
+while (my $line = <CONFIG>) {
+  $dbconfig .= $line;
+  if ($line =~ /^bayes_sql_dsn (.*)/) {
+    $dbdsn = $1;
+    chomp($dbdsn);
+  }
+  elsif ($line =~ /^bayes_sql_username (.*)/) {
+    $dbusername = $1;
+    chomp($dbusername);
+  }
+  elsif ($line =~ /^bayes_sql_password (.*)/) {
+    $dbpassword = $1;
+    chomp($dbpassword);
+  }
+}
+close(CONFIG);
+
+my $testuser = 'tstusr.'.$$.'.'.time();
+
+sa_t_init("bayes");
+
+tstlocalrules ("
+bayes_store_module Mail::SpamAssassin::BayesStoreSQL
+$dbconfig
+bayes_sql_override_username $testuser
+");
+
+use Mail::SpamAssassin;
+use Mail::SpamAssassin::MsgParser;
+
+my $sa = create_saobj();
+
+$sa->init();
+
+ok($sa);
+
+ok($sa->{bayes_scanner});
+
+ok(!$sa->{bayes_scanner}->is_scan_available());
+
+open(MAIL,"< data/spam/001");
+
+my $raw_message = do {
+  local $/;
+  <MAIL>;
+};
+
+close(MAIL);
+ok($raw_message);
+
+my @msg;
+foreach my $line (split(/^/m,$raw_message)) {
+  $line =~ s/\r$//;
+  push(@msg, $line);
+}
+
+my $mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+
+ok($mail);
+
+my $body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+
+ok($body);
+
+my ($wc, @toks) = $sa->{bayes_scanner}->tokenize($mail, $body);
+
+ok($wc > 0);
+
+ok(scalar(@toks) > 0);
+
+my $msgid = $sa->{bayes_scanner}->get_msgid($mail);
+
+ok($msgid eq '9PS291LhupY');
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid));
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->learn(1, $mail));
+
+ok(!$sa->{bayes_scanner}->learn(1, $mail));
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 's');
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+my $tokerror = 0;
+foreach my $tok (@toks) {
+  my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
+  if ($spam == 0 || $ham > 0) {
+    $tokerror = 1;
+  }
+}
+ok(!$tokerror);
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->learn(0, $mail));
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 'h');
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+$tokerror = 0;
+foreach my $tok (@toks) {
+  my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
+  if ($spam  > 0 || $ham == 0) {
+    $tokerror = 1;
+  }
+}
+ok(!$tokerror);
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->forget($mail));
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid));
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+undef $sa;
+
+ok(cleanupdb());
+
+sa_t_init('bayes'); # this wipes out what is there and begins anew
+
+# make sure we learn to a journal
+tstlocalrules ("
+bayes_store_module Mail::SpamAssassin::BayesStoreSQL
+$dbconfig
+bayes_min_spam_num 10
+bayes_min_ham_num 10
+bayes_sql_override_username $testuser
+");
+
+# we get to bastardize the existing pattern matching code here.  It lets us provide
+# our own checking callback and keep using the existing ok_all_patterns call
+%patterns = ( 1 => 'Learned from message' );
+
+ok(salearnrun("--spam data/spam", \&check_examined));
+ok_all_patterns();
+
+ok(salearnrun("--ham data/nice", \&check_examined));
+ok_all_patterns();
+
+ok(salearnrun("--ham data/whitelists", \&check_examined));
+ok_all_patterns();
+
+%patterns = ( 'non-token data: bayes db version' => 'db version' );
+ok(salearnrun("--dump magic", \&patterns_run_cb));
+ok_all_patterns();
+
+
+use constant SCAN_USING_PERL_CODE_TEST => 1;
+# jm: off! not working for some reason.   Mind you, this is
+# not a supported way to call these APIs!  so no biggie
+
+if (SCAN_USING_PERL_CODE_TEST) {
+$sa = create_saobj();
+
+$sa->init();
+
+open(MAIL,"< ../sample-nonspam.txt");
+
+$raw_message = do {
+  local $/;
+  <MAIL>;
+};
+
+close(MAIL);
+
+@msg = ();
+foreach my $line (split(/^/m,$raw_message)) {
+  $line =~ s/\r$//;
+  push(@msg, $line);
+}
+
+$mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+
+$body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+
+my $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
+
+ok($msgstatus);
+
+my $score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body);
+
+# Pretty much we can't count on the data returned with such little training
+# so just make sure that the score wasn't equal to .5 which is the default
+# return value.
+ok($score != .5);
+
+open(MAIL,"< ../sample-spam.txt");
+
+$raw_message = do {
+  local $/;
+  <MAIL>;
+};
+
+close(MAIL);
+
+@msg = ();
+foreach my $line (split(/^/m,$raw_message)) {
+  $line =~ s/\r$//;
+  push(@msg, $line);
+}
+
+$mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+
+$body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+
+$msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
+
+$score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body);
+
+# Pretty much we can't count on the data returned with such little training
+# so just make sure that the score wasn't equal to .5 which is the default
+# return value.
+ok($score != .5);
+}
+
+
+ok(cleanupdb());
+
+sub check_examined {
+  local ($_);
+  my $string = shift;
+
+  if (defined $string) {
+    $_ = $string;
+  } else {
+    $_ = join ('', <IN>);
+  }
+
+  if ($_ =~ /Learned from \d+ message\(s\) \(\d+ message\(s\) examined\)/) {
+    $found{'Learned from message'}++;
+  }
+}
+
+
+sub cleanupdb {
+  my $rv;
+  my $error = 0;
+
+  my $dbh = DBI->connect($dbdsn,$dbusername,$dbpassword);
+
+  if (!defined($dbh)) {
+    return 0;
+  }
+
+  $rv = $dbh->do("DELETE FROM bayes_vars WHERE username = ?", undef, $testuser);
+  if (!defined($rv)) {
+    $error = 1;
+  }
+  $rv = $dbh->do("DELETE FROM bayes_seen WHERE username = ?", undef, $testuser);
+  if (!defined($rv)) {
+    $error = 1;
+  }
+  $rv = $dbh->do("DELETE FROM bayes_token WHERE username = ?", undef, $testuser);
+  if (!defined($rv)) {
+    $error = 1;
+  }
+  $rv = $dbh->do("DELETE FROM bayes_expire WHERE username = ?", undef, $testuser);
+  return !$error;
+}

Added: incubator/spamassassin/trunk/t/sql_based_whitelist.t
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/t/sql_based_whitelist.t	Thu Jan 29 18:54:33 2004
@@ -0,0 +1,65 @@
+#!/usr/bin/perl
+
+use lib '.'; use lib 't';
+use SATest;
+
+use constant TEST_ENABLED => (-e 't/sql_based_whitelist.cf'
+                              || -e 'sql_based_whitelist.cf');
+
+use Test;
+
+BEGIN { plan tests => (TEST_ENABLED ? 10 : 0),
+        onfail => sub {
+            warn "\n\nNote: Failure may be due to an incorrect config";
+        }
+    };
+
+exit unless TEST_ENABLED;
+
+sa_t_init("sql_based_whitelist");
+
+open(CONFIG,"<sql_based_whitelist.cf");
+while (my $line = <CONFIG>) {
+  $dbconfig .= $line;
+}
+close(CONFIG);
+
+tstlocalrules ("
+auto_whitelist_factory Mail::SpamAssassin::SQLBasedAddrList
+$dbconfig
+");
+
+# ---------------------------------------------------------------------------
+
+%is_nonspam_patterns = (
+q{ Subject: Re: [SAtalk] auto-whitelisting}, 'subj',
+);
+%is_spam_patterns = (
+q{Subject: 4000           Your Vacation Winning !}, 'subj',
+);
+
+%is_spam_patterns2 = (
+q{ X-Spam-Status: Yes}, 'status',
+);
+
+
+%patterns = %is_nonspam_patterns;
+
+ok (sarun ("--remove-addr-from-whitelist whitelist_test\@whitelist.spamassassin.taint.org", \&patterns_run_cb));
+
+# 3 times, to get into the whitelist:
+ok (sarun ("-L -t < data/nice/002", \&patterns_run_cb));
+ok (sarun ("-L -t < data/nice/002", \&patterns_run_cb));
+ok (sarun ("-L -t < data/nice/002", \&patterns_run_cb));
+
+# Now check
+ok (sarun ("-L -t < data/nice/002", \&patterns_run_cb));
+ok_all_patterns();
+
+%patterns = %is_spam_patterns;
+ok (sarun ("-L -t < data/spam/004", \&patterns_run_cb));
+ok_all_patterns();
+
+%patterns = %is_spam_patterns2;
+ok (sarun ("-L -t < data/spam/007", \&patterns_run_cb));
+ok_all_patterns();

Added: incubator/spamassassin/trunk/tools/convert_awl_dbm_to_sql
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/tools/convert_awl_dbm_to_sql	Thu Jan 29 18:54:33 2004
@@ -0,0 +1,100 @@
+#!/usr/bin/perl
+
+# WARNING: This script is VERY rough and provided only as a template
+# for moving a DB based autowhitelist to a SQL based one.  You should
+# do backups and that sort of thing before attempting to use this
+# script.
+
+use strict;
+use Fcntl;
+
+use Getopt::Long;
+
+use DBI;
+
+use DB_File ;
+use vars qw( %h $k $v ) ;
+
+sub usage {
+  print "This program takes the following required arguments:\n";
+  print "--username <username> - This is who's whitelist you are loading.\n";
+  print "                        It should match exactly what spassassin or\n";
+  print "                        spamd will be using.\n";
+  print "--dsn <dsn> - This is the database DSN.  It should be in the form:\n";
+  print "              DBI:driver:database:hostname[:port]\n";
+  print "              Consult your database drivers docs for more info.\n";
+  print "--ok - Basically a sanity check that you understand how dangerous this script is.\n";
+  print "\n";
+  print "This program take the following optional arguments:\n";
+  print "--dbautowhitelist <path>  - path to the auto-whitelist you wish to\n";
+  print "                            convert. Default is to use \n";
+  print "                            \$ENV{HOME}/.spamassassin/auto-whitelist\n";
+  print "--sqlusername <username> - Needed if your DBI driver requires a username.\n";
+  print "--sqlpassword <password> - Needed if your DBI driver requires a password.\n";
+  print "\n\n";
+  print "WARNING: This script is VERY rough and not well tested.  You should\n";
+  print "use extreme caution when working with it.  Including backing up your\n";
+  print "data and all that other good stuff.\n";
+  print "Passing of the --ok flag means you read this warning.\n";
+  print "\n";
+  exit;
+}
+
+my %opt;
+
+GetOptions('dsn=s' => \$opt{'dsn'},
+           'sqlusername=s' => \$opt{'sqlusername'},
+           'sqlpassword=s' => \$opt{'sqlpassword'},
+           'dbautowhitelist=s' => \$opt{'dbautowhitelist'},
+           'username=s' => \$opt{'username'},
+           'help' => \$opt{'help'},
+           'ok' => \$opt{'ok'},
+           );
+
+if ($opt{'help'}) {
+  usage();
+}
+
+if (!$opt{'ok'}) {
+  usage();
+}
+
+
+if (!$opt{'username'} || !$opt{'dsn'}) {
+  usage();
+}
+
+my $db;
+if ($opt{'dbautowhitelist'}) {
+  $db = $opt{'dbautowhitelist'};
+}
+else {
+  $db = $ENV{HOME}."/.spamassassin/auto-whitelist";
+}
+
+tie %h, "DB_File",$db, O_RDONLY,0600
+    or die "Cannot open file $db: $!\n";
+
+my $dbh = DBI->connect($opt{'dsn'}, $opt{'sqlusername'}, $opt{'sqlpassword'})
+    or die $DBI::errstr;
+
+my $sth = $dbh->prepare("DELETE FROM awl WHERE username = ?");
+$sth->execute($opt{'username'});
+
+my $sth = $dbh->prepare("INSERT INTO awl (username,email,ip,count,totscore) VALUES (?,?,?,?,?)");
+
+my @k = grep(!/totscore$/,keys(%h));
+for my $key (@k) {
+  my $totscore = $h{"$key|totscore"};
+  my $count = $h{$key};
+  if(defined($totscore)) {
+    my ($email, $ip) = split(/\|ip=/, $key);
+
+    if ($email && $ip) {
+      my $rc = $sth->execute($opt{'username'}, $email, $ip, $count, $totscore);
+      printf "% 8.1f %15s  --  %s\n", $totscore/$count, (sprintf "(%.1f/%d)",$totscore/$count, $count), $key;
+    }
+  }
+}
+untie %h;
+$dbh->disconnect();

Added: incubator/spamassassin/trunk/tools/convert_bayes_dbm_to_sql
==============================================================================
--- (empty file)
+++ incubator/spamassassin/trunk/tools/convert_bayes_dbm_to_sql	Thu Jan 29 18:54:33 2004
@@ -0,0 +1,242 @@
+#!/usr/bin/perl -w
+
+# WARNING: This script is VERY rough and provided only as a template
+# for moving the DB based bayes files to a SQL database.  You should
+# do backups and that sort of thing before attempting to use this
+# script.
+
+# Also, no sort of locking is provided so it's suggested that you make
+# sure nothing is accessing the bayes files you are attempting to
+# convert.
+
+use strict;
+
+use DB_File;
+use DBI;
+
+use Getopt::Long;
+
+use vars qw( %toks_db %seen_db $opt_dbpath $opt_username $opt_dsn $opt_ok
+	     $opt_dbusername $opt_dbpassword $opt_help $last_atime_delta
+	     $last_expire $last_expire_reduce $ham_count $spam_count);
+
+
+# These are the magic tokens we use to track stuff in the DB.
+# The format is '^M^A^G^I^C' followed by any string you want.
+# None of the control chars will be in a real token.
+my $DB_VERSION_MAGIC_TOKEN         = "\015\001\007\011\003DBVERSION";
+my $LAST_ATIME_DELTA_MAGIC_TOKEN   = "\015\001\007\011\003LASTATIMEDELTA";
+my $LAST_EXPIRE_MAGIC_TOKEN        = "\015\001\007\011\003LASTEXPIRE";
+my $LAST_EXPIRE_REDUCE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIREREDUCE";
+my $LAST_JOURNAL_SYNC_MAGIC_TOKEN  = "\015\001\007\011\003LASTJOURNALSYNC";
+my $NEWEST_TOKEN_AGE_MAGIC_TOKEN   = "\015\001\007\011\003NEWESTAGE";
+my $NHAM_MAGIC_TOKEN               = "\015\001\007\011\003NHAM";
+my $NSPAM_MAGIC_TOKEN              = "\015\001\007\011\003NSPAM";
+my $NTOKENS_MAGIC_TOKEN            = "\015\001\007\011\003NTOKENS";
+my $OLDEST_TOKEN_AGE_MAGIC_TOKEN   = "\015\001\007\011\003OLDESTAGE";
+my $RUNNING_EXPIRE_MAGIC_TOKEN     = "\015\001\007\011\003RUNNINGEXPIRE";
+
+
+GetOptions("dbpath=s",
+           "username=s",
+           "dsn=s",
+           "dbusername=s",
+	   "dbpassword=s",
+           "help",
+           "ok");
+
+sub usage {
+  print "This program takes the following arguments:\n";
+  print "--username <username> - This is who's bayes data you are loading.\n";
+  print "                        It should match exactly what spassassin or\n";
+  print "                        spamd will be using.\n";
+  print "--dsn <dsn> - This is the database DSN.  It should be in the form:\n";
+  print "              DBI:driver:database:hostname[:port]\n";
+  print "              Consult your database drivers docs for more info.\n";
+  print "--dbpath <path>  - path to the bayes files you wish to\n";
+  print "                   convert. Default is to use \n";
+  print "                   \$ENV{HOME}/.spamassassin/bayes\n";
+  print "                   A _toks and _seen will be added to the given path.\n";
+  print "--dbusername <username> - Needed if your DBI driver requires a username.\n";
+  print "--dbpassword <password> - Needed if your DBI driver requires a password.\n";
+  print "--ok - Basically a sanity check that you understand how dangerous this script is.\n";
+  print "\n\n";
+  print "WARNING: This script is VERY rough and not well tested.  You should\n";
+  print "use extreme caution when working with it.  Including backing up your\n";
+  print "data and all that other good stuff.\n";
+  print "Passing of the --ok flag means you read this warning.\n";
+  print "\n";
+  exit;
+}
+
+usage() if ($opt_help);
+
+if (!$opt_ok) {
+  usage();
+}
+
+my $path = $opt_dbpath || $ENV{HOME}."/.spamassassin/bayes";
+my $dsn = $opt_dsn || "dbi:mysql:spamassassin:";
+
+my $username = $opt_username || `whoami`;
+chomp($username);
+
+my $toks_name = "${path}_toks";
+my $seen_name = "${path}_seen";
+
+print "Converting DBM bayes database to SQL database for $username.\n";
+
+my $dbh = DBI->connect($dsn, $opt_dbusername, $opt_dbpassword);
+
+unless ($dbh) {
+  print "Unable to connect to database ($dsn): ".DBI->errstr()."\n";
+  exit;
+}
+
+my ($varcount) = $dbh->selectrow_array("SELECT count(*) FROM bayes_vars WHERE username = '$username'");
+
+my ($tokcount) = $dbh->selectrow_array("SELECT count(*) FROM bayes_token WHERE username = '$username'");
+
+if ($varcount || $tokcount) {
+  print "User: $username has existing data, please remove then re-run.\n";
+  exit;
+}
+
+tie %toks_db, "DB_File", $toks_name, O_RDONLY, 0600
+  or die "Cannot open file $toks_name: $!\n";
+
+if ($toks_db{$DB_VERSION_MAGIC_TOKEN} != 2) {
+  print "This conversion script only works with version 2 bayes DBM files.\n";
+  exit;
+}
+
+my $sql = "INSERT INTO bayes_token (username, token, spam_count, ham_count, atime) values (?,?,?,?,?)";
+my $sth = $dbh->prepare($sql);
+
+my $tokens = 0;
+
+# Initalize a few variables in case we end up not finding them in the database.
+$last_atime_delta = 0;
+$last_expire = 0;
+$last_expire_reduce = 0;
+
+foreach my $key ( keys(%toks_db) ) {
+  next if ($key eq $DB_VERSION_MAGIC_TOKEN);
+  next if ($key eq $NHAM_MAGIC_TOKEN);
+  next if ($key eq $NSPAM_MAGIC_TOKEN);
+  next if ($key eq $RUNNING_EXPIRE_MAGIC_TOKEN);
+  next if ($key eq $NTOKENS_MAGIC_TOKEN);
+  next if ($key eq $LAST_JOURNAL_SYNC_MAGIC_TOKEN);
+  next if ($key eq $NEWEST_TOKEN_AGE_MAGIC_TOKEN);
+  next if ($key eq $OLDEST_TOKEN_AGE_MAGIC_TOKEN);
+
+  if ($key eq $LAST_ATIME_DELTA_MAGIC_TOKEN) {
+    $last_atime_delta = $toks_db{$LAST_ATIME_DELTA_MAGIC_TOKEN};
+    next;
+  }
+
+  if ($key eq $LAST_EXPIRE_MAGIC_TOKEN) {
+    $last_expire = $toks_db{$LAST_EXPIRE_MAGIC_TOKEN};
+    next;
+  }
+
+  if ($key eq $LAST_EXPIRE_REDUCE_MAGIC_TOKEN) {
+    $last_expire_reduce = $toks_db{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN};
+    next;
+  }
+
+  my ($spam, $ham, $atime) = &tok_unpack($toks_db{$key});
+  my $rc = $sth->execute($username, $key, $spam, $ham, $atime);
+  if ($rc) {
+    $tokens++;
+  }
+  else {
+    print "Error creating entry for: $key -- $spam -- $ham -- $atime\n";
+  }
+  $sth->finish();
+}
+
+tie %seen_db,  "DB_File", $seen_name, O_RDONLY, 0600
+  or die "Cannot open file $toks_name: $!\n";
+
+$ham_count = 0;
+$spam_count = 0;
+
+$sql = "INSERT INTO bayes_seen (username, msgid, flag) VALUES (?,?,?)";
+$sth = $dbh->prepare($sql);
+
+foreach my $key (keys(%seen_db)) {
+  my $msgid = $key;
+  my $flag = $seen_db{$key};
+
+  next if ($flag ne 'h' && $flag ne 's');
+
+  my $rc = $sth->execute($username, $key, $flag);
+
+  if ($rc) {
+    if ($flag eq 'h') {
+      $ham_count++;
+    }
+    elsif ($flag eq 's') {
+      $spam_count++;
+    }
+  }
+  else {
+    print "Error creating entry for: $msgid -- $flag\n";
+  }
+  $sth->finish();
+}
+
+print "Token Count: $tokens\n";
+print "Ham Count:   $ham_count\n";
+print "Spam Count:  $spam_count\n";
+print "\nNOTE: It's possible that the above numbers may not match up exactly with\n";
+print "\n      what an sa-learn --dump magic shows.  Not sure why that is, but\n";
+print "\n      as long as it's not a huge difference I wouldn't worry about it.\n";
+
+$sql = "INSERT INTO bayes_vars (username, spam_count, ham_count, last_expire, last_atime_delta, last_expire_reduce) VALUES (?,?,?,?,?,?)";
+
+$sth = $dbh->prepare($sql);
+
+my $rc = $sth->execute($username, $spam_count, $ham_count, $last_expire, $last_atime_delta, $last_expire_reduce);
+
+unless ($rc) {
+  print "Error updating bayes_vars: ".DBI->errstr()."\n";
+  exit;
+}
+
+print "Conversion done.\n";
+
+
+
+
+
+use constant FORMAT_FLAG        => 0xc0;        # 11000000
+use constant ONE_BYTE_FORMAT    => 0xc0;        # 11000000
+use constant TWO_LONGS_FORMAT   => 0x00;        # 00000000
+
+use constant ONE_BYTE_SSS_BITS  => 0x38;        # 00111000
+use constant ONE_BYTE_HHH_BITS  => 0x07;        # 00000111
+
+sub tok_unpack {
+  my ($value) = @_;
+  $value ||= 0;
+
+  my ($packed, $atime) = unpack("CV", $value);
+
+  if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) {
+    return (($packed & ONE_BYTE_SSS_BITS) >> 3,
+	    $packed & ONE_BYTE_HHH_BITS,
+	    $atime || 0);
+  }
+  elsif (($packed & FORMAT_FLAG) == TWO_LONGS_FORMAT) {
+    my ($packed, $ts, $th, $atime) = unpack("CVVV", $value);
+    return ($ts || 0, $th || 0, $atime || 0);
+  }
+  # other formats would go here...
+  else {
+    warn "unknown packing format for Bayes db, please re-learn: $packed";
+    return (0, 0, 0);
+  }
+}
+

Re: svn commit: rev 6353 - in incubator/spamassassin/trunk: . lib/Mail/SpamAssassin rules sql t tools

Posted by Michael Parker <pa...@pobox.com>.
On Fri, Jan 30, 2004 at 02:54:34AM -0000, jm@apache.org wrote:
> +use constant SCAN_USING_PERL_CODE_TEST => 1;
> +# jm: off! not working for some reason.   Mind you, this is
> +# not a supported way to call these APIs!  so no biggie
> +
> +if (SCAN_USING_PERL_CODE_TEST) {
> +$sa = create_saobj();
> +
> +$sa->init();
> +
> +open(MAIL,"< ../sample-nonspam.txt");
> +
> +$raw_message = do {
> +  local $/;
> +  <MAIL>;
> +};
> +
> +close(MAIL);
.....

Justin,

Can you elaborate a little on this?  I know it was broken because of
the NoMailAudit stuff, but is it still broken?  It seems to work fine
on my systems.

Michael