You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by pa...@apache.org on 2004/04/29 03:03:00 UTC

svn commit: rev 10394 - in incubator/spamassassin/trunk: . lib/Mail/SpamAssassin lib/Mail/SpamAssassin/BayesStore sql t

Author: parker
Date: Wed Apr 28 18:02:59 2004
New Revision: 10394

Removed:
   incubator/spamassassin/trunk/sql/bayes_sqlite.sql
Modified:
   incubator/spamassassin/trunk/INSTALL
   incubator/spamassassin/trunk/MANIFEST
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore/DBM.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore/SQL.pm
   incubator/spamassassin/trunk/sql/README.bayes
   incubator/spamassassin/trunk/sql/bayes_mysql.sql
   incubator/spamassassin/trunk/sql/bayes_pg.sql
   incubator/spamassassin/trunk/t/bayesdbm.t
   incubator/spamassassin/trunk/t/bayesdbm_flock.t
   incubator/spamassassin/trunk/t/bayessql.t
Log:
Bug 3225: Bayes Optimization

Many optimizations for bayes storage.

1) Implement a tok_get_all that fetches multiple tokens at one time, instead of one at a time.

2) Modify SQL storage so that it keeps a running tally of the total
token count and the oldest/newest token age instead of using expensive
select count(*)/min(atime)/max(atime) calls.

3) In SQL, clean up tokens whose spam_count and ham_count both equal
0.

4) Added some basic caching in SQL code to avoid doing some SQL
operations.

5) Switched SQL backend to use userids instead of usernames for keys in
all of its tables.

6) Removed some dead code.

7) We now store tokens as the lower 40 bits of a SHA1 hash.

8) Updated the backup/restore code to support the new database format.



Modified: incubator/spamassassin/trunk/INSTALL
==============================================================================
--- incubator/spamassassin/trunk/INSTALL	(original)
+++ incubator/spamassassin/trunk/INSTALL	Wed Apr 28 18:02:59 2004
@@ -278,6 +278,9 @@
     The Digest::SHA1 module will speed up some cryptographic hash tests,
     but is not required (although it may be required by another module).
 
+    If you make use of the bayes rules this module is strongly
+    recommended.
+
     If you use Debian, you can get Digest::SHA1 from the libdigest-sha1-perl
     package.
 

Modified: incubator/spamassassin/trunk/MANIFEST
==============================================================================
--- incubator/spamassassin/trunk/MANIFEST	(original)
+++ incubator/spamassassin/trunk/MANIFEST	Wed Apr 28 18:02:59 2004
@@ -182,7 +182,6 @@
 sql/awl_mysql.sql
 sql/bayes_mysql.sql
 sql/bayes_pg.sql
-sql/bayes_sqlite.sql
 t/SATest.pl
 t/SATest.pm
 t/basic_lint.t

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm	Wed Apr 28 18:02:59 2004
@@ -52,7 +52,7 @@
 
 use Mail::SpamAssassin;
 use Mail::SpamAssassin::PerMsgStatus;
-use Mail::SpamAssassin::SHA1 qw(sha1_hex);
+use Mail::SpamAssassin::SHA1 qw(sha1 sha1_hex);
 
 use vars qw{
   @ISA
@@ -354,10 +354,11 @@
   }
 
   # Go ahead and uniq the array, skip null tokens (can happen sometimes)
-  my %tokens = map { $_ => 1 } grep(length, @tokens);
+  # generate an SHA1 hash and take the lower 40 bits as our token
+  my %tokens = map { substr(sha1($_), -5) => { 'raw_token' => $_ } } grep(length, @tokens);
 
   # return the keys == tokens ...
-  keys %tokens;
+  return \%tokens;
 }
 
 sub tokenize_line {
@@ -684,9 +685,9 @@
 ###########################################################################
 
 sub ignore_message {
-  my ($Bayes,$PMS) = @_;
+  my ($self,$PMS) = @_;
 
-  return 0 unless $Bayes->{use_ignores};
+  return 0 unless $self->{use_ignores};
 
   my $ignore = $PMS->check_from_in_list('bayes_ignore_from')
     		|| $PMS->check_to_in_list('bayes_ignore_to');
@@ -808,11 +809,13 @@
   #
   $msgatime = time if ( $msgatime - time > 86400 );
 
-  for ($self->tokenize ($msg, $msgdata)) {
+  my $tokens = $self->tokenize($msg, $msgdata);
+
+  for my $token (keys %{$tokens}) {
     if ($isspam) {
-      $self->{store}->tok_count_change (1, 0, $_, $msgatime);
+      $self->{store}->tok_count_change (1, 0, $token, $msgatime);
     } else {
-      $self->{store}->tok_count_change (0, 1, $_, $msgatime);
+      $self->{store}->tok_count_change (0, 1, $token, $msgatime);
     }
   }
 
@@ -911,11 +914,13 @@
     $self->{store}->nspam_nham_change (0, -1);
   }
 
-  for ($self->tokenize ($msg, $msgdata)) {
+  my $tokens = $self->tokenize($msg, $msgdata);
+
+  for my $token (keys %{$tokens}) {
     if ($isspam) {
-      $self->{store}->tok_count_change (-1, 0, $_);
+      $self->{store}->tok_count_change (-1, 0, $token);
     } else {
-      $self->{store}->tok_count_change (0, -1, $_);
+      $self->{store}->tok_count_change (0, -1, $token);
     }
   }
 
@@ -1013,17 +1018,14 @@
 
 # compute the probability that that token is spammish
 sub compute_prob_for_token {
-  my ($self, $token, $ns, $nn, $s, $n, $atime) = @_;
+  my ($self, $token, $ns, $nn, $s, $n) = @_;
 
   # we allow the caller to give us the token information, just
   # to save a potentially expensive lookup
-  if (!defined($s) || !defined($n) || !defined($atime)) {
-    ($s, $n, $atime) = $self->{store}->tok_get ($token);
+  if (!defined($s) || !defined($n)) {
+    ($s, $n, undef) = $self->{store}->tok_get ($token);
   }
 
-  # store for use by header tags which list Bayes info
-  $self->{tok_raw_data}->{$token} = { s=>$s, n=>$n, atime=>$atime };
-
   return if ($s == 0 && $n == 0);
 
   if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
@@ -1133,9 +1135,7 @@
   my ($self, $permsgstatus, $msg) = @_;
   my $score;
 
-  if( $self->ignore_message($permsgstatus) ) {
-    goto skip;
-  }
+  goto skip if ($self->ignore_message($permsgstatus));
 
   goto skip unless $self->is_scan_available();
 
@@ -1149,22 +1149,22 @@
 
   my $msgdata = $self->get_msgdata_from_permsgstatus ($permsgstatus);
 
-  my $pw;
-  my @tokens = $self->tokenize ($msg, $msgdata);
+  my $msgtokens = $self->tokenize($msg, $msgdata);
+  my $tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});
 
-  # keep a temporary cache for tok_get() values, in case it's used
-  # for tokens; this is populated in compute_prob_for_token()
-  $self->{tok_raw_data} = { };
-
-  # Figure out our probabilities for the message tokens
-  my %pw = map {
-      $pw = $self->compute_prob_for_token ($_, $ns, $nn);
-      if (!defined $pw) {
-	();		# exit map()
-      } else {
-	($_ => $pw);
-      }
-  } @tokens;
+  my %pw;
+
+  foreach my $tokendata (@{$tokensdata}) {
+    my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};
+    my $prob = $self->compute_prob_for_token($token, $ns, $nn, $tok_spam, $tok_ham);
+    if (defined($prob)) {
+      $pw{$token} = $prob;
+      $msgtokens->{$token}->{pw} = $prob;
+      $msgtokens->{$token}->{spam_count} = $tok_spam;
+      $msgtokens->{$token}->{ham_count} = $tok_ham;
+      $msgtokens->{$token}->{atime} = $atime;
+    }
+  }
 
   # If none of the tokens were found in the DB, we're going to skip
   # this message...
@@ -1173,7 +1173,7 @@
     goto skip;
   }
 
-  my $tcount_total = @tokens;
+  my $tcount_total = keys %{$msgtokens};
   my $tcount_learned = keys %pw;
 
   # Figure out the message receive time (used as atime below)
@@ -1205,22 +1205,20 @@
     # SPAMMYTOKENS tags that aren't there or collecting data that
     # won't be used?  Just collecting the data is certainly simpler.
     #
-    my $tokgot = $self->{tok_raw_data}->{$_} || { };
-    my $s = $tokgot->{s};
-    my $n = $tokgot->{n};
-    my $a = $tokgot->{atime};
-    push @$tinfo_spammy, [$_,$pw,$s,$n,$a] if $pw >= 0.5 && ++$tcount_spammy;
-    push @$tinfo_hammy,  [$_,$pw,$s,$n,$a] if $pw <  0.5 && ++$tcount_hammy;
+    my $raw_token = $msgtokens->{$_}->{raw_token} || "(unknown)";
+    my $s = $msgtokens->{$_}->{spam_count};
+    my $n = $msgtokens->{$_}->{ham_count};
+    my $a = $msgtokens->{$_}->{atime};
+    push @$tinfo_spammy, [$raw_token,$pw,$s,$n,$a] if $pw >= 0.5 && ++$tcount_spammy;
+    push @$tinfo_hammy,  [$raw_token,$pw,$s,$n,$a] if $pw <  0.5 && ++$tcount_hammy;
 
     push (@sorted, $pw);
 
     # update the atime on this token, it proved useful
     $self->{store}->tok_touch ($_, $msgatime);
 
-    dbg ("bayes token '$_' => $pw");
+    dbg ("bayes token '$raw_token' => $pw");
   }
-
-  delete $self->{tok_raw_data};         # don't need this anymore
 
   if (!@sorted || (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 && 
 	$#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE))

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore.pm	Wed Apr 28 18:02:59 2004
@@ -414,6 +414,9 @@
   # If force expire was called, do the expire no matter what.
   return 1 if ($self->{bayes}->{main}->{learn_force_expire});
 
+  # if config says not to auto expire then no need to continue
+  return 0 if ($self->{bayes}->{main}->{conf}->{bayes_auto_expire} == 0);
+
   # is the database too small for expiry?  (Do *not* use "scalar keys",
   # as this will iterate through the entire db counting them!)
   my @vars = $self->get_storage_variables();
@@ -435,7 +438,6 @@
 
   my $conf = $self->{bayes}->{main}->{conf};
   if ($ntoks <= 100000 ||			# keep at least 100k tokens
-      $conf->{bayes_auto_expire} == 0 ||	# config says don't expire
       $self->{expiry_max_db_size} > $ntoks ||	# not enough tokens to cause an expire
       $vars[10]-$vars[5] < 43200 ||		# delta between oldest and newest < 12h
       $self->{db_version} < $self->DB_VERSION # ignore old db formats
@@ -615,6 +617,21 @@
 sub tok_get {
   my ($self, $token) = @_;
   die "tok_get: not implemented\n";
+}
+
+=head2 tok_get_all
+
+public instance (\@) tok_get_all (@ @tokens)
+
+Description:
+This method retrieves the specified tokens (C<@tokens>) from storage and returns
+an array ref of arrays, each holding the token, spam count, ham count and last access time.
+
+=cut
+
+sub tok_get_all {
+  my ($self, $tokens) = @_;
+  die "tok_get_all: not implemented\n";
 }
 
 =head2 tok_count_change

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore/DBM.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore/DBM.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore/DBM.pm	Wed Apr 28 18:02:59 2004
@@ -23,6 +23,7 @@
 use Mail::SpamAssassin;
 use Mail::SpamAssassin::Util;
 use Mail::SpamAssassin::BayesStore;
+use Mail::SpamAssassin::SHA1 qw(sha1);
 use File::Basename;
 use File::Spec;
 use File::Path;
@@ -100,7 +101,7 @@
 
   my $self = $class->SUPER::new(@_);
 
-  $self->{supported_db_version} = 2;
+  $self->{supported_db_version} = 3;
 
   $self->{already_tied} = 0;
   $self->{is_locked} = 0;
@@ -422,9 +423,82 @@
     $self->{db_version} = 2; # need this for other functions which check
   }
 
-  # if ( $self->{db_version} == 2 ) {
+  # Version 3 of the database converts all existing tokens to SHA1 hashes
+  if ( $self->{db_version} == 2 ) {
+    dbg ("bayes: upgrading database format from v".$self->{db_version}." to v3");
+    my $DB_NSPAM_MAGIC_TOKEN		  = "\015\001\007\011\003NSPAM";
+    my $DB_NHAM_MAGIC_TOKEN		  = "\015\001\007\011\003NHAM";
+    my $DB_NTOKENS_MAGIC_TOKEN		  = "\015\001\007\011\003NTOKENS";
+    my $DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN	  = "\015\001\007\011\003OLDESTAGE";
+    my $DB_LAST_EXPIRE_MAGIC_TOKEN	  = "\015\001\007\011\003LASTEXPIRE";
+    my $DB_NEWEST_TOKEN_AGE_MAGIC_TOKEN	  = "\015\001\007\011\003NEWESTAGE";
+    my $DB_LAST_JOURNAL_SYNC_MAGIC_TOKEN  = "\015\001\007\011\003LASTJOURNALSYNC";
+    my $DB_LAST_ATIME_DELTA_MAGIC_TOKEN	  = "\015\001\007\011\003LASTATIMEDELTA";
+    my $DB_LAST_EXPIRE_REDUCE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIREREDUCE";
+
+    # remember when we started ...
+    my $started = time;
+
+    # use O_EXCL to avoid races (bonus paranoia, since we should be locked
+    # anyway)
+    my %new_toks;
+    my $umask = umask 0;
+    tie %new_toks, "DB_File", "${name}.new", O_RDWR|O_CREAT|O_EXCL,
+          (oct ($main->{conf}->{bayes_file_mode}) & 0666) or return 0;
+    umask $umask;
+
+    # add the magic tokens to the new db.
+    $new_toks{$NSPAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NSPAM_MAGIC_TOKEN};
+    $new_toks{$NHAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NHAM_MAGIC_TOKEN};
+    $new_toks{$NTOKENS_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NTOKENS_MAGIC_TOKEN};
+    $new_toks{$DB_VERSION_MAGIC_TOKEN} = 3; # we're now a DB version 3 file
+    $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $self->{db_toks}->{$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN};
+    $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = $self->{db_toks}->{$DB_LAST_EXPIRE_MAGIC_TOKEN};
+    $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NEWEST_TOKEN_AGE_MAGIC_TOKEN};
+    $new_toks{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $self->{db_toks}->{$DB_LAST_JOURNAL_SYNC_MAGIC_TOKEN};
+    $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = $self->{db_toks}->{$DB_LAST_ATIME_DELTA_MAGIC_TOKEN};
+    $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} =$self->{db_toks}->{$DB_LAST_EXPIRE_REDUCE_MAGIC_TOKEN};
+
+    # deal with the data tokens
+    while (my ($tok, $packed) = each %{$self->{db_toks}}) {
+      next if ($tok =~ /^\015\001\007\011\003/); # skip magic tokens
+      my $tok_hash = substr(sha1($tok), -5);
+      $new_toks{$tok_hash} = $packed;
+    }
+
+    # now untie so we can do renames
+    untie %{$self->{db_toks}};
+    untie %new_toks;
+
+    # This is the critical phase (moving files around), so don't allow
+    # it to be interrupted.
+    local $SIG{'INT'} = 'IGNORE';
+    local $SIG{'TERM'} = 'IGNORE';
+    local $SIG{'HUP'} = 'IGNORE' if (!Mail::SpamAssassin::Util::am_running_on_windows());
+
+    # now rename in the new one.  Try several extensions
+    for my $ext (@DB_EXTENSIONS) {
+      my $newf = $name.'.new'.$ext;
+      my $oldf = $name.$ext;
+      next unless (-f $newf);
+      if (!rename ($newf, $oldf)) {
+        warn "rename $newf to $oldf failed: $!\n";
+        return 0;
+      }
+    }
+
+    # re-tie to the new db in read-write mode ...
+    tie %{$self->{db_toks}},"DB_File", $name, O_RDWR|O_CREAT,
+	 (oct ($main->{conf}->{bayes_file_mode}) & 0666) or return 0;
+
+    dbg ("bayes: upgraded database format from v".$self->{db_version}." to v3 in ".(time - $started)." seconds");
+
+    $self->{db_version} = 3; # need this for other functions which check
+  }
+
+  # if ( $self->{db_version} == 3 ) {
   #   ...
-  #   $self->{db_version} = 3; # need this for other functions which check
+  #   $self->{db_version} = 4; # need this for other functions which check
   # }
   # ... and so on.
 
@@ -672,6 +746,17 @@
   $self->tok_unpack ($self->{db_toks}->{$tok});
 }
  
+sub tok_get_all {
+  my ($self, @tokens) = @_;
+
+  my @tokensdata;
+  foreach my $token (@tokens) {
+    my ($tok_spam, $tok_ham, $atime) = $self->tok_unpack($self->{db_toks}->{$token});
+    push(@tokensdata, [$token, $tok_spam, $tok_ham, $atime]);
+  }
+  return \@tokensdata;
+}
+
 # return the magic tokens in a specific order:
 # 0: scan count base
 # 1: number of spam
@@ -690,9 +775,10 @@
   my @values;
 
   my $db_ver = $self->{db_toks}->{$DB_VERSION_MAGIC_TOKEN};
+
   if ( !$db_ver || $db_ver =~ /\D/ ) { $db_ver = 0; }
 
-  if ( $db_ver == 2 ) {
+  if ( $db_ver >= 2 ) {
     my $DB2_LAST_ATIME_DELTA_MAGIC_TOKEN	= "\015\001\007\011\003LASTATIMEDELTA";
     my $DB2_LAST_EXPIRE_MAGIC_TOKEN		= "\015\001\007\011\003LASTEXPIRE";
     my $DB2_LAST_EXPIRE_REDUCE_MAGIC_TOKEN	= "\015\001\007\011\003LASTEXPIREREDUCE";
@@ -711,7 +797,7 @@
       $self->{db_toks}->{$DB2_NTOKENS_MAGIC_TOKEN},
       $self->{db_toks}->{$DB2_LAST_EXPIRE_MAGIC_TOKEN},
       $self->{db_toks}->{$DB2_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
-      2,
+      $db_ver,
       $self->{db_toks}->{$DB2_LAST_JOURNAL_SYNC_MAGIC_TOKEN},
       $self->{db_toks}->{$DB2_LAST_ATIME_DELTA_MAGIC_TOKEN},
       $self->{db_toks}->{$DB2_LAST_EXPIRE_REDUCE_MAGIC_TOKEN},
@@ -780,11 +866,11 @@
     # We have the value already, so just unpack it.
     my ($ts, $th, $atime) = $self->tok_unpack ($tokvalue);
     
-    my $prob = $self->{bayes}->compute_prob_for_token($tok, $vars[1], $vars[2],
-						      $ts, $th, $atime);
+    my $prob = $self->{bayes}->compute_prob_for_token($tok, $vars[1], $vars[2], $ts, $th);
     $prob ||= 0.5;
     
-    printf $template,$prob,$ts,$th,$atime,$tok;
+    my $encoded_tok = unpack("H*",$tok);
+    printf $template,$prob,$ts,$th,$atime,$encoded_tok;
   }
 }
 
@@ -827,7 +913,10 @@
   $atime = 0 unless defined $atime;
 
   if ($self->{bayes}->{main}->{learn_to_journal}) {
-    $self->defer_update ("c $ds $dh $atime $tok");
+    # we can't store the SHA1 binary value in the journal, so convert it
+    # to a printable value that can be converted back later
+    my $encoded_tok = unpack("H*",$tok);
+    $self->defer_update ("c $ds $dh $atime $encoded_tok");
   } else {
     $self->tok_sync_counters ($ds, $dh, $atime, $tok);
   }
@@ -851,7 +940,10 @@
 
 sub tok_touch {
   my ($self, $tok, $atime) = @_;
-  $self->defer_update ("t $atime $tok");
+  # we can't store the SHA1 binary value in the journal, so convert it
+  # to a printable value that can be converted back later
+  my $encoded_tok = unpack("H*", $tok);
+  $self->defer_update ("t $atime $encoded_tok");
 }
 
 sub defer_update {
@@ -1026,9 +1118,11 @@
       $total_count++;
 
       if (/^t (\d+) (.+)$/) { # Token timestamp update, cache resultant entries
-	$tokens{$2} = $1+0 if ( !exists $tokens{$2} || $1+0 > $tokens{$2} );
+	my $tok = pack("H*",$2);
+	$tokens{$tok} = $1+0 if ( !exists $tokens{$tok} || $1+0 > $tokens{$tok} );
       } elsif (/^c (-?\d+) (-?\d+) (\d+) (.+)$/) { # Add/full token update
-	$self->tok_sync_counters ($1+0, $2+0, $3+0, $4);
+	my $tok = pack("H*",$4);
+	$self->tok_sync_counters ($1+0, $2+0, $3+0, $tok);
 	$count++;
       } elsif (/^n (-?\d+) (-?\d+)$/) { # update ham/spam count
 	$self->tok_sync_nspam_nham ($1+0, $2+0);
@@ -1304,8 +1398,8 @@
     next if ($tok =~ MAGIC_RE); # skip magic tokens
 
     my ($ts, $th, $atime) = $self->tok_unpack($packed);
-
-    print "t\t$ts\t$th\t$atime\t$tok\n";
+    my $encoded_token = unpack("H*",$tok);
+    print "t\t$ts\t$th\t$atime\t$encoded_token\n";
   }
 
   while (my ($msgid, $flag) = each %{$self->{db_seen}}) {
@@ -1391,6 +1485,16 @@
     return 0;
   }
 
+  unless ($db_version == 2 || $db_version == 3) {
+    dbg("bayes: Database Version $db_version is unsupported, must be version 2 or 3.");
+    untie %new_toks;
+    untie %new_seen;
+    unlink $tmptoksdbname;
+    unlink $tmpseendbname;
+    $self->untie_db();
+    return 0;
+  }
+
   while (my $line = <DUMPFILE>) {
     chomp($line);
     $line_count++;
@@ -1447,6 +1551,16 @@
       if ($token_warn_p) {
 	dbg("bayes: Token ($token) has the following warnings:\n".join("\n",@warnings));
       }
+
+      # database versions < 3 did not encode their token values
+      if ($db_version < 3) {
+	$token = substr(sha1($token), -5);
+      }
+      else {
+	# turn unpacked binary token back into binary value
+	$token = pack("H*",$token);
+      }
+
       $new_toks{$token} = $self->tok_pack($spam_count, $ham_count, $atime);
       if ($atime < $oldest_token_age) {
 	$oldest_token_age = $atime;
@@ -1576,7 +1690,7 @@
   $value ||= 0;
 
   my ($packed, $atime);
-  if ( $self->{db_version} == 2 || $self->{db_version} == 1 ) {
+  if ( $self->{db_version} >= 1 ) {
     ($packed, $atime) = unpack("CV", $value);
   }
   elsif ( $self->{db_version} == 0 ) {
@@ -1590,10 +1704,7 @@
   }
   elsif (($packed & FORMAT_FLAG) == TWO_LONGS_FORMAT) {
     my ($packed, $ts, $th, $atime);
-    if ( $self->{db_version} == 2 ) {
-      ($packed, $ts, $th, $atime) = unpack("CVVV", $value);
-    }
-    elsif ( $self->{db_version} == 1 ) {
+    if ( $self->{db_version} >= 1 ) {
       ($packed, $ts, $th, $atime) = unpack("CVVV", $value);
     }
     elsif ( $self->{db_version} == 0 ) {

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore/SQL.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore/SQL.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore/SQL.pm	Wed Apr 28 18:02:59 2004
@@ -32,6 +32,7 @@
 use bytes;
 
 use Mail::SpamAssassin::BayesStore;
+use Mail::SpamAssassin::SHA1 qw(sha1);
 
 use vars qw( @ISA );
 
@@ -61,7 +62,7 @@
 
   my $self = $class->SUPER::new(@_);
 
-  $self->{supported_db_version} = 2;
+  $self->{supported_db_version} = 3;
 
   if (!$self->{bayes}->{conf}->{bayes_sql_dsn}) {
     dbg("bayes: invalid config, must set bayes_sql_dsn config variable.\n");
@@ -91,6 +92,7 @@
     }
   }
   dbg("bayes: Using username: ".$self->{_username});
+
   return $self;
 }
 
@@ -206,16 +208,16 @@
   my %delta = (); # use a hash since an array is going to be very sparse
 
   return %delta unless (defined($self->{_dbh}));
-  
+
   my $sql = "SELECT count(*)
                FROM bayes_token
-              WHERE username = ?
+              WHERE id = ?
                 AND (? - atime) > ?";
 
   my $sth = $self->{_dbh}->prepare_cached($sql);
     
   for (my $i = 1; $i <= $max_expire_mult; $i<<=1) {
-    my $rc = $sth->execute($self->{_username}, $newest_atime, $start * $i);
+    my $rc = $sth->execute($self->{_userid}, $newest_atime, $start * $i);
 
     unless ($rc) {
       dbg("bayes: calculate_expire_delta: SQL Error: ".$self->{_dbh}->errstr());
@@ -254,33 +256,61 @@
   my $too_old = $vars[10] - $newdelta; # tooold = newest - delta
 
   # if token atime > newest, reset to newest ...
-  my $sql = "UPDATE bayes_token SET atime=? WHERE username = ? and atime > ?";
-  my $rows = $self->{_dbh}->do($sql, undef, $vars[10], $self->{_username}, $vars[10]);
+  my $sql = "UPDATE bayes_token SET atime = ?
+              WHERE id  = ?
+                AND atime > ?";
+
+  my $rows = $self->{_dbh}->do($sql, undef, $vars[10], $self->{_userid}, $vars[10]);
+
   unless (defined($rows)) {
-    dbg("bayes: reset tokens in future: SQL Error: ".$self->{_dbh}->errstr());
+    dbg("bayes: token_expiration: SQL Error: ".$self->{_dbh}->errstr());
     return 0;
   }
 
+  $sql = "DELETE from bayes_token
+           WHERE id = ?
+             AND atime < ?";
+
   # Do the expire
-  $sql = "DELETE from bayes_token WHERE username = ? and atime < ?";
+  $sql = "DELETE from bayes_token WHERE id = ? and atime < ?";
 
-  $rows = $self->{_dbh}->do($sql, undef, $self->{_username}, $too_old);
+  $rows = $self->{_dbh}->do($sql, undef, $self->{_userid}, $too_old);
 
   unless (defined($rows)) {
-    dbg("bayes: actual_expire: SQL Error: ".$self->{_dbh}->errstr());
+    dbg("bayes: token_expiration: SQL Error: ".$self->{_dbh}->errstr());
     return 0;
   }
 
   my $deleted = $rows;
 
-  # We've chosen a new atime delta if we've gotten here, so record it for posterity.
-  $self->_set_last_atime_delta($newdelta);
+  $sql = "UPDATE bayes_vars SET token_count = token_count - ?,
+                                last_expire = ?,
+                                last_atime_delta = ?,
+                                last_expire_reduce = ?
+           WHERE id = ?";
+
+  $rows = $self->{_dbh}->do($sql, undef, $deleted, time(), $newdelta, $deleted, $self->{_userid});
+
+  unless (defined($rows)) {
+    dbg("bayes: token_expiration: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  # Now lets update the oldest_token_age value, shouldn't need to worry about newest_token_age
+  # slight race condition here, but the chance is small that we'll insert a new token with
+  # such an old atime
+  my $oldest_token_age = $self->_get_oldest_token_age();
+
+  $sql = "UPDATE bayes_vars SET oldest_token_age = ? WHERE id = ?";
 
-  # The rest of these have been modified, so replace as necessary.
-  $self->set_last_expire(time());
-  $self->_set_last_expire_reduce($deleted);
+  $rows = $self->{_dbh}->do($sql, undef, $oldest_token_age, $self->{_userid});
 
-  my $kept = $self->_get_token_count();
+  unless (defined($rows)) {
+    dbg("bayes: token_expiration: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  my $kept = $vars[3] - $deleted;
 
   $num_hapaxes = $self->_get_num_hapaxes() if ($opts->{verbose});
   $num_lowfreq = $self->_get_num_lowfreq() if ($opts->{verbose});
@@ -324,7 +354,9 @@
 
   return undef unless (defined($self->{_dbh}));
  
-  my $sql = "SELECT flag FROM bayes_seen WHERE username = ? AND msgid = ?";
+  my $sql = "SELECT flag FROM bayes_seen
+              WHERE id = ?
+                AND msgid = ?";
 
   my $sth = $self->{_dbh}->prepare_cached($sql);
 
@@ -333,7 +365,7 @@
     return undef;
   }
 
-  my $rc = $sth->execute($self->{_username}, $msgid);
+  my $rc = $sth->execute($self->{_userid}, $msgid);
   
   unless ($rc) {
     dbg("bayes: seen_get: SQL Error: ".$self->{_dbh}->errstr());
@@ -365,23 +397,17 @@
   
   return 0 unless (defined($self->{_dbh}));
 
-  my $sql = "INSERT INTO bayes_seen (username, msgid, flag) VALUES (?,?,?)";
-  
-  my $sth = $self->{_dbh}->prepare_cached($sql);
+  my $sql = "INSERT INTO bayes_seen (id, msgid, flag)
+             VALUES (?,?,?)";
   
-  unless (defined($sth)) {
-      dbg("bayes: seen_put: SQL Error: ".$self->{_dbh}->errstr());
-      return 0;
-  }
-
-  my $rc = $sth->execute($self->{_username}, $msgid, $flag);
+  my $rows = $self->{_dbh}->do($sql,
+			       undef,
+			       $self->{_userid}, $msgid, $flag);
   
-  unless ($rc) {
-      dbg("bayes: seen_put: SQL Error: ".$self->{_dbh}->errstr());
-      return 0;
+  unless (defined($rows)) {
+    dbg("bayes: seen_put: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
   }
-  
-  $sth->finish();
 
   dbg("bayes: seen ($msgid) put");
   return 1;
@@ -403,24 +429,19 @@
 
   return 0 unless (defined($self->{_dbh}));
 
-  my $sql = "DELETE FROM bayes_seen WHERE username = ? AND msgid = ?";
+  my $sql = "DELETE FROM bayes_seen
+              WHERE id = ?
+                AND msgid = ?";
   
-  my $sth = $self->{_dbh}->prepare_cached($sql);
-
-  unless (defined($sth)) {
-      dbg("bayes: seen_delete: SQL Error: ".$self->{_dbh}->errstr());
-      return 0;
-  }
-
-  my $rc = $sth->execute($self->{_username}, $msgid);
+  my $rows = $self->{_dbh}->do($sql,
+			       undef,
+			       $self->{_userid}, $msgid);
 
-  unless ($rc) {
-      dbg("bayes: seen_delete: SQL Error: ".$self->{_dbh}->errstr());
-      return 0;
+  unless (defined($rows)) {
+    dbg("bayes: seen_delete: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
   }
 
-  $sth->finish();
-
   return 1;
 }
 
@@ -464,10 +485,11 @@
 
   return (0,0,0,0,0,0,0,0,0,0,0) unless (defined($self->{_dbh}));
 
-  my $sql = "SELECT spam_count, ham_count, last_expire,
-                    last_atime_delta, last_expire_reduce
+  my $sql = "SELECT spam_count, ham_count, token_count, last_expire,
+                    last_atime_delta, last_expire_reduce, oldest_token_age,
+                    newest_token_age
                FROM bayes_vars
-              WHERE username = ?";
+              WHERE id = ?";
 
   my $sth = $self->{_dbh}->prepare_cached($sql);
 
@@ -476,21 +498,19 @@
     return (0,0,0,0,0,0,0,0,0,0,0);
   }
 
-  my $rc = $sth->execute($self->{_username});
+  my $rc = $sth->execute($self->{_userid});
 
   unless ($rc) {
     dbg("bayes: get_storage_variables: SQL Error: ".$self->{_dbh}->errstr());
     return (0,0,0,0,0,0,0,0,0,0,0);
   }
 
-  my ($spam_count, $ham_count, $last_expire,
-      $last_atime_delta, $last_expire_reduce) = $sth->fetchrow_array();
+  my ($spam_count, $ham_count, $token_count,
+      $last_expire, $last_atime_delta, $last_expire_reduce,
+      $oldest_token_age, $newest_token_age) = $sth->fetchrow_array();
 
   $sth->finish();
 
-  my $token_count = $self->_get_token_count();
-  my $oldest_token_age = $self->_get_oldest_token_age();
-  my $newest_token_age = $self->_get_newest_token_age();
   my $db_ver = $self->DB_VERSION;
 
   @values = (
@@ -507,10 +527,6 @@
              $newest_token_age
              );
 
-  foreach ( @values ) {
-    if ( !$_ || $_ =~ /\D/ ) { $_ = 0; }
-  }
-
   return @values;
 }
 
@@ -529,13 +545,12 @@
 
   return unless (defined($self->{_dbh}));
 
-  # 0/0 tokens don't count
-  # since ordering is check here, order the tokens
-  my $sql = "SELECT token, spam_count, ham_count, atime
+  # 0/0 tokens don't count, but in theory we shouldn't have any
+  # use RPAD to make sure we get trailing spaces in the token value
+  my $sql = "SELECT RPAD(token,5,' '), spam_count, ham_count, atime
                FROM bayes_token
-              WHERE username = ?
-                AND (spam_count > 0 OR ham_count > 0)
-             ORDER BY token";
+              WHERE id = ?
+                AND (spam_count > 0 OR ham_count > 0)";
 
   my $sth = $self->{_dbh}->prepare($sql);
 
@@ -544,7 +559,7 @@
     return;
   }
 
-  my $rc = $sth->execute($self->{_username});
+  my $rc = $sth->execute($self->{_userid});
 
   unless ($rc) {
     dbg("bayes: dump_db_toks: SQL Error: ".$self->{_dbh}->errstr());
@@ -553,11 +568,12 @@
 
   while (my ($token, $spam_count, $ham_count, $atime) = $sth->fetchrow_array()) {
     my $prob = $self->{bayes}->compute_prob_for_token($token, $vars[1], $vars[2],
-						      $spam_count, $ham_count,
-						      $atime);
+						      $spam_count, $ham_count);
     $prob ||= 0.5;
+
+    my $encoded_token = unpack("H*", $token);
     
-    printf $template,$prob,$spam_count,$ham_count,$atime,$token;
+    printf $template,$prob,$spam_count,$ham_count,$atime,$encoded_token;
   }
 
   $sth->finish();
@@ -581,24 +597,18 @@
 
   return 0 unless (defined($self->{_dbh}));
 
-  my $sql = "UPDATE bayes_vars SET last_expire = ? WHERE username = ?";
+  my $sql = "UPDATE bayes_vars SET last_expire = ? WHERE id = ?";
  
-  my $sth = $self->{_dbh}->prepare_cached($sql);
-
-  unless (defined($sth)) {
-    dbg("bayes: set_last_expire: SQL Error: ".$self->{_dbh}->errstr());
-    return 0;
-  }
-
-  my $rc = $sth->execute($time, $self->{_username});
+  my $rows = $self->{_dbh}->do($sql,
+			       undef,
+			       $time,
+			       $self->{_userid});
 
-  unless ($rc) {
+  unless (defined($rows)) {
     dbg("bayes: set_last_expire: SQL Error: ".$self->{_dbh}->errstr());
     return 0;
   }
 
-  $sth->finish();
-
   return 1;
 }
 
@@ -620,7 +630,7 @@
 
   return 0 unless (defined($self->{_dbh}));
 
-  my $sql = "SELECT max(runtime) from bayes_expire WHERE username = ?";
+  my $sql = "SELECT max(runtime) from bayes_expire WHERE id = ?";
 
   my $sth = $self->{_dbh}->prepare_cached($sql);
 
@@ -629,7 +639,7 @@
     return 0;
   }
 
-  my $rc = $sth->execute($self->{_username});
+  my $rc = $sth->execute($self->{_userid});
 
   unless ($rc) {
     dbg("bayes: get_running_expire_tok: SQL Error: ".$self->{_dbh}->errstr());
@@ -657,19 +667,18 @@
 
   return 0 unless (defined($self->{_dbh}));
 
-  my $sql = "INSERT INTO bayes_expire (username,runtime) VALUES (?,?)";
-
-  my $sth = $self->{_dbh}->prepare_cached($sql);
+  my $sql = "INSERT INTO bayes_expire (id,runtime) VALUES (?,?)";
 
   my $time = time();
 
-  my $rc = $sth->execute($self->{_username}, $time);
-
-  unless ($rc) {
-      dbg("bayes: set_running_expire_tok: SQL Error: ".$self->{_dbh}->errstr());
-      return undef;
+  my $rows = $self->{_dbh}->do($sql,
+			       undef,
+			       $self->{_userid}, $time);
+  unless (defined($rows)) {
+    dbg("bayes: set_running_expire_tok: SQL Error: ".$self->{_dbh}->errstr());
+    return undef;
   }
-  $sth->finish();
+
   return $time;
 }
 
@@ -688,17 +697,19 @@
 
   return 0 unless (defined($self->{_dbh}));
 
-  my $sql = "DELETE from bayes_expire WHERE username = ?";
+  my $sql = "DELETE from bayes_expire
+              WHERE id = ?";
 
-  my $rows = $self->{_dbh}->do($sql, undef, $self->{_username});
+  my $rows = $self->{_dbh}->do($sql, undef, $self->{_userid});
 
-  if (!defined($rows)) {
+  unless (defined($rows)) {
     dbg("bayes: remove_running_expire_tok: SQL Error: ".$self->{_dbh}->errstr());
     return 0;
   }
 
   return 1;
 }
+
 =head2 tok_get
 
 public instance (Integer, Integer, Integer) tok_get (String $token)
@@ -716,7 +727,7 @@
 
   my $sql = "SELECT spam_count, ham_count, atime
                FROM bayes_token
-              WHERE username = ?
+              WHERE id = ?
                 AND token = ?";
 
   my $sth = $self->{_dbh}->prepare_cached($sql);
@@ -726,7 +737,7 @@
     return (0,0,0);
   }
 
-  my $rc = $sth->execute($self->{_username}, $token);
+  my $rc = $sth->execute($self->{_userid}, $token);
 
   unless ($rc) {
     dbg("bayes: tok_get: SQL Error: ".$self->{_dbh}->errstr());
@@ -744,6 +755,117 @@
   return ($spam_count, $ham_count, $atime)
 }
 
+=head2 tok_get_all
+
+public instance (\@) tok_get_all (@ $tokens)
+
+Description:
+This method retrieves the specified tokens (C<$tokens>) from storage and returns
+an array ref of arrays, each holding the token, spam count, ham count and atime.
+
+=cut
+
+sub tok_get_all {
+  my ($self, @tokens) = @_;
+
+  # Returns an array ref of [token, spam_count, ham_count, atime] rows, one
+  # per token found for the current user; tokens not in storage are simply
+  # absent.  Any SQL error is logged and yields an empty array ref.
+  return [] unless (defined($self->{_dbh}));
+
+  my $token_list_size = scalar(@tokens);
+  dbg("bayes: tok_get_all: Token Count: $token_list_size");
+  my @tok_results;
+
+  my @bunch_sizes = (100, 50, 25, 5); # XXX - need to benchmark to tweak
+  my $search_index = 0;
+
+  my $multi_sql = "SELECT token, spam_count, ham_count, atime
+                     FROM bayes_token
+                    WHERE id = ?
+                      AND token IN ";
+
+  my $single_sql = "SELECT token, spam_count, ham_count, atime
+                      FROM bayes_token
+                     WHERE id = ?
+                       AND token = ?";
+
+  # Clamp missing/negative counts and a missing atime to 0, in place.
+  my $sanitize = sub {
+    my ($row) = @_;
+    $row->[1] = 0 if (!$row->[1] || $row->[1] < 0);
+    $row->[2] = 0 if (!$row->[2] || $row->[2] < 0);
+    $row->[3] = 0 if (!$row->[3]);
+    return $row;
+  };
+
+  # Fetch as many tokens as possible with IN (...) lists, largest bunch first.
+  foreach my $bunch_size (@bunch_sizes) {
+    # the statement text only depends on the bunch size, so prepare it once
+    # per size, and only when at least one full bunch is still outstanding
+    next if ($token_list_size - $search_index < $bunch_size);
+
+    my $in_str = '(' . join(',', ('?') x $bunch_size) . ')';
+    my $sth = $self->{_dbh}->prepare($multi_sql . $in_str);
+
+    unless (defined($sth)) {
+      dbg("bayes: tok_get_all: SQL Error: ".$self->{_dbh}->errstr());
+      return [];
+    }
+
+    while ($token_list_size - $search_index >= $bunch_size) {
+      my $bunch_end = $search_index + $bunch_size - 1;
+      my @bindings = @tokens[$search_index .. $bunch_end];
+      $search_index = $bunch_end + 1;
+
+      my $rc = $sth->execute($self->{_userid}, @bindings);
+
+      unless ($rc) {
+        dbg("bayes: tok_get_all: SQL Error: ".$self->{_dbh}->errstr());
+        return [];
+      }
+
+      # fetchall_arrayref hands back a fresh array for every row
+      my $results = $sth->fetchall_arrayref();
+
+      $sth->finish();
+
+      foreach my $result (@{$results}) {
+        push(@tok_results, $sanitize->($result));
+      }
+    }
+  }
+
+  # Whatever is left (fewer than the smallest bunch) is fetched one by one.
+  if ($search_index < $token_list_size) {
+    my $sth = $self->{_dbh}->prepare($single_sql);
+
+    unless (defined($sth)) {
+      dbg("bayes: tok_get_all: SQL Error: ".$self->{_dbh}->errstr());
+      return [];
+    }
+
+    while ($search_index < $token_list_size) {
+      my $rc = $sth->execute($self->{_userid}, $tokens[$search_index++]);
+
+      unless ($rc) {
+        dbg("bayes: tok_get_all: SQL Error: ".$self->{_dbh}->errstr());
+        return [];
+      }
+
+      # copy the row: the handle is reused, and DBI recycles the array that
+      # fetchrow_arrayref returns on every fetch from the same handle
+      my @row = $sth->fetchrow_array();
+
+      $sth->finish();
+
+      push(@tok_results, $sanitize->(\@row)) if (@row);
+    }
+  }
+
+  return \@tok_results;
+}
+
 =head2 tok_count_change
 
 public instance (Boolean) tok_count_change (Integer $spam_count,
@@ -780,27 +902,9 @@
 
   return (0,0) unless (defined($self->{_dbh}));
 
-  my $sql = "SELECT ham_count, spam_count FROM bayes_vars WHERE username = ?";
-
-  my $sth = $self->{_dbh}->prepare_cached($sql);
-
-  unless (defined($sth)) {
-    dbg("bayes: nspam_nham_get: SQL Error: ".$self->{_dbh}->errstr());
-    return (0,0);
-  }
-
-  my $rc = $sth->execute($self->{_username});
-
-  unless ($rc) {
-    dbg("bayes: nspam_nham_get: SQL Error: ".$self->{_dbh}->errstr());
-    return (0,0);
-  }
-
-  my ($ham_count, $spam_count) = $sth->fetchrow_array();
+  my @vars = $self->get_storage_variables();
 
-  $sth->finish();
-  
-  return ($spam_count || 0, $ham_count || 0);
+  return ($vars[1] || 0, $vars[2] || 0);
 }
 
 =head2 nspam_nham_change
@@ -821,24 +925,17 @@
   my $sql = "UPDATE bayes_vars
                 SET spam_count = spam_count + ?,
                     ham_count = ham_count + ?
-              WHERE username = ?";
-
-  my $sth = $self->{_dbh}->prepare_cached($sql);
+              WHERE id = ?";
 
-  unless (defined($sth)) {
-    dbg("bayes: nspam_nham_change: SQL Error: ".$self->{_dbh}->errstr());
-    return 0;
-  }
-
-  my $rc = $sth->execute($num_spam, $num_ham, $self->{_username});
+  my $rows = $self->{_dbh}->do($sql,
+			       undef,
+			       $num_spam, $num_ham, $self->{_userid});
 
-  unless ($rc) {
+  unless (defined($rows)) {
     dbg("bayes: nspam_nham_change: SQL Error: ".$self->{_dbh}->errstr());
     return 0;
   }
 
-  $sth->finish();
-
   return 1;
 }
 
@@ -863,29 +960,41 @@
   # what we are updating to
   my $sql = "UPDATE bayes_token
                 SET atime = ?
-              WHERE username = ?
+              WHERE id = ?
                 AND token = ?
                 AND atime < ?";
 
-  my $sth = $self->{_dbh}->prepare_cached($sql);
+  my $rows = $self->{_dbh}->do($sql, undef, $atime, $self->{_userid},
+			       $token, $atime);
 
-  unless (defined($sth)) {
+  unless (defined($rows)) {
     dbg("bayes: tok_touch: SQL Error: ".$self->{_dbh}->errstr());
     return 0;
   }
 
-  my $rc = $sth->execute($atime, $self->{_username}, $token, $atime);
+  # if we didn't update a row then no need to update newest_token_age
+  return 1 if ($rows eq '0E0');
 
-  unless ($rc) {
+  # need to check newest_token_age
+  # no need to check oldest_token_age since we would only update if the
+  # atime was newer than what is in the database
+  $sql = "UPDATE bayes_vars
+             SET newest_token_age = ?
+           WHERE id = ?
+             AND newest_token_age < ?";
+
+  $rows = $self->{_dbh}->do($sql, undef, $atime, $self->{_userid}, $atime);
+
+  unless (defined($rows)) {
     dbg("bayes: tok_touch: SQL Error: ".$self->{_dbh}->errstr());
     return 0;
   }
 
-  $sth->finish();
-
   return 1;
 }
 
+=cut
+
 =head2 cleanup
 
 public instance (Boolean) cleanup ()
@@ -899,8 +1008,37 @@
 sub cleanup {
   my ($self) = @_;
 
-  # Not used for this implementation
-	       
+  # Purge tokens whose counts both dropped to 0; returns 1 on success or when
+  # there is nothing to do, 0 on an SQL error or a missing database handle.
+  return 1 unless ($self->{needs_cleanup});
+  return 0 unless (defined($self->{_dbh}));
+
+  # cleanup was needed, go ahead and clear the cleanup flag
+  $self->{needs_cleanup} = 0;
+
+  my $sql = "DELETE from bayes_token
+              WHERE id = ?
+                AND spam_count = 0
+                AND ham_count = 0";
+
+  my $toks_deleted = $self->{_dbh}->do($sql, undef, $self->{_userid});
+  unless (defined($toks_deleted)) {
+    dbg("bayes: cleanup: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
+  # nothing to adjust if no rows went away ('0E0') or the driver could not
+  # report a row count (-1)
+  return 1 unless ($toks_deleted > 0);
+
+  $sql = "UPDATE bayes_vars SET token_count = token_count - ?
+           WHERE id = ?";
+  my $rows = $self->{_dbh}->do($sql, undef, $toks_deleted, $self->{_userid});
+  unless (defined($rows)) {
+    dbg("bayes: cleanup: SQL Error: ".$self->{_dbh}->errstr());
+    return 0;
+  }
+
   return 1;
 }
 
@@ -965,25 +1103,25 @@
 
   return 0 unless (defined($self->{_dbh}));
 
-  my $rows = $self->{_dbh}->do("DELETE FROM bayes_vars WHERE username = ?",
+  my $rows = $self->{_dbh}->do("DELETE FROM bayes_vars WHERE id = ?",
 			       undef,
-			       $self->{_username});
+			       $self->{_userid});
   unless (defined($rows)) {
     dbg("SQL Error removing user (bayes_vars) data: ".$self->{_dbh}->errstr());
     return 0;
   }
 
-  $rows = $self->{_dbh}->do("DELETE FROM bayes_seen WHERE username = ?",
+  $rows = $self->{_dbh}->do("DELETE FROM bayes_seen WHERE id = ?",
 			    undef,
-			    $self->{_username});
+			    $self->{_userid});
   unless (defined($rows)) {
     dbg("SQL Error removing seen data: ".$self->{_dbh}->errstr());
     return 0;
   }
 
-  $rows = $self->{_dbh}->do("DELETE FROM bayes_token WHERE username = ?",
+  $rows = $self->{_dbh}->do("DELETE FROM bayes_token WHERE id = ?",
 			    undef,
-			    $self->{_username});
+			    $self->{_userid});
   unless (defined($rows)) {
     dbg("SQL Error removing token data: ".$self->{_dbh}->errstr());
     return 0;
@@ -1016,13 +1154,12 @@
 
   my $token_sql = "SELECT spam_count, ham_count, atime, token
                      FROM bayes_token
-                    WHERE username = ?
-                      AND (spam_count > 0 OR ham_count > 0)
-                    ORDER BY token";
+                    WHERE id = ?
+                      AND (spam_count > 0 OR ham_count > 0)";
 
   my $seen_sql = "SELECT flag, msgid
                     FROM bayes_seen
-                   WHERE username = ?";
+                   WHERE id = ?";
 
   my $sth = $self->{_dbh}->prepare($token_sql);
 
@@ -1038,8 +1175,9 @@
     return 0;
   }
 
-  while (my @values = $sth->fetchrow_array()) {
-    print "t\t" . join("\t",@values) . "\n";
+  while (my ($token, $spam_count, $ham_count, $atime) = $sth->fetchrow_array()) {
+    $token = pack("H*", $token);
+    print "t\t$spam_count, $ham_count, $atime, $token\n";
   }
 
   $sth->finish();
@@ -1104,6 +1242,11 @@
     return 0;
   }
 
+  unless ($self->_initialize_db()) {
+    dbg("bayes: Unable to re-initialize database for ".$self->{_username});
+    return 0;
+  }
+
   my $token_count = 0;
   my $db_version;
   my $num_spam = 0;
@@ -1113,8 +1256,9 @@
 
   my $line = <DUMPFILE>;
   $line_count++;
-  # We require the database version line to be the first in the file so we can figure out how
-  # to properly deal with the file.  If it is not the first line then fail
+  # We require the database version line to be the first in the file so we can
+  # figure out how to properly deal with the file.  If it is not the first
+  # line then fail
   if ($line =~ m/^v\s+(\d+)\s+db_version/) {
     $db_version = $1;
   }
@@ -1123,20 +1267,8 @@
     return 0;
   }
 
-  my $tokensql = "INSERT INTO bayes_token
-                    (username, token, spam_count, ham_count, atime)
-                  VALUES (?,?,?,?,?)";
-
-  my $tokensth = $self->{_dbh}->prepare_cached($tokensql);
-
-  my $seensql = "INSERT INTO bayes_seen (username, msgid, flag)
-                   VALUES (?, ?, ?)";
-
-  my $seensth = $self->{_dbh}->prepare_cached($seensql);
-
-  unless (defined($seensth)) {
-    dbg("SQL Error: ".$self->{_dbh}->errstr());
-    dbg("bayes: Database now in inconsistent state for ".$self->{_username});
+  unless ($db_version == 2 || $db_version == 3) {
+    dbg("bayes: Database Version $db_version is unsupported, must be version 2 or 3.");
     return 0;
   }
 
@@ -1197,13 +1329,17 @@
 	dbg("bayes: Token ($token) has the following warnings:\n".join("\n",@warnings));
       }
 
-      my $rc = $tokensth->execute($self->{_username},
-				  $token,
-				  $spam_count,
-				  $ham_count,
-				  $atime);
-      unless ($rc) {
-	dbg("bayes: Error inserting token for line: $line\nSQL Error: ".$self->errstr());
+      if ($db_version < 3) {
+	# versions < 3 use plain text tokens, so we need to convert to hash
+	$token = substr(sha1($token), -5);
+      }
+      else {
+	# turn unpacked binary token back into binary value
+	$token = pack("H*",$token);
+      }
+
+      unless ($self->_put_token($token, $spam_count, $ham_count, $atime)) {
+	dbg("bayes: Error inserting token for line: $line");
 	$error_p = 1;
       }
       $token_count++;
@@ -1223,11 +1359,8 @@
 	next;
       }
 
-      my $rc = $seensth->execute($self->{_username},
-				 $msgid,
-				 $flag);
-      unless ($rc) {
-	dbg("bayes: Error inserting msgid in seen table for line: $line\nSQL Error: ".$self->errstr());
+      unless ($self->seen_put($msgid, $flag)) {
+	dbg("bayes: Error inserting msgid in seen table for line: $line");
 	$error_p = 1;
       }
     }
@@ -1259,21 +1392,8 @@
     return 0;
   }
 
-  # There is a race condition here which is why we suggest that the user
-  # turn off SA for the duration of a restore operation.  If something comes
-  # along and calls initialize_db() before this little bit of code runs then
-  # this insert will fail, but at least we'll now wipe out the bayes_token
-  # entries for this user so that we are in a somewhat ok state.
-  my $varsupdatesql = "INSERT INTO bayes_vars (username, spam_count, ham_count)
-                       VALUES(?,?,?)";
-  
-  my $rows = $self->{_dbh}->do($varsupdatesql,
-			       undef,
-			       $self->{_username}, $num_spam, $num_ham);
-  
-  unless (defined($rows)) {
-    dbg("bayes: Error inserting user variables (bayes_vars).");
-    dbg("bayes: SQL Error:".$self->{_dbh}->errstr());
+  unless ($self->nspam_nham_change($num_spam, $num_ham)) {
+    dbg("bayes: Error updating num spam and num ham.");
     $self->clear_database();
     dbg("bayes; Database now in inconsistent state for ".$self->{_username});
     return 0;
@@ -1305,6 +1425,8 @@
 
   return 0 unless (defined($self->{_dbh}));
 
+  return ($self->{_db_version_cache}) if (defined($self->{_db_version_cache}));
+
   my $sql = "SELECT value FROM bayes_global_vars WHERE variable = 'VERSION'";
 
   my $sth = $self->{_dbh}->prepare_cached($sql);
@@ -1325,6 +1447,8 @@
 
   $sth->finish();
 
+  $self->{_db_version_cache} = $version;
+
   return $version;
 }
  
@@ -1345,130 +1469,63 @@
 
   return 0 if (!$self->{_username});
 
-  my $sql = "SELECT count(*) FROM bayes_vars WHERE username = ?";
+  my $sqlselect = "SELECT id FROM bayes_vars WHERE username = ?";
 
-  my $sth = $self->{_dbh}->prepare_cached($sql);
+  my $sthselect = $self->{_dbh}->prepare_cached($sqlselect);
 
-  unless (defined($sth)) {
+  unless (defined($sthselect)) {
     dbg("bayes: _initialize_db: SQL Error: ".$self->{_dbh}->errstr());
     return 0;
   }
 
-  my $rc = $sth->execute($self->{_username});
+  my $rc = $sthselect->execute($self->{_username});
 
   unless ($rc) {
     dbg("bayes: _initialize_db: SQL Error: ".$self->{_dbh}->errstr());
     return 0;
   }
 
-  my ($count) = $sth->fetchrow_array();
-
-  $sth->finish();
+  my ($id) = $sthselect->fetchrow_array();
 
-  if ($count) {
+  if ($id) {
+    $self->{_userid} = $id;
+    dbg("bayes: Using userid: ".$self->{_userid});
+    $sthselect->finish();
     return 1;
   }
 
   # For now let the database setup the other variables as defaults
-  $sql = "INSERT INTO bayes_vars (username) VALUES (?)";
-
-  $sth = $self->{_dbh}->prepare_cached($sql);
+  my $sqlinsert = "INSERT INTO bayes_vars (username) VALUES (?)";
 
-  unless (defined($sth)) {
+  my $rows = $self->{_dbh}->do($sqlinsert,
+			       undef,
+			       $self->{_username});
+  unless (defined($rows)) {
     dbg("bayes: _initialize_db: SQL Error: ".$self->{_dbh}->errstr());
     return 0;
   }
 
-  $rc = $sth->execute($self->{_username});
+  # Now we need to figure out what id we inserted them at, in a perfect
+  # world the database driver would handle this for us (ie mysql_insert_id)
+  # but this is far from a perfect world, however since in theory we only
+  # ever do this once it's ok to take the hit
+  $rc = $sthselect->execute($self->{_username});
 
   unless ($rc) {
     dbg("bayes: _initialize_db: SQL Error: ".$self->{_dbh}->errstr());
     return 0;
   }
 
-  $sth->finish();
-
-  return 1;
-}
-
-=head2 _token_atime
-
-private instance (Boolean) _token_atime (String $token)
-
-Description:
-This method returns a given tokens atime, it also serves to tell us
-if the token exists or not since the atime will be undefined if it
-does not exist.
-
-=cut
-
-sub _token_atime {
-  my ($self, $token) = @_;
-
-  return 0 unless (defined($self->{_dbh}));
-
-  return undef unless (defined($token));
-
-  my $sql = "SELECT atime
-               FROM bayes_token
-              WHERE username = ?
-                AND token = ?";
-
-  my $sth = $self->{_dbh}->prepare_cached($sql);
-
-  unless (defined($sth)) {
-    dbg("bayes: _token_atime: SQL Error: ".$self->{_dbh}->errstr());
-    return undef;
-  }
-
-  my $rc = $sth->execute($self->{_username}, $token);
-
-  unless ($rc) {
-    dbg("bayes: _token_atime: SQL Error: ".$self->{_dbh}->errstr());
-    return undef;
-  }
-
-  my ($token_atime) = $sth->fetchrow_array();
-
-  $sth->finish();
-
-  return $token_atime;
-}
-
-=head2 _delete_token
-
-private instance (Boolean) _delete_token (String $token)
-
-Description:
-This method deletes the given token from the database.
-
-=cut
-
-sub _delete_token {
-  my ($self, $token) = @_;
-
-  return 0 unless (defined($self->{_dbh}));
-
-  return 0 unless (defined($token));
-
-  my $sql = "DELETE FROM bayes_token WHERE username = ? AND token = ?";
-
-  my $sth = $self->{_dbh}->prepare_cached($sql);
-
-  unless (defined($sth)) {
-    dbg("bayes: _delete_token: SQL Error: ".$self->{_dbh}->errstr());
-    return 0;
-  }
+  ($id) = $sthselect->fetchrow_array();
 
-  my $rc = $sth->execute($self->{_username}, $token);
+  $sthselect->finish();
 
-  unless ($rc) {
-    dbg("bayes: _delete_token: SQL Error: ".$self->{_dbh}->errstr());
-    return 0;
+  if ($id) {
+    $self->{_userid} = $id;
+    dbg("bayes: Using userid: ".$self->{_userid});
+    return 1;
   }
 
-  $sth->finish();
-
   return 1;
 }
 
@@ -1493,20 +1550,22 @@
   $spam_count ||= 0;
   $ham_count ||= 0;
 
-  my $existing_atime = $self->_token_atime($token);
+  my ($existing_spam_count,
+      $existing_ham_count,
+      $existing_atime) = $self->tok_get($token);
 
   if ($spam_count == 0 && $ham_count == 0) {
     return 1;
   }
 
-  if (!defined($existing_atime)) {
+  if (!$existing_atime) {
 
     # You can't create a new entry for a token with a negative count, so just return
     # if we are unable to find an entry.
     return 1 if ($spam_count < 0 || $ham_count < 0);
 
     my $sql = "INSERT INTO bayes_token
-               (username, token, spam_count, ham_count, atime)
+               (id, token, spam_count, ham_count, atime)
                VALUES (?,?,?,?,?)";
 
     my $sth = $self->{_dbh}->prepare_cached($sql);
@@ -1516,7 +1575,7 @@
       return 0;
     }
 
-    my $rc = $sth->execute($self->{_username},
+    my $rc = $sth->execute($self->{_userid},
 			   $token,
 			   $spam_count,
 			   $ham_count,
@@ -1528,14 +1587,66 @@
     }
 
     $sth->finish();
-    dbg("bayes: new token ($token) inserted");
+
+    $sql = "UPDATE bayes_vars SET token_count = token_count + 1
+             WHERE id = ?";
+
+    my $rows = $self->{_dbh}->do($sql, undef, $self->{_userid});
+    
+    unless (defined($rows)) {
+      dbg("bayes: _put_token: SQL Error: ".$self->{_dbh}->errstr());
+      return 0;
+    }
+
+    $sql = "UPDATE bayes_vars SET newest_token_age = ?
+             WHERE id = ? AND newest_token_age < ?";
+
+    $rows = $self->{_dbh}->do($sql, undef, $atime, $self->{_userid}, $atime);
+
+    unless (defined($rows)) {
+      dbg("bayes: _put_token: SQL Error: ".$self->{_dbh}->errstr());
+      return 0;
+    }
+
+    if ($rows eq '0E0') {
+      # no need to update oldest_token_age if we updated newest_token_age
+      
+      $sql = "UPDATE bayes_vars SET oldest_token_age = ?
+               WHERE id = ? AND oldest_token_age > ?";
+
+      $rows = $self->{_dbh}->do($sql, undef, $atime, $self->{_userid}, $atime);
+      
+      unless (defined($rows)) {
+	dbg("bayes: _put_token: SQL Error: ".$self->{_dbh}->errstr());
+	return 0;
+      }
+    }
   }
   else {
+
+    if ($spam_count < 0 || $ham_count < 0) {
+      # we only need to cleanup when we subtract counts for a token and the
+      # counts may have both reached 0
+      # XXX - future optimization, since we have the existing spam/ham counts
+      # we can make an educated guess on if the count would reach 0, for
+      # instance, if we are decreasing spam_count but spam_count is currently
+      # > 1000, then there is no possible way this update or any others that
+      # might currently be happening could reduce that value to 0, so there
+      # would be no need to set the needs_cleanup flag
+      $self->{needs_cleanup} = 1;
+    }
+
     my $update_atime_p = 1;
+    my $updated_atime_p = 0;
 
-    # if the existing atime is already >= the one we are going to set, then don't bother
+    # if the existing atime is already >= the one we are going to set, then
+    # don't bother
     $update_atime_p = 0 if ($existing_atime >= $atime);
 
+    # These SQL statements include as part of the WHERE clause something like
+    # "AND spam_count + ? >= 0" or "AND ham_count + ? >= 0".  This is to keep
+    # the count from going negative.
+
     if ($spam_count) {
       my $sql;
       my @args;
@@ -1543,19 +1654,19 @@
 	$sql = "UPDATE bayes_token
                    SET spam_count = spam_count + ?,
                        atime = ?
-                 WHERE username = ?
+                 WHERE id = ?
                    AND token = ?
                    AND spam_count + ? >= 0";
-	@args = ($spam_count, $atime, $self->{_username}, $token, $spam_count);
-	$update_atime_p = 0;
+	@args = ($spam_count, $atime, $self->{_userid}, $token, $spam_count);
+	$updated_atime_p = 1; # note the fact that we did do it
       }
       else {
 	$sql = "UPDATE bayes_token
                    SET spam_count = spam_count + ?
-                 WHERE username = ?
+                 WHERE id = ?
                    AND token = ?
                    AND spam_count + ? >= 0";
-	@args = ($spam_count, $self->{_username}, $token, $spam_count);
+	@args = ($spam_count, $self->{_userid}, $token, $spam_count);
       }
 
       my $rows = $self->{_dbh}->do($sql, undef, @args);
@@ -1569,22 +1680,22 @@
     if ($ham_count) {
       my $sql;
       my @args;
-      if ($update_atime_p) {
+      if ($update_atime_p && !$updated_atime_p) {
 	$sql = "UPDATE bayes_token
                    SET ham_count = ham_count + ?,
                        atime = ?
-                 WHERE username = ?
+                 WHERE id = ?
                    AND token = ?
                    AND ham_count + ? >= 0";
-	@args = ($ham_count, $atime, $self->{_username}, $token, $ham_count);
+	@args = ($ham_count, $atime, $self->{_userid}, $token, $ham_count);
       }
       else {
 	$sql = "UPDATE bayes_token
                    SET ham_count = ham_count + ?
-                 WHERE username = ?
+                 WHERE id = ?
                    AND token = ?
                    AND ham_count + ? >= 0";
-	@args = ($ham_count, $self->{_username}, $token, $ham_count);
+	@args = ($ham_count, $self->{_userid}, $token, $ham_count);
       }
 
       my $rows = $self->{_dbh}->do($sql, undef, @args);
@@ -1595,50 +1706,23 @@
       }
     }
 
-    dbg("bayes: token ($token) updated");
-  }
-  return 1;
-}
-
-=head2 _get_token_count
-
-private instance (Integer) _get_token_count ()
-
-Description:
-This method returns the total number of tokens present in the token database
-for a user.
-
-=cut
-
-sub _get_token_count {
-  my ($self) = @_;
-
-  return 0 unless (defined($self->{_dbh}));
-
-  my $sql = "SELECT count(*)
-               FROM bayes_token
-              WHERE username = ?
-                AND (spam_count > 0 OR ham_count > 0)";
-
-  my $sth = $self->{_dbh}->prepare_cached($sql);
-
-  unless (defined($sth)) {
-    dbg("bayes: _get_token_count: SQL Error: ".$self->{_dbh}->errstr());
-    return 0;
-  }
+    if ($updated_atime_p) {
+      # we updated the atime, so we need to check and update bayes_vars
+      # we only need to worry about newest_token_age since we would have
+      # only updated the atime if it was > the previous value
+      my $sql = "UPDATE bayes_vars SET newest_token_age = ?
+                  WHERE id = ? AND newest_token_age < ?";
 
-  my $rc = $sth->execute($self->{_username});
+      my $rows = $self->{_dbh}->do($sql, undef, $atime, $self->{_userid}, $atime);
 
-  unless (defined($sth)) {
-    dbg("bayes: _get_token_count: SQL Error: ".$self->{_dbh}->errstr());
-    return 0;
+      unless (defined($rows)) {
+	dbg("bayes: _put_token: SQL Error: ".$self->{_dbh}->errstr());
+	return 0;
+      }
+    }
   }
 
-  my ($token_count) = $sth->fetchrow_array();
-
-  $sth->finish();
-
-  return $token_count
+  return 1;
 }
 
 =head2 _get_oldest_token_age
@@ -1648,6 +1732,10 @@
 Description:
 This method finds the atime of the oldest token in the database.
 
+The use of min(atime) in the SQL is ugly but really the most efficient
+way of getting the oldest_token_age after we've done a mass expire.  It should
+only be called at expire time.
+
 =cut
 
 sub _get_oldest_token_age {
@@ -1655,7 +1743,8 @@
 
   return 0 unless (defined($self->{_dbh}));
 
-  my $sql = "SELECT min(atime) FROM bayes_token WHERE username = ?";
+  my $sql = "SELECT min(atime) FROM bayes_token
+              WHERE id = ?";
 
   my $sth = $self->{_dbh}->prepare_cached($sql);
 
@@ -1664,7 +1753,7 @@
     return 0;
   }
 
-  my $rc = $sth->execute($self->{_username});
+  my $rc = $sth->execute($self->{_userid});
 
   unless ($rc) {
     dbg("bayes: _get_oldest_token_age: SQL Error: ".$self->{_dbh}->errstr());
@@ -1678,116 +1767,6 @@
   return $atime;
 }
 
-=head2 _get_newest_token_age
-
-private instance (Integer) _get_newest_token_age ()
-
-Description:
-This method finds the atime of the newest token in the database.
-
-=cut
-
-sub _get_newest_token_age {
-  my ($self) = @_;
-
-  return 0 unless (defined($self->{_dbh}));
-
-  my $sql = "SELECT max(atime) FROM bayes_token WHERE username = ?";
-
-  my $sth = $self->{_dbh}->prepare_cached($sql);
-
-  unless (defined($sth)) {
-    dbg("bayes: _get_newest_token_age: SQL Error: ".$self->{_dbh}->errstr());
-    return 0;
-  }
-
-  my $rc = $sth->execute($self->{_username});
-
-  unless ($rc) {
-    dbg("bayes: _get_newest_token_age: SQL Error: ".$self->{_dbh}->errstr());
-    return 0;
-  }
-
-  my ($atime) = $sth->fetchrow_array();
-
-  $sth->finish();
-
-  return $atime;
-}
-
-=head2 _set_last_atime_delta
-
-private instance (Boolean) _set_last_atime_delta (Integer $newdelta)
-
-Description:
-This method sets the last_atime_delta variable in the variable table.
-
-=cut
-
-sub _set_last_atime_delta {
-  my ($self, $newdelta) = @_;
-
-  return 0 unless (defined($self->{_dbh}));
-
-  return 0 unless (defined($newdelta));
-
-  my $sql = "UPDATE bayes_vars SET last_atime_delta = ? WHERE username = ?";
-
-  my $sth = $self->{_dbh}->prepare_cached($sql);
-
-  unless (defined($sth)) {
-    dbg("bayes: _set_last_atime_delta: SQL Error: ".$self->{_dbh}->errstr());
-    return 0;
-  }
-
-  my $rc = $sth->execute($newdelta, $self->{_username});
-
-  unless ($rc) {
-    dbg("bayes: _set_last_atime_delta: SQL Error: ".$self->{_dbh}->errstr());
-    return 0;
-  }
-
-  $sth->finish();
-
-  return 1;
-}
-
-=head2 _set_last_expire_reduce
-
-private instance (Boolean) _set_last_expire_reduce (Integer $deleted)
-
-Description:
-This method sets the last_expire_reduce values in the variable table.
-
-=cut
-
-sub _set_last_expire_reduce {
-  my ($self, $deleted) = @_;
-
-  return 0 unless (defined($self->{_dbh}));
-
-  return 0 unless (defined($deleted));
-
-  my $sql = "UPDATE bayes_vars SET last_expire_reduce = ? WHERE username = ?";
-
-  my $sth = $self->{_dbh}->prepare_cached($sql);
-
-  unless (defined($sth)) {
-    dbg("bayes: _set_last_expire_reduce: SQL Error: ".$self->{_dbh}->errstr());
-    return 0;
-  }
-
-  my $rc = $sth->execute($deleted, $self->{_username});
-
-  unless ($rc) {
-    dbg("bayes: _set_last_expire_reduce: SQL Error: ".$self->{_dbh}->errstr());
-    return 0;
-  }
-
-  $sth->finish();
-
-  return 1;
-}
 
 =head2 _get_num_hapaxes
 
@@ -1806,7 +1785,7 @@
 
   my $sql = "SELECT count(*)
                FROM bayes_token
-              WHERE username = ?
+              WHERE id = ?
                 AND spam_count + ham_count = 1";
 
   my $sth = $self->{_dbh}->prepare_cached($sql);
@@ -1816,7 +1795,7 @@
     return 0;
   }
 
-  my $rc = $sth->execute($self->{_username});
+  my $rc = $sth->execute($self->{_userid});
 
   unless ($rc) {
     dbg("bayes: _get_num_hapaxes: SQL Error: ".$self->{_dbh}->errstr());
@@ -1848,7 +1827,7 @@
 
   my $sql = "SELECT count(*)
                FROM bayes_token
-              WHERE username = ? 
+              WHERE id = ?
                 AND (spam_count >= 0 AND spam_count < 8)
                 AND (ham_count >= 0 AND ham_count < 8)
                 AND spam_count + ham_count != 1";
@@ -1860,7 +1839,7 @@
     return 0;
   }
 
-  my $rc = $sth->execute($self->{_username});
+  my $rc = $sth->execute($self->{_userid});
 
   unless ($rc) {
     dbg("bayes: _get_num_lowfreq: SQL Error: ".$self->{_dbh}->errstr());

Modified: incubator/spamassassin/trunk/sql/README.bayes
==============================================================================
--- incubator/spamassassin/trunk/sql/README.bayes	(original)
+++ incubator/spamassassin/trunk/sql/README.bayes	Wed Apr 28 18:02:59 2004
@@ -28,11 +28,11 @@
 be used to create the connection to your SQL server.  It MUST be in
 the format as listed above.  <driver> should be the DBD driver that
 you have installed to access your database (initially tested with
-MySQL, PostgreSQL, SQLite, and DBD::CSV).  <database> must be the name
-of the database that you created to store the bayes data
-tables. <hostname> is the name of the host that contains the SQL
-database  server.  <port> is the optional port number where your
-database server is listening.
+MySQL, PostgreSQL, and SQLite).  <database> must be the name of the
+database that you created to store the bayes data tables. <hostname>
+is the name of the host that contains the SQL database  server.
+<port> is the optional port number where your database server is
+listening.
 
 For an example of connection to PostgreSQL, see the main README file.
 

Modified: incubator/spamassassin/trunk/sql/bayes_mysql.sql
==============================================================================
--- incubator/spamassassin/trunk/sql/bayes_mysql.sql	(original)
+++ incubator/spamassassin/trunk/sql/bayes_mysql.sql	Wed Apr 28 18:02:59 2004
@@ -1,8 +1,8 @@
 
 CREATE TABLE bayes_expire (
-  username varchar(200) NOT NULL default '',
+  id int(11) NOT NULL default '0',
   runtime int(11) NOT NULL default '0',
-  KEY bayes_expire_idx1 (username)
+  KEY bayes_expire_idx1 (id)
 ) TYPE=MyISAM;
 
 CREATE TABLE bayes_global_vars (
@@ -11,31 +11,35 @@
   PRIMARY KEY  (variable)
 ) TYPE=MyISAM;
 
-INSERT INTO bayes_global_vars VALUES ('VERSION','2');
+INSERT INTO bayes_global_vars VALUES ('VERSION','3');
 
 CREATE TABLE bayes_seen (
-  username varchar(200) NOT NULL default '',
+  id int(11) NOT NULL default '0', -- bayes_vars.id of the owning user
   msgid varchar(200) binary NOT NULL default '',
   flag char(1) NOT NULL default '',
-  PRIMARY KEY  (username,msgid),
-  KEY bayes_seen_idx1 (username,flag)
+  PRIMARY KEY  (id,msgid)
 ) TYPE=MyISAM;
 
 CREATE TABLE bayes_token (
-  username varchar(200) NOT NULL default '',
-  token varchar(200) binary NOT NULL default '',
+  id int(11) NOT NULL default '0',
+  token char(5) binary NOT NULL default '', -- raw hash bytes: must compare byte-wise, not case-insensitively
   spam_count int(11) NOT NULL default '0',
   ham_count int(11) NOT NULL default '0',
   atime int(11) NOT NULL default '0',
-  PRIMARY KEY  (username,token)
+  PRIMARY KEY  (id, token)
 ) TYPE=MyISAM;
 
 CREATE TABLE bayes_vars (
+  id int(11) NOT NULL AUTO_INCREMENT,
   username varchar(200) NOT NULL default '',
   spam_count int(11) NOT NULL default '0',
   ham_count int(11) NOT NULL default '0',
+  token_count int(11) NOT NULL default '0',
   last_expire int(11) NOT NULL default '0',
   last_atime_delta int(11) NOT NULL default '0',
   last_expire_reduce int(11) NOT NULL default '0',
-  PRIMARY KEY  (username)
+  oldest_token_age int(11) NOT NULL default '2147483647',
+  newest_token_age int(11) NOT NULL default '0',
+  PRIMARY KEY  (id),
+  UNIQUE bayes_vars_idx1 (username)
 ) TYPE=MyISAM;

Modified: incubator/spamassassin/trunk/sql/bayes_pg.sql
==============================================================================
--- incubator/spamassassin/trunk/sql/bayes_pg.sql	(original)
+++ incubator/spamassassin/trunk/sql/bayes_pg.sql	Wed Apr 28 18:02:59 2004
@@ -1,10 +1,10 @@
 
 CREATE TABLE bayes_expire (
-  username varchar(200) NOT NULL default '',
+  id integer NOT NULL default '0', -- bayes_vars.id; PostgreSQL has no int(11) display width
   runtime integer NOT NULL default '0'
 );
 
-CREATE INDEX bayes_expire_idx1 ON bayes_expire (username);
+CREATE INDEX bayes_expire_idx1 ON bayes_expire (id);
 
 CREATE TABLE bayes_global_vars (
   variable varchar(30) NOT NULL default '',
@@ -12,32 +12,36 @@
   PRIMARY KEY  (variable)
 );
 
-INSERT INTO bayes_global_vars VALUES ('VERSION','2');
+INSERT INTO bayes_global_vars VALUES ('VERSION','3');
 
 CREATE TABLE bayes_seen (
-  username varchar(200) NOT NULL default '',
+  id integer NOT NULL default '0', -- bayes_vars.id; PostgreSQL has no int(11) display width
   msgid varchar(200) NOT NULL default '',
   flag character(1) NOT NULL default '',
-  PRIMARY KEY  (username,msgid)
+  PRIMARY KEY  (id,msgid)
 );
 
-CREATE INDEX bayes_seen_idx1 ON bayes_seen (username, flag);
-
 CREATE TABLE bayes_token (
-  username varchar(200) NOT NULL default '',
-  token varchar(200) NOT NULL default '',
+  id integer NOT NULL default '0', -- bayes_vars.id; PostgreSQL has no int(11) display width
+  token char(5) NOT NULL default '', -- 5-byte hash token, same width as the MySQL schema
   spam_count integer NOT NULL default '0',
   ham_count integer NOT NULL default '0',
   atime integer NOT NULL default '0',
-  PRIMARY KEY  (username,token)
+  PRIMARY KEY  (id,token)
 );
 
 CREATE TABLE bayes_vars (
+  id serial NOT NULL,
   username varchar(200) NOT NULL default '',
   spam_count integer NOT NULL default '0',
   ham_count integer NOT NULL default '0',
+  token_count integer NOT NULL default '0',
   last_expire integer NOT NULL default '0',
   last_atime_delta integer NOT NULL default '0',
   last_expire_reduce integer NOT NULL default '0',
-  PRIMARY KEY  (username)
+  oldest_token_age integer NOT NULL default '2147483647',
+  newest_token_age integer NOT NULL default '0',
+  PRIMARY KEY  (id)
 );
+
+CREATE UNIQUE INDEX bayes_vars_idx1 ON bayes_vars (username); -- one bayes_vars row per username, as in the MySQL schema
\ No newline at end of file

Modified: incubator/spamassassin/trunk/t/bayesdbm.t
==============================================================================
--- incubator/spamassassin/trunk/t/bayesdbm.t	(original)
+++ incubator/spamassassin/trunk/t/bayesdbm.t	Wed Apr 28 18:02:59 2004
@@ -16,7 +16,7 @@
     unshift(@INC, '../blib/lib');
   }
 
-  plan tests => (HAS_DB_FILE ? 42 : 0);
+  plan tests => (HAS_DB_FILE ? 44 : 0);
 };
 
 exit unless HAS_DB_FILE;
@@ -55,9 +55,9 @@
 
 ok($body);
 
-my @toks = $sa->{bayes_scanner}->tokenize($mail, $body);
+my $toks = $sa->{bayes_scanner}->tokenize($mail, $body);
 
-ok(scalar(@toks) > 0);
+ok(scalar(keys %{$toks}) > 0);
 
 my($msgid,$msgid_hdr) = $sa->{bayes_scanner}->get_msgid($mail);
 
@@ -84,7 +84,7 @@
 ok($sa->{bayes_scanner}->{store}->tie_db_writable());
 
 my $tokerror = 0;
-foreach my $tok (@toks) {
+foreach my $tok (keys %{$toks}) {
   my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
   if ($spam == 0 || $ham > 0) {
     $tokerror = 1;
@@ -92,6 +92,19 @@
 }
 ok(!$tokerror);
 
+my $tokens = $sa->{bayes_scanner}->{store}->tok_get_all(keys %{$toks});
+
+ok($tokens);
+
+$tokerror = 0;
+foreach my $tok (@{$tokens}) {
+  my ($token, $tok_spam, $tok_ham, $atime) = @{$tok};
+  if ($tok_spam == 0 || $tok_ham > 0) {
+    $tokerror = 1;
+  }
+}
+ok(!$tokerror);
+
 $sa->{bayes_scanner}->{store}->untie_db();
 
 ok($sa->{bayes_scanner}->learn(0, $mail));
@@ -105,7 +118,7 @@
 ok($sa->{bayes_scanner}->{store}->tie_db_writable());
 
 $tokerror = 0;
-foreach my $tok (@toks) {
+foreach my $tok (keys %{$toks}) {
   my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
   if ($spam  > 0 || $ham == 0) {
     $tokerror = 1;

Modified: incubator/spamassassin/trunk/t/bayesdbm_flock.t
==============================================================================
--- incubator/spamassassin/trunk/t/bayesdbm_flock.t	(original)
+++ incubator/spamassassin/trunk/t/bayesdbm_flock.t	Wed Apr 28 18:02:59 2004
@@ -16,7 +16,7 @@
     unshift(@INC, '../blib/lib');
   }
 
-  plan tests => (HAS_DB_FILE ? 42 : 0);
+  plan tests => (HAS_DB_FILE ? 44 : 0);
 };
 
 exit unless HAS_DB_FILE;
@@ -56,9 +56,9 @@
 
 ok($body);
 
-my @toks = $sa->{bayes_scanner}->tokenize($mail, $body);
+my $toks = $sa->{bayes_scanner}->tokenize($mail, $body);
 
-ok(scalar(@toks) > 0);
+ok(scalar(keys %{$toks}) > 0);
 
 my($msgid,$msgid_hdr) = $sa->{bayes_scanner}->get_msgid($mail);
 
@@ -85,7 +85,7 @@
 ok($sa->{bayes_scanner}->{store}->tie_db_writable());
 
 my $tokerror = 0;
-foreach my $tok (@toks) {
+foreach my $tok (keys %{$toks}) {
   my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
   if ($spam == 0 || $ham > 0) {
     $tokerror = 1;
@@ -93,6 +93,19 @@
 }
 ok(!$tokerror);
 
+my $tokens = $sa->{bayes_scanner}->{store}->tok_get_all(keys %{$toks});
+
+ok($tokens);
+
+$tokerror = 0;
+foreach my $tok (@{$tokens}) {
+  my ($token, $tok_spam, $tok_ham, $atime) = @{$tok};
+  if ($tok_spam == 0 || $tok_ham > 0) {
+    $tokerror = 1;
+  }
+}
+ok(!$tokerror);
+
 $sa->{bayes_scanner}->{store}->untie_db();
 
 ok($sa->{bayes_scanner}->learn(0, $mail));
@@ -106,7 +119,7 @@
 ok($sa->{bayes_scanner}->{store}->tie_db_writable());
 
 $tokerror = 0;
-foreach my $tok (@toks) {
+foreach my $tok (keys %{$toks}) {
   my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
   if ($spam  > 0 || $ham == 0) {
     $tokerror = 1;

Modified: incubator/spamassassin/trunk/t/bayessql.t
==============================================================================
--- incubator/spamassassin/trunk/t/bayessql.t	(original)
+++ incubator/spamassassin/trunk/t/bayessql.t	Wed Apr 28 18:02:59 2004
@@ -16,7 +16,7 @@
     unshift(@INC, '../blib/lib');
   }
 
-  plan tests => ((TEST_ENABLED && HAS_DBI) ? 38 : 0);
+  plan tests => ((TEST_ENABLED && HAS_DBI) ? 40 : 0);
 
   onfail => sub {
     warn "\n\nNote: Failure may be due to an incorrect config.";
@@ -94,9 +94,9 @@
 
 ok($body);
 
-my @toks = $sa->{bayes_scanner}->tokenize($mail, $body);
+my $toks = $sa->{bayes_scanner}->tokenize($mail, $body);
 
-ok(scalar(@toks) > 0);
+ok(scalar(keys %{$toks}) > 0);
 
 my($msgid,$msgid_hdr) = $sa->{bayes_scanner}->get_msgid($mail);
 
@@ -123,7 +123,7 @@
 ok($sa->{bayes_scanner}->{store}->tie_db_writable());
 
 my $tokerror = 0;
-foreach my $tok (@toks) {
+foreach my $tok (keys %{$toks}) {
   my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
   if ($spam == 0 || $ham > 0) {
     $tokerror = 1;
@@ -131,6 +131,20 @@
 }
 ok(!$tokerror);
 
+my $tokens = $sa->{bayes_scanner}->{store}->tok_get_all(keys %{$toks});
+
+ok($tokens);
+
+$tokerror = 0;
+foreach my $tok (@{$tokens}) {
+  my ($token, $tok_spam, $tok_ham, $atime) = @{$tok};
+  if ($tok_spam == 0 || $tok_ham > 0) {
+    $tokerror = 1;
+  }
+}
+
+ok(!$tokerror);
+
 $sa->{bayes_scanner}->{store}->untie_db();
 
 ok($sa->{bayes_scanner}->learn(0, $mail));
@@ -144,7 +158,7 @@
 ok($sa->{bayes_scanner}->{store}->tie_db_writable());
 
 $tokerror = 0;
-foreach my $tok (@toks) {
+foreach my $tok (keys %{$toks}) {
   my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
   if ($spam  > 0 || $ham == 0) {
     $tokerror = 1;
@@ -292,18 +306,24 @@
     return 0;
   }
 
-  $rv = $dbh->do("DELETE FROM bayes_vars WHERE username = ?", undef, $testuser);
+  $rv = $dbh->do("DELETE FROM bayes_seen WHERE id = (SELECT id FROM bayes_vars WHERE username = ?)", undef, $testuser);
+  if (!defined($rv)) {
+    $error = 1;
+  }
+
+  $rv = $dbh->do("DELETE FROM bayes_token WHERE id = (SELECT id FROM bayes_vars WHERE username = ?)", undef, $testuser);
   if (!defined($rv)) {
     $error = 1;
   }
-  $rv = $dbh->do("DELETE FROM bayes_seen WHERE username = ?", undef, $testuser);
+
+  $rv = $dbh->do("DELETE FROM bayes_expire WHERE id = (SELECT id FROM bayes_vars WHERE username = ?)", undef, $testuser);
   if (!defined($rv)) {
     $error = 1;
   }
-  $rv = $dbh->do("DELETE FROM bayes_token WHERE username = ?", undef, $testuser);
+
+  $rv = $dbh->do("DELETE FROM bayes_vars WHERE username = ?", undef, $testuser);
   if (!defined($rv)) {
     $error = 1;
   }
-  $rv = $dbh->do("DELETE FROM bayes_expire WHERE username = ?", undef, $testuser);
   return !$error;
 }