You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2005/06/02 03:40:03 UTC

svn commit: r179482 - in /spamassassin/trunk: MANIFEST lib/Mail/SpamAssassin/BayesStore/DBM.pm t/bayessdbm_seen_delete.t

Author: jm
Date: Wed Jun  1 18:40:02 2005
New Revision: 179482

URL: http://svn.apache.org/viewcvs?rev=179482&view=rev
Log:
bug 2975: it is now possible to delete the bayes_seen database file(s) without breaking Bayes, to cope with its uncontrolled growth

Added:
    spamassassin/trunk/t/bayessdbm_seen_delete.t   (with props)
Modified:
    spamassassin/trunk/MANIFEST
    spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore/DBM.pm

Modified: spamassassin/trunk/MANIFEST
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/MANIFEST?rev=179482&r1=179481&r2=179482&view=diff
==============================================================================
--- spamassassin/trunk/MANIFEST (original)
+++ spamassassin/trunk/MANIFEST Wed Jun  1 18:40:02 2005
@@ -262,6 +262,7 @@
 t/bayesdbm.t
 t/bayesdbm_flock.t
 t/bayessdbm.t
+t/bayessdbm_seen_delete.t
 t/bayessql.t
 t/blacklist_autolearn.t
 t/body_mod.t

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore/DBM.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore/DBM.pm?rev=179482&r1=179481&r2=179482&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore/DBM.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/BayesStore/DBM.pm Wed Jun  1 18:40:02 2005
@@ -166,10 +166,25 @@
     my $name = $path.'_'.$dbname;
     my $db_var = 'db_'.$dbname;
     dbg("bayes: tie-ing to DB file R/O $name");
+
     # untie %{$self->{$db_var}} if (tied %{$self->{$db_var}});
-    tie %{$self->{$db_var}},$self->DBM_MODULE,$name, O_RDONLY,
-		 (oct($main->{conf}->{bayes_file_mode}) & 0666)
-       or goto failed_to_tie;
+    if (!tie %{$self->{$db_var}},$self->DBM_MODULE, $name, O_RDONLY,
+		 (oct($main->{conf}->{bayes_file_mode}) & 0666))
+    {
+      # bug 2975: it's acceptable for the db_seen to not be present,
+      # to allow it to be recycled.  if that's the case, just create
+      # a new, empty one. we don't need to lock it, since we won't
+      # be writing to it; let the R/W api deal with that case.
+
+      if ($dbname eq 'seen') {
+        tie %{$self->{$db_var}},$self->DBM_MODULE, $name, O_RDWR|O_CREAT,
+                    (oct($main->{conf}->{bayes_file_mode}) & 0666)
+          or goto failed_to_tie;
+      }
+      else {
+        goto failed_to_tie;
+      }
+    }
   }
 
   $self->{db_version} = ($self->get_storage_variables())[6];

Added: spamassassin/trunk/t/bayessdbm_seen_delete.t
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/t/bayessdbm_seen_delete.t?rev=179482&view=auto
==============================================================================
--- spamassassin/trunk/t/bayessdbm_seen_delete.t (added)
+++ spamassassin/trunk/t/bayessdbm_seen_delete.t Wed Jun  1 18:40:02 2005
@@ -0,0 +1,286 @@
+#!/usr/bin/perl
+
+use Data::Dumper;
+use lib '.'; use lib 't';
+use SATest; sa_t_init("bayessdbm_seen_delete");
+use Test;
+
+use constant HAS_SDBM_FILE => eval { require SDBM_File; };  # true only when SDBM_File can be loaded
+
+BEGIN {  # runs at compile time: adjust cwd and @INC, then declare the test plan
+  if (-e 't/test_dir') {  # t/test_dir is visible from here, so move into t/
+    chdir 't';
+  }
+
+  if (-e 'test_dir') {  # test_dir is in the cwd: search ../blib/lib first
+    unshift(@INC, '../blib/lib');
+  }
+
+  plan tests => (HAS_SDBM_FILE ? 54 : 0);  # plan 54 checks, or none when SDBM_File is unavailable
+};
+
+exit unless HAS_SDBM_FILE;  # nothing to run without SDBM_File
+
+tstlocalrules ("
+        bayes_store_module Mail::SpamAssassin::BayesStore::SDBM
+        bayes_learn_to_journal 0
+");  # phase 1 config: SDBM store module, bayes_learn_to_journal set to 0
+
+use Mail::SpamAssassin;
+
+my $sa = create_saobj();  # helper not defined in this file (presumably from SATest)
+
+$sa->init();
+
+ok($sa);
+
+ok($sa->{bayes_scanner});
+
+ok(!$sa->{bayes_scanner}->is_scan_available());  # presumably unavailable because nothing is learned yet
+
+open(MAIL,"< data/spam/001");  # NOTE(review): open is unchecked; a failure only shows up via ok($raw_message)
+
+my $raw_message = do {
+  local $/;  # undef record separator so <MAIL> returns the whole file at once
+  <MAIL>;
+};
+
+close(MAIL);
+ok($raw_message);
+
+my $mail = $sa->parse( $raw_message );
+
+ok($mail);
+
+my $body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+
+ok($body);
+
+my $toks = $sa->{bayes_scanner}->tokenize($mail, $body);  # hash ref; its keys are used as tokens below
+
+ok(scalar(keys %{$toks}) > 0);
+
+my($msgid,$msgid_hdr) = $sa->{bayes_scanner}->get_msgid($mail);
+
+# $msgid is the generated hash messageid
+# $msgid_hdr is the Message-Id header
+ok($msgid eq 'ce33e4a8bc5798c65428d6018380bae346c7c126@sa_generated');  # expected values for data/spam/001
+ok($msgid_hdr eq '9PS291LhupY');
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid));  # no seen entry for the message before learning
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->learn(1, $mail));  # first argument 1; the seen entry checked below becomes 's'
+
+ok(!$sa->{bayes_scanner}->learn(1, $mail));  # learning the same message the same way again must return false
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 's');
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+my $tokerror = 0;  # set to 1 as soon as any token has unexpected counts
+foreach my $tok (keys %{$toks}) {
+  my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
+  if ($spam == 0 || $ham > 0) {  # every token must have a spam count and no ham count
+    $tokerror = 1;
+  }
+}
+ok(!$tokerror);
+
+my $tokens = $sa->{bayes_scanner}->{store}->tok_get_all(keys %{$toks});  # same tokens, fetched in one call
+
+ok($tokens);
+
+$tokerror = 0;
+foreach my $tok (@{$tokens}) {
+  my ($token, $tok_spam, $tok_ham, $atime) = @{$tok};  # each entry is an array ref of four fields
+  if ($tok_spam == 0 || $tok_ham > 0) {
+    $tokerror = 1;
+  }
+}
+ok(!$tokerror);
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->learn(0, $mail));  # relearn the same message with first argument 0
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 'h');  # seen entry flips from 's' to 'h'
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+$tokerror = 0;
+foreach my $tok (keys %{$toks}) {
+  my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
+  if ($spam  > 0 || $ham == 0) {  # mirror of the earlier check: ham count only, no spam count
+    $tokerror = 1;
+  }
+}
+ok(!$tokerror);
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+ok($sa->{bayes_scanner}->forget($mail));
+
+ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+
+ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid));  # after forget() the seen entry is gone again
+
+$sa->{bayes_scanner}->{store}->untie_db();
+
+undef $sa;  # drop this instance before the next phase builds a new one
+
+sa_t_init('bayes'); # this wipes out what is there and begins anew
+
+# make sure we learn to a journal
+tstlocalrules ("
+        bayes_store_module Mail::SpamAssassin::BayesStore::SDBM
+        bayes_learn_to_journal 1
+");  # phase 2 config: same store module, bayes_learn_to_journal set to 1
+
+$sa = create_saobj();
+
+$sa->init();
+
+ok(!-e 'log/user_state/bayes_journal');  # no journal file before anything is learned
+
+ok($sa->{bayes_scanner}->learn(1, $mail));  # reuses $mail parsed earlier from data/spam/001
+
+ok(-e 'log/user_state/bayes_journal');  # learning created the journal file
+
+$sa->{bayes_scanner}->sync(1); # always returns 0, so no need to check return
+
+ok(!-e 'log/user_state/bayes_journal');  # journal file is gone after sync
+
+ok(-e 'log/user_state/bayes_seen.pag');  # both the seen and toks databases exist as .pag/.dir pairs
+ok(-e 'log/user_state/bayes_seen.dir');
+
+ok(-e 'log/user_state/bayes_toks.pag');
+ok(-e 'log/user_state/bayes_toks.dir');
+
+undef $sa;  # drop this instance before the next phase
+
+sa_t_init('bayes'); # this wipes out what is there and begins anew
+
+# learn without a journal this time, and set the minimum spam/ham counts to 10
+tstlocalrules ("
+        bayes_store_module Mail::SpamAssassin::BayesStore::SDBM
+bayes_learn_to_journal 0
+bayes_min_spam_num 10
+bayes_min_ham_num 10
+");
+
+# Reuse the existing pattern-matching code here: it lets us provide our own
+# checking callback (check_examined) and keep using the existing ok_all_patterns call
+%patterns = ( 1 => 'Acted on message' );  # the value is the %found key that check_examined increments
+
+ok(salearnrun("--spam data/spam", \&check_examined));
+ok_all_patterns();
+
+ok(salearnrun("--ham data/nice", \&check_examined));
+ok_all_patterns();
+
+ok(salearnrun("--ham data/whitelists", \&check_examined));
+ok_all_patterns();
+
+%patterns = ( 'non-token data: bayes db version' => 'db version' );  # switch to the stock callback for --dump magic
+ok(salearnrun("--dump magic", \&patterns_run_cb));
+ok_all_patterns();
+
+# now delete the journal and bayes_seen -- should still be possible
+# for Bayes to continue...
+unlink 'log/user_state/bayes_journal';  # result deliberately not checked, unlike the two unlinks below
+ok(unlink 'log/user_state/bayes_seen.pag');  # bug 2975: remove both halves of the seen database
+ok(unlink 'log/user_state/bayes_seen.dir');
+
+use constant SCAN_USING_PERL_CODE_TEST => 1;  # compile-time switch for the in-process scan checks below
+
+if (SCAN_USING_PERL_CODE_TEST) {
+$sa = create_saobj();  # fresh instance, created after the bayes_seen files were unlinked
+
+$sa->init();
+
+open(MAIL,"< ../sample-nonspam.txt");  # NOTE(review): open is unchecked
+
+$raw_message = do {
+  local $/;  # undef record separator so <MAIL> returns the whole file at once
+  <MAIL>;
+};
+
+close(MAIL);
+
+$mail = $sa->parse( $raw_message );
+
+$body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+
+my $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
+
+ok($msgstatus);
+
+my $score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body);
+
+# Pretty much we can't count on the data returned with such little training
+# so just make sure that the score wasn't equal to .5 which is the default
+# return value.
+print "\treturned score: $score\n";
+ok($score =~ /\d/ && $score <= 1.0 && $score != .5);  # contains a digit, at most 1.0, and not the .5 default
+
+open(MAIL,"< ../sample-spam.txt");  # same checks again with the spam sample; open is unchecked here too
+
+$raw_message = do {
+  local $/;
+  <MAIL>;
+};
+
+close(MAIL);
+
+$mail = $sa->parse( $raw_message );
+
+$body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+
+$msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);  # no ok($msgstatus) check this time
+
+$score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body);
+
+# Pretty much we can't count on the data returned with such little training
+# so just make sure that the score wasn't equal to .5 which is the default
+# return value.
+print "\treturned score: $score\n";
+ok($score =~ /\d/ && $score <= 1.0 && $score != .5);
+
+}
+
+ok($sa->{bayes_scanner}->{store}->clear_database());  # NOTE(review): $sa is only re-created inside the SCAN_USING_PERL_CODE_TEST block
+
+ok(!-e 'log/user_state/bayes_journal');  # after clear_database() none of the Bayes files may remain
+ok(!-e 'log/user_state/bayes_seen.pag');
+ok(!-e 'log/user_state/bayes_seen.dir');
+ok(!-e 'log/user_state/bayes_toks.pag');
+ok(!-e 'log/user_state/bayes_toks.dir');
+
+sub check_examined {  # callback passed to salearnrun(): notes whether the output reports learned/forgotten messages
+  local ($_);  # keep the caller's $_ untouched
+  my $string = shift;  # optional text to inspect
+
+  if (defined $string) {
+    $_ = $string;
+  } else {
+    $_ = join ('', <IN>);  # no argument given: read everything from the IN filehandle instead
+  }
+
+  if ($_ =~ /(?:Forgot|Learned) tokens from \d+ message\(s\) \(\d+ message\(s\) examined\)/) {
+    $found{'Acted on message'}++;  # same string as the %patterns value; presumably what ok_all_patterns() looks up
+  }
+}
+
+

Propchange: spamassassin/trunk/t/bayessdbm_seen_delete.t
------------------------------------------------------------------------------
    svn:executable = *