You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2004/02/19 00:26:04 UTC

svn commit: rev 6733 - in incubator/spamassassin/trunk: lib/Mail/SpamAssassin t

Author: felicity
Date: Wed Feb 18 15:26:01 2004
New Revision: 6733

Modified:
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm
   incubator/spamassassin/trunk/t/bayesdbm.t
Log:
bug 3055: spammers are using the same message id to get around bayes being able to learn different messages.  make the hash message-id the default now, but be backwards compatible with the seen db.

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm	Wed Feb 18 15:26:01 2004
@@ -684,35 +684,49 @@
 # this function is trapped by the wrapper above
 sub learn_trapped {
   my ($self, $isspam, $msg, $body, $msgid) = @_;
+  my @msgid = ( $msgid );
 
-  $msgid ||= $self->get_msgid ($msg);
-  my $seen = $self->{store}->seen_get ($msgid);
-  if (defined ($seen)) {
-    if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) {
-      dbg ("$msgid: already learnt correctly, not learning twice");
-      return 0;
-    } elsif ($seen !~ /^[hs]$/) {
-      warn ("db_seen corrupt: value='$seen' for $msgid. ignored");
-    } else {
-      dbg ("$msgid: already learnt as opposite, forgetting first");
-
-      # kluge so that forget() won't untie the db on us ...
-      my $orig = $self->{main}->{learn_caller_will_untie};
-      $self->{main}->{learn_caller_will_untie} = 1;
-
-      my $fatal = !defined $self->forget ($msg);
+  if (!defined $msgid) {
+    @msgid = $self->get_msgid($msg);
+  }
 
-      # reset the value post-forget() ...
-      $self->{main}->{learn_caller_will_untie} = $orig;
+  foreach $msgid ( @msgid ) {
+    my $seen = $self->{store}->seen_get ($msgid);
 
-      # forget() gave us a fatal error, so propagate that up
-      if ($fatal) {
-        dbg("forget() returned a fatal error, so learn() will too");
-	return;
+    if (defined ($seen)) {
+      if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) {
+        dbg ("$msgid: already learnt correctly, not learning twice");
+        return 0;
+      } elsif ($seen !~ /^[hs]$/) {
+        warn ("db_seen corrupt: value='$seen' for $msgid. ignored");
+      } else {
+        dbg ("$msgid: already learnt as opposite, forgetting first");
+
+        # kluge so that forget() won't untie the db on us ...
+        my $orig = $self->{main}->{learn_caller_will_untie};
+        $self->{main}->{learn_caller_will_untie} = 1;
+
+        my $fatal = !defined $self->forget ($msg);
+
+        # reset the value post-forget() ...
+        $self->{main}->{learn_caller_will_untie} = $orig;
+    
+        # forget() gave us a fatal error, so propagate that up
+        if ($fatal) {
+          dbg("forget() returned a fatal error, so learn() will too");
+	  return;
+        }
       }
+
+      # we're only going to have seen this once, so stop if it's been
+      # seen already
+      last;
     }
   }
 
+  # Now that we're sure we haven't seen this message before ...
+  $msgid = $msgid[0];
+
   if ($isspam) {
     $self->{store}->nspam_nham_change (1, 0);
   } else {
@@ -790,27 +804,44 @@
 # this function is trapped by the wrapper above
 sub forget_trapped {
   my ($self, $msg, $body, $msgid) = @_;
-
-  $msgid ||= $self->get_msgid ($msg);
-  my $seen = $self->{store}->seen_get ($msgid);
+  my @msgid = ( $msgid );
   my $isspam;
-  if (defined ($seen)) {
-    if ($seen eq 's') {
-      $isspam = 1;
-    } elsif ($seen eq 'h') {
-      $isspam = 0;
-    } else {
-      dbg ("forget: message $msgid seen entry is neither ham nor spam, ignored");
-      return 0;
+
+  if (!defined $msgid) {
+    @msgid = $self->get_msgid($msg);
+  }
+
+  while( $msgid = shift @msgid ) {
+    my $seen = $self->{store}->seen_get ($msgid);
+
+    if (defined ($seen)) {
+      if ($seen eq 's') {
+        $isspam = 1;
+      } elsif ($seen eq 'h') {
+        $isspam = 0;
+      } else {
+        dbg ("forget: msgid $msgid seen entry is neither ham nor spam, ignored");
+        return 0;
+      }
+
+      # messages should only be learned once, so stop if we find a msgid
+      # which was seen before
+      last;
+    }
+    else {
+      dbg ("forget: msgid $msgid not learnt, ignored");
     }
-  } else {
-    dbg ("forget: message $msgid not learnt, ignored");
-    return 0;
   }
 
-  if ($isspam) {
+  # This message wasn't learnt before, so return
+  if (!defined $isspam) {
+    dbg("forget: no msgid from this message has been learnt, skipping message");
+    return 0;
+  }
+  elsif ($isspam) {
     $self->{store}->nspam_nham_change (-1, 0);
-  } else {
+  }
+  else {
     $self->{store}->nspam_nham_change (0, -1);
   }
 
@@ -836,32 +867,36 @@
 sub get_msgid {
   my ($self, $msg) = @_;
 
+  my @msgid = ();
+
   my $msgid = $msg->get_header("Message-Id");
-  if (!defined $msgid || $msgid eq '' || $msgid =~ /^\s*<\s*>.*$/) { # generate a best effort unique id
-    # Use sha1(Date:, last received: and top N bytes of body)
-    # where N is MIN(1024 bytes, 1/2 of body length)
-    #
-    my $date = $msg->get_header("Date");
-    $date = "None" if (!defined $date || $date eq ''); # No Date?
+  if (defined $msgid && $msgid ne '' && $msgid !~ /^\s*<\s*(?:\@sa_generated)?>.*$/) {
+    # remove \r and < and > prefix/suffixes
+    chomp $msgid;
+    $msgid =~ s/^<//; $msgid =~ s/>.*$//g;
+    push(@msgid, $msgid);
+  }
 
-    my @rcvd = $msg->get_header("Received");
-    my $rcvd = $rcvd[$#rcvd];
-    $rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received?
+  # Use sha1(Date:, last received: and top N bytes of body)
+  # where N is MIN(1024 bytes, 1/2 of body length)
+  #
+  my $date = $msg->get_header("Date");
+  $date = "None" if (!defined $date || $date eq ''); # No Date?
 
-    my $body = $msg->get_pristine_body();
-    if (length($body) > 64) { # Small Body?
-      my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) );
-      substr($body, $keep) = '';
-    }
+  my @rcvd = $msg->get_header("Received");
+  my $rcvd = $rcvd[$#rcvd];
+  $rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received?
 
-    $msgid = sha1($date."\000".$rcvd."\000".$body).'@sa_generated';
+  # Make a copy since pristine_body is a reference ...
+  my $body = join('', $msg->get_pristine_body());
+  if (length($body) > 64) { # Small Body?
+    my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) );
+    substr($body, $keep) = '';
   }
 
-  # remove \r and < and > prefix/suffixes
-  chomp $msgid;
-  $msgid =~ s/^<//; $msgid =~ s/>.*$//g;
+  unshift(@msgid, sha1($date."\000".$rcvd."\000".$body).'@sa_generated');
 
-  $msgid;
+  return wantarray ? @msgid : $msgid[0];
 }
 
 sub add_uris_for_permsgstatus {
@@ -886,7 +921,7 @@
 
   if (!defined $body) {
     # why?!
-    warn "failed to get body for ".$self->get_msgid($self->{msg})."\n";
+    warn "failed to get body for ".scalar($self->get_msgid($self->{msg}))."\n";
     return [ ];
   }
 

Modified: incubator/spamassassin/trunk/t/bayesdbm.t
==============================================================================
--- incubator/spamassassin/trunk/t/bayesdbm.t	(original)
+++ incubator/spamassassin/trunk/t/bayesdbm.t	Wed Feb 18 15:26:01 2004
@@ -16,7 +16,7 @@
     unshift(@INC, '../blib/lib');
   }
 
-  plan tests => (HAS_DB_FILE ? 43 : 0);
+  plan tests => (HAS_DB_FILE ? 44 : 0);
 };
 
 exit unless HAS_DB_FILE;
@@ -68,9 +68,11 @@
 
 ok(scalar(@toks) > 0);
 
-my $msgid = $sa->{bayes_scanner}->get_msgid($mail);
+my($msgid,$msgid_hdr) = $sa->{bayes_scanner}->get_msgid($mail);
 
-ok($msgid eq '9PS291LhupY');
+# $msgid is the generated hash messageid, $msgid_hdr is the Message-Id header ...
+ok($msgid eq '502e12b89b9c74074744ffc18a95d80cff2effcd@sa_generated');
+ok($msgid_hdr eq '9PS291LhupY');
 
 ok($sa->{bayes_scanner}->{store}->tie_db_writable());