You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2019/08/09 12:00:05 UTC

svn commit: r1864788 - in /spamassassin/trunk: ./ lib/Mail/SpamAssassin/ lib/Mail/SpamAssassin/Plugin/ t/

Author: hege
Date: Fri Aug  9 12:00:04 2019
New Revision: 1864788

URL: http://svn.apache.org/viewvc?rev=1864788&view=rev
Log:
Bug 5185, 7187:
- Added Message::get_pristine_body_digest(), Message::get_msgid(), Message::generate_msgid() functions
- Removed Plugin::Bayes::get_msgid() function
- Fix TxRep and Bayes usage of above
- generate_msgid() now uses To + Date + whole LF normalized pristine_body

Modified:
    spamassassin/trunk/UPGRADE
    spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TxRep.pm
    spamassassin/trunk/t/bayesbdb.t
    spamassassin/trunk/t/bayesdbm.t
    spamassassin/trunk/t/bayesdbm_flock.t
    spamassassin/trunk/t/bayessdbm.t
    spamassassin/trunk/t/bayessdbm_seen_delete.t
    spamassassin/trunk/t/bayessql.t

Modified: spamassassin/trunk/UPGRADE
URL: http://svn.apache.org/viewvc/spamassassin/trunk/UPGRADE?rev=1864788&r1=1864787&r2=1864788&view=diff
==============================================================================
--- spamassassin/trunk/UPGRADE (original)
+++ spamassassin/trunk/UPGRADE Fri Aug  9 12:00:04 2019
@@ -125,6 +125,14 @@ Note for Users Upgrading to SpamAssassin
 - New dns_options nov4, nov6 (must set nov6 if resolver is filtering AAAA
   replies).
 
+- API: Added Message::get_pristine_body_digest(), Message::get_msgid(),
+  Message::generate_msgid() functions, removed deprecated private
+  Plugin::Bayes::get_msgid() function.
+
+- Bayes and TxRep seen (Message-ID tracking) hashing method changed.
+  No actions are required. If re-learning some old messages, they might
+  be learned twice. Old IDs should expire automatically.
+
 Note for Users Upgrading to SpamAssassin 3.4.2
 ----------------------------------------------
 

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm?rev=1864788&r1=1864787&r2=1864788&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm Fri Aug  9 12:00:04 2019
@@ -528,6 +528,77 @@ sub get_pristine_body {
   return $self->{pristine_body};
 }
 
+=item get_pristine_body_digest()
+
+Returns SHA1 hex digest of the pristine message body.
+CRLF line endings are normalized to LF before hashing.
+
+=cut
+
+sub get_pristine_body_digest {
+  my ($self) = @_;
+
+  return $self->{pristine_body_digest} if exists $self->{pristine_body_digest};
+
+  if ($self->{line_ending} eq "\015\012") {
+    # Don't make a copy, process line by line to save memory
+    # CRLF should be exception, so it's not that critical here
+    my $sha = Digest::SHA->new('sha1');
+    while ($self->{pristine_body} =~ /(.*?)(\015\012)?/gs) {
+      $sha->add($1.(defined $2 ? "\012" : ""));
+    }
+    $self->{pristine_body_digest} = $sha->hexdigest;
+  } else {
+    $self->{pristine_body_digest} = sha1_hex($self->{pristine_body});
+  }
+
+  dbg("message: pristine body digest: ".$self->{pristine_body_digest});
+  return $self->{pristine_body_digest};
+}
+
+# ---------------------------------------------------------------------------
+
+=item get_msgid()
+
+Returns Message-ID header for the message, with <> and surrounding
+whitespace removed. Returns undef, if nothing found between <>.
+
+=cut
+
+sub get_msgid {
+  my ($self) = @_;
+
+  my $msgid = $self->get_header("Message-Id");
+  if (defined $msgid && $msgid =~ /^\s*<(.+)>\s*$/s) {
+    return $1;
+  } else {
+    return undef;
+  }
+}
+
+=item generate_msgid()
+
+Generate a calculated "Message-ID" in B<sha1hex@sa_generated> format, using
+To, Date headers and pristine body as source for hashing.
+
+=cut
+
+sub generate_msgid {
+  my ($self) = @_;
+
+  return $self->{msgid_generated} if exists $self->{msgid_generated};
+
+  # See Bug 5185, not using Received headers etc anymore
+  my $to = $self->get_header("To") || '';
+  my $date = $self->get_header("Date") || '';
+  my $body_digest = $self->get_pristine_body_digest();
+
+  $self->{msgid_generated} =
+    sha1_hex($to."\000".$date."\000".$body_digest).'@sa_generated';
+
+  return $self->{msgid_generated};
+}
+
 # ---------------------------------------------------------------------------
 
 =item extract_message_metadata($permsgstatus)

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm?rev=1864788&r1=1864787&r2=1864788&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm Fri Aug  9 12:00:04 2019
@@ -406,10 +406,11 @@ sub _learn_trapped {
   my @msgid = ( $msgid );
 
   if (!defined $msgid) {
-    @msgid = $self->get_msgid($msg);
+    @msgid = ( $msg->generate_msgid(), $msg->get_msgid() );
   }
 
   foreach my $msgid_t ( @msgid ) {
+    next if !defined $msgid_t;
     my $seen = $self->{store}->seen_get ($msgid_t);
 
     if (defined ($seen)) {
@@ -541,7 +542,7 @@ sub _forget_trapped {
   my $isspam;
 
   if (!defined $msgid) {
-    @msgid = $self->get_msgid($msg);
+    @msgid = ( $msg->generate_msgid(), $msg->get_msgid() );
   }
 
   while( $msgid = shift @msgid ) {
@@ -960,49 +961,6 @@ sub learner_dump_database {
 ###########################################################################
 # TODO: these are NOT public, but the test suite needs to call them.
 
-sub get_msgid {
-  my ($self, $msg) = @_;
-
-  my @msgid;
-
-  my $msgid = $msg->get_header("Message-Id");
-  if (defined $msgid && $msgid ne '' && $msgid !~ /^\s*<\s*(?:\@sa_generated)?>.*$/) {
-    # remove \r and < and > prefix/suffixes
-    chomp $msgid;
-    $msgid =~ s/^<//; $msgid =~ s/>.*$//g;
-    push(@msgid, $msgid);
-  }
-
-  # Modified 2012-01-17  per bug 5185 to remove last received from msg_id calculation
-
-  # Use sha1_hex(Date: and top N bytes of body)
-  # where N is MIN(1024 bytes, 1/2 of body length)
-  #
-  my $date = $msg->get_header("Date");
-  $date = "None" if (!defined $date || $date eq ''); # No Date?
-
-  #Removed per bug 5185
-  #my @rcvd = $msg->get_header("Received");
-  #my $rcvd = $rcvd[$#rcvd];
-  #$rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received?
-
-  # Make a copy since pristine_body is a reference ...
-  my $body = join('', $msg->get_pristine_body());
-
-  if (length($body) > 64) { # Small Body?
-    my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) );
-    substr($body, $keep) = '';
-  }
-
-  #Stripping all CR and LF so that testing midstream from MTA and post delivery don't 
-  #generate different id's simply because of LF<->CR<->CRLF changes.
-  $body =~ s/[\r\n]//g;
-
-  unshift(@msgid, sha1_hex($date."\000".$body).'@sa_generated');
-
-  return wantarray ? @msgid : $msgid[0];
-}
-
 sub get_body_from_msg {
   my ($self, $msg) = @_;
 
@@ -1020,7 +978,7 @@ sub get_body_from_msg {
 
   if (!defined $msgdata) {
     # why?!
-    warn "bayes: failed to get body for ".scalar($self->get_msgid($self->{msg}))."\n";
+    warn "bayes: failed to get body for ".scalar($self->{msg}->generate_msgid())."\n";
     return { };
   }
 

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TxRep.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TxRep.pm?rev=1864788&r1=1864787&r2=1864788&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TxRep.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TxRep.pm Fri Aug  9 12:00:04 2019
@@ -1245,9 +1245,7 @@ sub check_senders_reputation {
   my $timer    = $self->{main}->time_method("total_txrep");
   my $msgscore = (defined $self->{learning})? $self->{learning} : $pms->get_autolearn_points();
   my $date     = $pms->{msg}->receive_date() || $pms->{date_header_time};
-  my $msg_id   = $self->{msgid} ||
-                 Mail::SpamAssassin::Plugin::Bayes->get_msgid($pms->{msg}) ||
-                 $pms->get('Message-Id') || $pms->get('Message-ID') || $pms->get('MESSAGE-ID') || $pms->get('MESSAGEID');
+  my $msg_id   = $self->{msgid} || $pms->{msg}->generate_msgid();
 
   my $from   = lc $pms->get('From:addr') || $pms->get('EnvelopeFrom:addr');;
   return 0 unless $from =~ /\S/;

Modified: spamassassin/trunk/t/bayesbdb.t
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/bayesbdb.t?rev=1864788&r1=1864787&r2=1864788&view=diff
==============================================================================
--- spamassassin/trunk/t/bayesbdb.t (original)
+++ spamassassin/trunk/t/bayesbdb.t Fri Aug  9 12:00:04 2019
@@ -69,11 +69,12 @@ my $toks = getimpl->tokenize($mail, $bod
 
 ok(scalar(keys %{$toks}) > 0);
 
-my($msgid,$msgid_hdr) = getimpl->get_msgid($mail);
+my $msgid = $mail->generate_msgid();
+my $msgid_hdr = $mail->get_msgid();
 
 # $msgid is the generated hash messageid
 # $msgid_hdr is the Message-Id header
-ok($msgid eq '4cf5cc4d53b22e94d3e55932a606b18641a54041@sa_generated')
+ok($msgid eq '71f849915d7e469ddc1890cd8175f6876843f99e@sa_generated')
     or warn "got: [$msgid]";
 ok($msgid_hdr eq '9PS291LhupY');
 

Modified: spamassassin/trunk/t/bayesdbm.t
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/bayesdbm.t?rev=1864788&r1=1864787&r2=1864788&view=diff
==============================================================================
--- spamassassin/trunk/t/bayesdbm.t (original)
+++ spamassassin/trunk/t/bayesdbm.t Fri Aug  9 12:00:04 2019
@@ -62,11 +62,12 @@ my $toks = getimpl->tokenize($mail, $bod
 
 ok(scalar(keys %{$toks}) > 0);
 
-my($msgid,$msgid_hdr) = getimpl->get_msgid($mail);
+my $msgid = $mail->generate_msgid();
+my $msgid_hdr = $mail->get_msgid();
 
 # $msgid is the generated hash messageid
 # $msgid_hdr is the Message-Id header
-ok($msgid eq '4cf5cc4d53b22e94d3e55932a606b18641a54041@sa_generated')
+ok($msgid eq '71f849915d7e469ddc1890cd8175f6876843f99e@sa_generated')
     or warn "got: [$msgid]";
 ok($msgid_hdr eq '9PS291LhupY');
 

Modified: spamassassin/trunk/t/bayesdbm_flock.t
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/bayesdbm_flock.t?rev=1864788&r1=1864787&r2=1864788&view=diff
==============================================================================
--- spamassassin/trunk/t/bayesdbm_flock.t (original)
+++ spamassassin/trunk/t/bayesdbm_flock.t Fri Aug  9 12:00:04 2019
@@ -65,11 +65,12 @@ my $toks = getimpl->tokenize($mail, $bod
 
 ok(scalar(keys %{$toks}) > 0);
 
-my($msgid,$msgid_hdr) = getimpl->get_msgid($mail);
+my $msgid = $mail->generate_msgid();
+my $msgid_hdr = $mail->get_msgid();
 
 # $msgid is the generated hash messageid
 # $msgid_hdr is the Message-Id header
-ok($msgid eq '4cf5cc4d53b22e94d3e55932a606b18641a54041@sa_generated');
+ok($msgid eq '71f849915d7e469ddc1890cd8175f6876843f99e@sa_generated');
 ok($msgid_hdr eq '9PS291LhupY');
 
 ok(getimpl->{store}->tie_db_writable());

Modified: spamassassin/trunk/t/bayessdbm.t
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/bayessdbm.t?rev=1864788&r1=1864787&r2=1864788&view=diff
==============================================================================
--- spamassassin/trunk/t/bayessdbm.t (original)
+++ spamassassin/trunk/t/bayessdbm.t Fri Aug  9 12:00:04 2019
@@ -64,11 +64,12 @@ my $toks = getimpl->tokenize($mail, $bod
 
 ok(scalar(keys %{$toks}) > 0);
 
-my($msgid,$msgid_hdr) = getimpl->get_msgid($mail);
+my $msgid = $mail->generate_msgid();
+my $msgid_hdr = $mail->get_msgid();
 
 # $msgid is the generated hash messageid
 # $msgid_hdr is the Message-Id header
-ok($msgid eq '4cf5cc4d53b22e94d3e55932a606b18641a54041@sa_generated');
+ok($msgid eq '71f849915d7e469ddc1890cd8175f6876843f99e@sa_generated');
 ok($msgid_hdr eq '9PS291LhupY');
 
 ok(getimpl->{store}->tie_db_writable());

Modified: spamassassin/trunk/t/bayessdbm_seen_delete.t
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/bayessdbm_seen_delete.t?rev=1864788&r1=1864787&r2=1864788&view=diff
==============================================================================
--- spamassassin/trunk/t/bayessdbm_seen_delete.t (original)
+++ spamassassin/trunk/t/bayessdbm_seen_delete.t Fri Aug  9 12:00:04 2019
@@ -64,11 +64,12 @@ my $toks = getimpl->tokenize($mail, $bod
 
 ok(scalar(keys %{$toks}) > 0);
 
-my($msgid,$msgid_hdr) = getimpl->get_msgid($mail);
+my $msgid = $mail->generate_msgid();
+my $msgid_hdr = $mail->get_msgid();
 
 # $msgid is the generated hash messageid
 # $msgid_hdr is the Message-Id header
-ok($msgid eq '4cf5cc4d53b22e94d3e55932a606b18641a54041@sa_generated');
+ok($msgid eq '71f849915d7e469ddc1890cd8175f6876843f99e@sa_generated');
 ok($msgid_hdr eq '9PS291LhupY');
 
 ok(getimpl->{store}->tie_db_writable());

Modified: spamassassin/trunk/t/bayessql.t
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/bayessql.t?rev=1864788&r1=1864787&r2=1864788&view=diff
==============================================================================
--- spamassassin/trunk/t/bayessql.t (original)
+++ spamassassin/trunk/t/bayessql.t Fri Aug  9 12:00:04 2019
@@ -147,11 +147,12 @@ my $toks = getimpl->tokenize($mail, $bod
 
 ok(scalar(keys %{$toks}) > 0);
 
-my($msgid,$msgid_hdr) = getimpl->get_msgid($mail);
+my $msgid = $mail->generate_msgid();
+my $msgid_hdr = $mail->get_msgid();
 
 # $msgid is the generated hash messageid
 # $msgid_hdr is the Message-Id header
-ok($msgid eq '4cf5cc4d53b22e94d3e55932a606b18641a54041@sa_generated');
+ok($msgid eq '71f849915d7e469ddc1890cd8175f6876843f99e@sa_generated');
 ok($msgid_hdr eq '9PS291LhupY');
 
 ok(getimpl->{store}->tie_db_writable());