You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2019/08/09 12:00:05 UTC
svn commit: r1864788 - in /spamassassin/trunk: ./ lib/Mail/SpamAssassin/
lib/Mail/SpamAssassin/Plugin/ t/
Author: hege
Date: Fri Aug 9 12:00:04 2019
New Revision: 1864788
URL: http://svn.apache.org/viewvc?rev=1864788&view=rev
Log:
Bug 5185, 7187:
- Added Message::get_pristine_body_digest(), Message::get_msgid(), Message::generate_msgid() functions
- Removed Plugin::Bayes::get_msgid() function
- Fix TxRep and Bayes usage of above
- generate_msgid() now uses To + Date + whole LF normalized pristine_body
Modified:
spamassassin/trunk/UPGRADE
spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TxRep.pm
spamassassin/trunk/t/bayesbdb.t
spamassassin/trunk/t/bayesdbm.t
spamassassin/trunk/t/bayesdbm_flock.t
spamassassin/trunk/t/bayessdbm.t
spamassassin/trunk/t/bayessdbm_seen_delete.t
spamassassin/trunk/t/bayessql.t
Modified: spamassassin/trunk/UPGRADE
URL: http://svn.apache.org/viewvc/spamassassin/trunk/UPGRADE?rev=1864788&r1=1864787&r2=1864788&view=diff
==============================================================================
--- spamassassin/trunk/UPGRADE (original)
+++ spamassassin/trunk/UPGRADE Fri Aug 9 12:00:04 2019
@@ -125,6 +125,14 @@ Note for Users Upgrading to SpamAssassin
- New dns_options nov4, nov6 (must set nov6 if resolver is filtering AAAA
replies).
+- API: Added Message::get_pristine_body_digest(), Message::get_msgid(),
+ Message::generate_msgid() functions, removed deprecated private
+ Plugin::Bayes::get_msgid() function.
+
+- Bayes and TxRep seen (Message-ID tracking) hashing method changed.
+ No actions are required. If re-learning some old messages, they might
+ be learned twice. Old IDs should expire automatically.
+
Note for Users Upgrading to SpamAssassin 3.4.2
----------------------------------------------
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm?rev=1864788&r1=1864787&r2=1864788&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm Fri Aug 9 12:00:04 2019
@@ -528,6 +528,77 @@ sub get_pristine_body {
return $self->{pristine_body};
}
+=item get_pristine_body_digest()
+
+Returns SHA1 hex digest of the pristine message body.
+CRLF line endings are normalized to LF before hashing.
+
+=cut
+
+sub get_pristine_body_digest {
+ my ($self) = @_;
+
+ return $self->{pristine_body_digest} if exists $self->{pristine_body_digest};
+
+ if ($self->{line_ending} eq "\015\012") {
+ # Don't make a copy, process line by line to save memory
+ # CRLF should be exception, so it's not that critical here
+ my $sha = Digest::SHA->new('sha1');
+ while ($self->{pristine_body} =~ /(.*?)(\015\012)?/gs) {
+ $sha->add($1.(defined $2 ? "\012" : ""));
+ }
+ $self->{pristine_body_digest} = $sha->hexdigest;
+ } else {
+ $self->{pristine_body_digest} = sha1_hex($self->{pristine_body});
+ }
+
+ dbg("message: pristine body digest: ".$self->{pristine_body_digest});
+ return $self->{pristine_body_digest};
+}
+
+# ---------------------------------------------------------------------------
+
+=item get_msgid()
+
+Returns Message-ID header for the message, with <> and surrounding
+whitespace removed. Returns undef, if nothing found between <>.
+
+=cut
+
+sub get_msgid {
+ my ($self) = @_;
+
+ my $msgid = $self->get_header("Message-Id");
+ if (defined $msgid && $msgid =~ /^\s*<(.+)>\s*$/s) {
+ return $1;
+ } else {
+ return undef;
+ }
+}
+
+=item generate_msgid()
+
+Generate a calculated "Message-ID" in B<sha1hex@sa_generated> format, using
+To, Date headers and pristine body as source for hashing.
+
+=cut
+
+sub generate_msgid {
+ my ($self) = @_;
+
+ return $self->{msgid_generated} if exists $self->{msgid_generated};
+
+ # See Bug 5185, not using Received headers etc anymore
+ my $to = $self->get_header("To") || '';
+ my $date = $self->get_header("Date") || '';
+ my $body_digest = $self->get_pristine_body_digest();
+
+ $self->{msgid_generated} =
+ sha1_hex($to."\000".$date."\000".$body_digest).'@sa_generated';
+
+ return $self->{msgid_generated};
+}
+
# ---------------------------------------------------------------------------
=item extract_message_metadata($permsgstatus)
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm?rev=1864788&r1=1864787&r2=1864788&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm Fri Aug 9 12:00:04 2019
@@ -406,10 +406,11 @@ sub _learn_trapped {
my @msgid = ( $msgid );
if (!defined $msgid) {
- @msgid = $self->get_msgid($msg);
+ @msgid = ( $msg->generate_msgid(), $msg->get_msgid() );
}
foreach my $msgid_t ( @msgid ) {
+ next if !defined $msgid_t;
my $seen = $self->{store}->seen_get ($msgid_t);
if (defined ($seen)) {
@@ -541,7 +542,7 @@ sub _forget_trapped {
my $isspam;
if (!defined $msgid) {
- @msgid = $self->get_msgid($msg);
+ @msgid = ( $msg->generate_msgid(), $msg->get_msgid() );
}
while( $msgid = shift @msgid ) {
@@ -960,49 +961,6 @@ sub learner_dump_database {
###########################################################################
# TODO: these are NOT public, but the test suite needs to call them.
-sub get_msgid {
- my ($self, $msg) = @_;
-
- my @msgid;
-
- my $msgid = $msg->get_header("Message-Id");
- if (defined $msgid && $msgid ne '' && $msgid !~ /^\s*<\s*(?:\@sa_generated)?>.*$/) {
- # remove \r and < and > prefix/suffixes
- chomp $msgid;
- $msgid =~ s/^<//; $msgid =~ s/>.*$//g;
- push(@msgid, $msgid);
- }
-
- # Modified 2012-01-17 per bug 5185 to remove last received from msg_id calculation
-
- # Use sha1_hex(Date: and top N bytes of body)
- # where N is MIN(1024 bytes, 1/2 of body length)
- #
- my $date = $msg->get_header("Date");
- $date = "None" if (!defined $date || $date eq ''); # No Date?
-
- #Removed per bug 5185
- #my @rcvd = $msg->get_header("Received");
- #my $rcvd = $rcvd[$#rcvd];
- #$rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received?
-
- # Make a copy since pristine_body is a reference ...
- my $body = join('', $msg->get_pristine_body());
-
- if (length($body) > 64) { # Small Body?
- my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) );
- substr($body, $keep) = '';
- }
-
- #Stripping all CR and LF so that testing midstream from MTA and post delivery don't
- #generate different id's simply because of LF<->CR<->CRLF changes.
- $body =~ s/[\r\n]//g;
-
- unshift(@msgid, sha1_hex($date."\000".$body).'@sa_generated');
-
- return wantarray ? @msgid : $msgid[0];
-}
-
sub get_body_from_msg {
my ($self, $msg) = @_;
@@ -1020,7 +978,7 @@ sub get_body_from_msg {
if (!defined $msgdata) {
# why?!
- warn "bayes: failed to get body for ".scalar($self->get_msgid($self->{msg}))."\n";
+ warn "bayes: failed to get body for ".scalar($self->{msg}->generate_msgid())."\n";
return { };
}
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TxRep.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TxRep.pm?rev=1864788&r1=1864787&r2=1864788&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TxRep.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TxRep.pm Fri Aug 9 12:00:04 2019
@@ -1245,9 +1245,7 @@ sub check_senders_reputation {
my $timer = $self->{main}->time_method("total_txrep");
my $msgscore = (defined $self->{learning})? $self->{learning} : $pms->get_autolearn_points();
my $date = $pms->{msg}->receive_date() || $pms->{date_header_time};
- my $msg_id = $self->{msgid} ||
- Mail::SpamAssassin::Plugin::Bayes->get_msgid($pms->{msg}) ||
- $pms->get('Message-Id') || $pms->get('Message-ID') || $pms->get('MESSAGE-ID') || $pms->get('MESSAGEID');
+ my $msg_id = $self->{msgid} || $pms->{msg}->generate_msgid();
my $from = lc $pms->get('From:addr') || $pms->get('EnvelopeFrom:addr');;
return 0 unless $from =~ /\S/;
Modified: spamassassin/trunk/t/bayesbdb.t
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/bayesbdb.t?rev=1864788&r1=1864787&r2=1864788&view=diff
==============================================================================
--- spamassassin/trunk/t/bayesbdb.t (original)
+++ spamassassin/trunk/t/bayesbdb.t Fri Aug 9 12:00:04 2019
@@ -69,11 +69,12 @@ my $toks = getimpl->tokenize($mail, $bod
ok(scalar(keys %{$toks}) > 0);
-my($msgid,$msgid_hdr) = getimpl->get_msgid($mail);
+my $msgid = $mail->generate_msgid();
+my $msgid_hdr = $mail->get_msgid();
# $msgid is the generated hash messageid
# $msgid_hdr is the Message-Id header
-ok($msgid eq '4cf5cc4d53b22e94d3e55932a606b18641a54041@sa_generated')
+ok($msgid eq '71f849915d7e469ddc1890cd8175f6876843f99e@sa_generated')
or warn "got: [$msgid]";
ok($msgid_hdr eq '9PS291LhupY');
Modified: spamassassin/trunk/t/bayesdbm.t
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/bayesdbm.t?rev=1864788&r1=1864787&r2=1864788&view=diff
==============================================================================
--- spamassassin/trunk/t/bayesdbm.t (original)
+++ spamassassin/trunk/t/bayesdbm.t Fri Aug 9 12:00:04 2019
@@ -62,11 +62,12 @@ my $toks = getimpl->tokenize($mail, $bod
ok(scalar(keys %{$toks}) > 0);
-my($msgid,$msgid_hdr) = getimpl->get_msgid($mail);
+my $msgid = $mail->generate_msgid();
+my $msgid_hdr = $mail->get_msgid();
# $msgid is the generated hash messageid
# $msgid_hdr is the Message-Id header
-ok($msgid eq '4cf5cc4d53b22e94d3e55932a606b18641a54041@sa_generated')
+ok($msgid eq '71f849915d7e469ddc1890cd8175f6876843f99e@sa_generated')
or warn "got: [$msgid]";
ok($msgid_hdr eq '9PS291LhupY');
Modified: spamassassin/trunk/t/bayesdbm_flock.t
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/bayesdbm_flock.t?rev=1864788&r1=1864787&r2=1864788&view=diff
==============================================================================
--- spamassassin/trunk/t/bayesdbm_flock.t (original)
+++ spamassassin/trunk/t/bayesdbm_flock.t Fri Aug 9 12:00:04 2019
@@ -65,11 +65,12 @@ my $toks = getimpl->tokenize($mail, $bod
ok(scalar(keys %{$toks}) > 0);
-my($msgid,$msgid_hdr) = getimpl->get_msgid($mail);
+my $msgid = $mail->generate_msgid();
+my $msgid_hdr = $mail->get_msgid();
# $msgid is the generated hash messageid
# $msgid_hdr is the Message-Id header
-ok($msgid eq '4cf5cc4d53b22e94d3e55932a606b18641a54041@sa_generated');
+ok($msgid eq '71f849915d7e469ddc1890cd8175f6876843f99e@sa_generated');
ok($msgid_hdr eq '9PS291LhupY');
ok(getimpl->{store}->tie_db_writable());
Modified: spamassassin/trunk/t/bayessdbm.t
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/bayessdbm.t?rev=1864788&r1=1864787&r2=1864788&view=diff
==============================================================================
--- spamassassin/trunk/t/bayessdbm.t (original)
+++ spamassassin/trunk/t/bayessdbm.t Fri Aug 9 12:00:04 2019
@@ -64,11 +64,12 @@ my $toks = getimpl->tokenize($mail, $bod
ok(scalar(keys %{$toks}) > 0);
-my($msgid,$msgid_hdr) = getimpl->get_msgid($mail);
+my $msgid = $mail->generate_msgid();
+my $msgid_hdr = $mail->get_msgid();
# $msgid is the generated hash messageid
# $msgid_hdr is the Message-Id header
-ok($msgid eq '4cf5cc4d53b22e94d3e55932a606b18641a54041@sa_generated');
+ok($msgid eq '71f849915d7e469ddc1890cd8175f6876843f99e@sa_generated');
ok($msgid_hdr eq '9PS291LhupY');
ok(getimpl->{store}->tie_db_writable());
Modified: spamassassin/trunk/t/bayessdbm_seen_delete.t
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/bayessdbm_seen_delete.t?rev=1864788&r1=1864787&r2=1864788&view=diff
==============================================================================
--- spamassassin/trunk/t/bayessdbm_seen_delete.t (original)
+++ spamassassin/trunk/t/bayessdbm_seen_delete.t Fri Aug 9 12:00:04 2019
@@ -64,11 +64,12 @@ my $toks = getimpl->tokenize($mail, $bod
ok(scalar(keys %{$toks}) > 0);
-my($msgid,$msgid_hdr) = getimpl->get_msgid($mail);
+my $msgid = $mail->generate_msgid();
+my $msgid_hdr = $mail->get_msgid();
# $msgid is the generated hash messageid
# $msgid_hdr is the Message-Id header
-ok($msgid eq '4cf5cc4d53b22e94d3e55932a606b18641a54041@sa_generated');
+ok($msgid eq '71f849915d7e469ddc1890cd8175f6876843f99e@sa_generated');
ok($msgid_hdr eq '9PS291LhupY');
ok(getimpl->{store}->tie_db_writable());
Modified: spamassassin/trunk/t/bayessql.t
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/bayessql.t?rev=1864788&r1=1864787&r2=1864788&view=diff
==============================================================================
--- spamassassin/trunk/t/bayessql.t (original)
+++ spamassassin/trunk/t/bayessql.t Fri Aug 9 12:00:04 2019
@@ -147,11 +147,12 @@ my $toks = getimpl->tokenize($mail, $bod
ok(scalar(keys %{$toks}) > 0);
-my($msgid,$msgid_hdr) = getimpl->get_msgid($mail);
+my $msgid = $mail->generate_msgid();
+my $msgid_hdr = $mail->get_msgid();
# $msgid is the generated hash messageid
# $msgid_hdr is the Message-Id header
-ok($msgid eq '4cf5cc4d53b22e94d3e55932a606b18641a54041@sa_generated');
+ok($msgid eq '71f849915d7e469ddc1890cd8175f6876843f99e@sa_generated');
ok($msgid_hdr eq '9PS291LhupY');
ok(getimpl->{store}->tie_db_writable());