You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2005/12/15 03:27:20 UTC
svn commit: r356971 [1/2] - in /spamassassin/branches/tvd-evaltoplugin: ./
lib/Mail/SpamAssassin/ lib/Mail/SpamAssassin/Plugin/ rules/
Author: felicity
Date: Wed Dec 14 18:27:14 2005
New Revision: 356971
URL: http://svn.apache.org/viewcvs?rev=356971&view=rev
Log:
first pass at moving EvalTests.pm into various different plugins
Added:
spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/Bayes.pm
spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/BodyEval.pm
spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/DNSEval.pm
spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/HTMLEval.pm
spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm
spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/MIMEEval.pm
spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/RelayEval.pm
spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/URIEval.pm
spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/WLBLEval.pm
spamassassin/branches/tvd-evaltoplugin/rules/v320.pre
Removed:
spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/EvalTests.pm
Modified:
spamassassin/branches/tvd-evaltoplugin/MANIFEST
spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Bayes.pm
spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Conf.pm
spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Constants.pm
spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Dns.pm
spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/PerMsgStatus.pm
spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Util.pm
Modified: spamassassin/branches/tvd-evaltoplugin/MANIFEST
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/MANIFEST?rev=356971&r1=356970&r2=356971&view=diff
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/MANIFEST (original)
+++ spamassassin/branches/tvd-evaltoplugin/MANIFEST Wed Dec 14 18:27:14 2005
@@ -46,7 +46,6 @@
lib/Mail/SpamAssassin/DBBasedAddrList.pm
lib/Mail/SpamAssassin/Dns.pm
lib/Mail/SpamAssassin/DnsResolver.pm
-lib/Mail/SpamAssassin/EvalTests.pm
lib/Mail/SpamAssassin/HTML.pm
lib/Mail/SpamAssassin/Locales.pm
lib/Mail/SpamAssassin/Locker.pm
@@ -71,19 +70,28 @@
lib/Mail/SpamAssassin/Plugin/AccessDB.pm
lib/Mail/SpamAssassin/Plugin/AntiVirus.pm
lib/Mail/SpamAssassin/Plugin/AutoLearnThreshold.pm
+lib/Mail/SpamAssassin/Plugin/Bayes.pm
+lib/Mail/SpamAssassin/Plugin/BodyEval.pm
lib/Mail/SpamAssassin/Plugin/DCC.pm
+lib/Mail/SpamAssassin/Plugin/DNSEval.pm
lib/Mail/SpamAssassin/Plugin/DomainKeys.pm
+lib/Mail/SpamAssassin/Plugin/HTMLEval.pm
lib/Mail/SpamAssassin/Plugin/Hashcash.pm
+lib/Mail/SpamAssassin/Plugin/HeaderEval.pm
+lib/Mail/SpamAssassin/Plugin/MIMEEval.pm
lib/Mail/SpamAssassin/Plugin/MIMEHeader.pm
lib/Mail/SpamAssassin/Plugin/Pyzor.pm
lib/Mail/SpamAssassin/Plugin/Razor2.pm
lib/Mail/SpamAssassin/Plugin/RelayCountry.pm
+lib/Mail/SpamAssassin/Plugin/RelayEval.pm
lib/Mail/SpamAssassin/Plugin/ReplaceTags.pm
lib/Mail/SpamAssassin/Plugin/SPF.pm
lib/Mail/SpamAssassin/Plugin/SpamCop.pm
lib/Mail/SpamAssassin/Plugin/Test.pm
lib/Mail/SpamAssassin/Plugin/TextCat.pm
lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
+lib/Mail/SpamAssassin/Plugin/URIEval.pm
+lib/Mail/SpamAssassin/Plugin/WLBLEval.pm
lib/Mail/SpamAssassin/Plugin/WhiteListSubject.pm
lib/Mail/SpamAssassin/PluginHandler.pm
lib/Mail/SpamAssassin/Reporter.pm
Modified: spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Bayes.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Bayes.pm?rev=356971&r1=356970&r2=356971&view=diff
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Bayes.pm (original)
+++ spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Bayes.pm Wed Dec 14 18:27:14 2005
@@ -670,8 +670,10 @@
return 0 unless $self->{use_ignores};
- my $ignore = $PMS->check_from_in_list('bayes_ignore_from')
- || $PMS->check_to_in_list('bayes_ignore_to');
+ my $ig_from = $self->{main}->call_plugins ("check_wb_list", { permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' });
+ my $ig_to = $self->{main}->call_plugins ("check_wb_list", { permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' });
+
+ my $ignore = $ig_from || $ig_to;
dbg("bayes: not using bayes, bayes_ignore_from or _to rule") if $ignore;
Modified: spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Conf.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Conf.pm?rev=356971&r1=356970&r2=356971&view=diff
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Conf.pm (original)
+++ spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Conf.pm Wed Dec 14 18:27:14 2005
@@ -254,300 +254,6 @@
=back
-=head2 WHITELIST AND BLACKLIST OPTIONS
-
-=over 4
-
-=item whitelist_from add@ress.com
-
-Used to specify addresses which send mail that is often tagged (incorrectly) as
-spam. If you want to whitelist your own domain, be aware that spammers will
-often impersonate the domain of the recipient. The recommended solution is to
-instead use C<whitelist_from_rcvd> as explained below.
-
-Whitelist and blacklist addresses are now file-glob-style patterns, so
-C<fr...@somewhere.com>, C<*...@isp.com>, or C<*.domain.net> will all work.
-Specifically, C<*> and C<?> are allowed, but all other metacharacters are not.
-Regular expressions are not used for security reasons.
-
-Multiple addresses per line, separated by spaces, is OK. Multiple
-C<whitelist_from> lines is also OK.
-
-The headers checked for whitelist addresses are as follows: if C<Resent-From>
-is set, use that; otherwise check all addresses taken from the following
-set of headers:
-
- Envelope-Sender
- Resent-Sender
- X-Envelope-From
- From
-
-In addition, the "envelope sender" data, taken from the SMTP envelope
-data where this is available, is looked up.
-
-e.g.
-
- whitelist_from joe@example.com fred@example.com
- whitelist_from *@example.com
-
-=cut
-
- push (@cmds, {
- setting => 'whitelist_from',
- type => $CONF_TYPE_ADDRLIST
- });
-
-=item unwhitelist_from add@ress.com
-
-Used to override a default whitelist_from entry, so for example a distribution
-whitelist_from can be overridden in a local.cf file, or an individual user can
-override a whitelist_from entry in their own C<user_prefs> file.
-The specified email address has to match exactly the address previously
-used in a whitelist_from line.
-
-e.g.
-
- unwhitelist_from joe@example.com fred@example.com
- unwhitelist_from *@example.com
-
-=cut
-
- push (@cmds, {
- command => 'unwhitelist_from',
- setting => 'whitelist_from',
- code => \&Mail::SpamAssassin::Conf::Parser::remove_addrlist_value
- });
-
-=item whitelist_from_rcvd addr@lists.sourceforge.net sourceforge.net
-
-Use this to supplement the whitelist_from addresses with a check against the
-Received headers. The first parameter is the address to whitelist, and the
-second is a string to match the relay's rDNS.
-
-This string is matched against the reverse DNS lookup used during the handover
-from the internet to your internal network's mail exchangers. It can
-either be the full hostname, or the domain component of that hostname. In
-other words, if the host that connected to your MX had an IP address that
-mapped to 'sendinghost.spamassassin.org', you should specify
-C<sendinghost.spamassassin.org> or just C<spamassassin.org> here.
-
-Note that this requires that C<internal_networks> be correct. For simple cases,
-it will be, but for a complex network, or running with DNS checks off
-or with C<-L>, you may get better results by setting that parameter.
-
-e.g.
-
- whitelist_from_rcvd joe@example.com example.com
- whitelist_from_rcvd *@axkit.org sergeant.org
-
-=item def_whitelist_from_rcvd addr@lists.sourceforge.net sourceforge.net
-
-Same as C<whitelist_from_rcvd>, but used for the default whitelist entries
-in the SpamAssassin distribution. The whitelist score is lower, because
-these are often targets for spammer spoofing.
-
-=cut
-
- push (@cmds, {
- setting => 'whitelist_from_rcvd',
- code => sub {
- my ($self, $key, $value, $line) = @_;
- unless (defined $value && $value !~ /^$/) {
- return $MISSING_REQUIRED_VALUE;
- }
- unless ($value =~ /^\S+\s+\S+$/) {
- return $INVALID_VALUE;
- }
- $self->{parser}->add_to_addrlist_rcvd ('whitelist_from_rcvd',
- split(/\s+/, $value));
- }
- });
-
- push (@cmds, {
- setting => 'def_whitelist_from_rcvd',
- code => sub {
- my ($self, $key, $value, $line) = @_;
- unless (defined $value && $value !~ /^$/) {
- return $MISSING_REQUIRED_VALUE;
- }
- unless ($value =~ /^\S+\s+\S+$/) {
- return $INVALID_VALUE;
- }
- $self->{parser}->add_to_addrlist_rcvd ('def_whitelist_from_rcvd',
- split(/\s+/, $value));
- }
- });
-
-=item whitelist_allows_relays add@ress.com
-
-Specify addresses which are in C<whitelist_from_rcvd> that sometimes
-send through a mail relay other than the listed ones. By default mail
-with a From address that is in C<whitelist_from_rcvd> that does not match
-the relay will trigger a forgery rule. Including the address in
-C<whitelist_allows_relay> prevents that.
-
-Whitelist and blacklist addresses are now file-glob-style patterns, so
-C<fr...@somewhere.com>, C<*...@isp.com>, or C<*.domain.net> will all work.
-Specifically, C<*> and C<?> are allowed, but all other metacharacters are not.
-Regular expressions are not used for security reasons.
-
-Multiple addresses per line, separated by spaces, is OK. Multiple
-C<whitelist_allows_relays> lines is also OK.
-
-The specified email address does not have to match exactly the address
-previously used in a whitelist_from_rcvd line as it is compared to the
-address in the header.
-
-e.g.
-
- whitelist_allows_relays joe@example.com fred@example.com
- whitelist_allows_relays *@example.com
-
-=cut
-
- push (@cmds, {
- setting => 'whitelist_allows_relays',
- type => $CONF_TYPE_ADDRLIST
- });
-
-=item unwhitelist_from_rcvd add@ress.com
-
-Used to override a default whitelist_from_rcvd entry, so for example a
-distribution whitelist_from_rcvd can be overridden in a local.cf file,
-or an individual user can override a whitelist_from_rcvd entry in
-their own C<user_prefs> file.
-
-The specified email address has to match exactly the address previously
-used in a whitelist_from_rcvd line.
-
-e.g.
-
- unwhitelist_from_rcvd joe@example.com fred@example.com
- unwhitelist_from_rcvd *@axkit.org
-
-=cut
-
- push (@cmds, {
- setting => 'unwhitelist_from_rcvd',
- code => sub {
- my ($self, $key, $value, $line) = @_;
- unless (defined $value && $value !~ /^$/) {
- return $MISSING_REQUIRED_VALUE;
- }
- unless ($value =~ /^(?:\S+(?:\s+\S+)*)$/) {
- return $INVALID_VALUE;
- }
- $self->{parser}->remove_from_addrlist_rcvd('whitelist_from_rcvd',
- split (/\s+/, $value));
- $self->{parser}->remove_from_addrlist_rcvd('def_whitelist_from_rcvd',
- split (/\s+/, $value));
- }
- });
-
-=item blacklist_from add@ress.com
-
-Used to specify addresses which send mail that is often tagged (incorrectly) as
-non-spam, but which the user doesn't want. Same format as C<whitelist_from>.
-
-=cut
-
- push (@cmds, {
- setting => 'blacklist_from',
- type => $CONF_TYPE_ADDRLIST
- });
-
-=item unblacklist_from add@ress.com
-
-Used to override a default blacklist_from entry, so for example a
-distribution blacklist_from can be overridden in a local.cf file, or
-an individual user can override a blacklist_from entry in their own
-C<user_prefs> file. The specified email address has to match exactly
-the address previously used in a blacklist_from line.
-
-
-e.g.
-
- unblacklist_from joe@example.com fred@example.com
- unblacklist_from *@spammer.com
-
-=cut
-
-
- push (@cmds, {
- command => 'unblacklist_from',
- setting => 'blacklist_from',
- code => \&Mail::SpamAssassin::Conf::Parser::remove_addrlist_value
- });
-
-
-=item whitelist_to add@ress.com
-
-If the given address appears as a recipient in the message headers
-(Resent-To, To, Cc, obvious envelope recipient, etc.) the mail will
-be whitelisted. Useful if you're deploying SpamAssassin system-wide,
-and don't want some users to have their mail filtered. Same format
-as C<whitelist_from>.
-
-There are three levels of To-whitelisting, C<whitelist_to>, C<more_spam_to>
-and C<all_spam_to>. Users in the first level may still get some spammish
-mails blocked, but users in C<all_spam_to> should never get mail blocked.
-
-The headers checked for whitelist addresses are as follows: if C<Resent-To> or
-C<Resent-Cc> are set, use those; otherwise check all addresses taken from the
-following set of headers:
-
- To
- Cc
- Apparently-To
- Delivered-To
- Envelope-Recipients
- Apparently-Resent-To
- X-Envelope-To
- Envelope-To
- X-Delivered-To
- X-Original-To
- X-Rcpt-To
- X-Real-To
-
-=item more_spam_to add@ress.com
-
-See above.
-
-=item all_spam_to add@ress.com
-
-See above.
-
-=cut
-
- push (@cmds, {
- setting => 'whitelist_to',
- type => $CONF_TYPE_ADDRLIST
- });
- push (@cmds, {
- setting => 'more_spam_to',
- type => $CONF_TYPE_ADDRLIST
- });
- push (@cmds, {
- setting => 'all_spam_to',
- type => $CONF_TYPE_ADDRLIST
- });
-
-=item blacklist_to add@ress.com
-
-If the given address appears as a recipient in the message headers
-(Resent-To, To, Cc, obvious envelope recipient, etc.) the mail will
-be blacklisted. Same format as C<blacklist_from>.
-
-=cut
-
-
- push (@cmds, {
- setting => 'blacklist_to',
- type => $CONF_TYPE_ADDRLIST
- });
-
-=back
-
=head2 BASIC MESSAGE TAGGING OPTIONS
=over 4
Modified: spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Constants.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Constants.pm?rev=356971&r1=356970&r2=356971&view=diff
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Constants.pm (original)
+++ spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Constants.pm Wed Dec 14 18:27:14 2005
@@ -37,6 +37,7 @@
META_TEST_MIN_PRIORITY HARVEST_DNSBL_PRIORITY MBX_SEPARATOR
MAX_BODY_LINE_LENGTH MAX_HEADER_KEY_LENGTH MAX_HEADER_VALUE_LENGTH
MAX_HEADER_LENGTH ARITH_EXPRESSION_LEXER AI_TIME_UNKNOWN
+ CHARSETS_LIKELY_TO_FP_AS_CAPS
);
%EXPORT_TAGS = (
@@ -196,5 +197,10 @@
# second pass (when the message is actually read + processed) the received
# date is calculated. this value signifies "unknown" from the first pass.
use constant AI_TIME_UNKNOWN => 0;
+
+# Charsets which use capital letters heavily in their encoded representation.
+use constant CHARSETS_LIKELY_TO_FP_AS_CAPS => qr{[-_a-z0-9]*(?:
+ koi|jp|jis|euc|gb|big5|isoir|cp1251|georgianps|pt154|tis
+ )[-_a-z0-9]*}ix;
1;
Modified: spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Dns.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Dns.pm?rev=356971&r1=356970&r2=356971&view=diff
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Dns.pm (original)
+++ spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Dns.pm Wed Dec 14 18:27:14 2005
@@ -735,4 +735,12 @@
###########################################################################
+# interface called by SPF plugin
+sub check_for_from_dns {
+ my ($self, $pms) = @_;
+ if (defined $pms->{sender_host_fail}) {
+ return ($pms->{sender_host_fail} == 2); # both MX and A need to fail
+ }
+}
+
1;
Modified: spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=356971&r1=356970&r2=356971&view=diff
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/PerMsgStatus.pm Wed Dec 14 18:27:14 2005
@@ -51,7 +51,6 @@
use Carp;
use Mail::SpamAssassin::Constants qw(:sa);
-use Mail::SpamAssassin::EvalTests;
use Mail::SpamAssassin::Conf;
use Mail::SpamAssassin::Util;
use Mail::SpamAssassin::Logger;
Added: spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/Bayes.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/Bayes.pm?rev=356971&view=auto
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/Bayes.pm (added)
+++ spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/Bayes.pm Wed Dec 14 18:27:14 2005
@@ -0,0 +1,70 @@
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+
+package Mail::SpamAssassin::Plugin::Bayes;
+
+use Mail::SpamAssassin::Plugin;
+use strict;
+use warnings;
+use bytes;
+
+use vars qw(@ISA);
+@ISA = qw(Mail::SpamAssassin::Plugin);
+
+# constructor: register the eval rule
+sub new {
+ my $class = shift;
+ my $mailsaobject = shift;
+
+ # some boilerplate...
+ $class = ref($class) || $class;
+ my $self = $class->SUPER::new($mailsaobject);
+ bless ($self, $class);
+
+ # the important bit!
+ $self->register_eval_rule("check_bayes");
+
+ return $self;
+}
+
+sub check_bayes {
+ my ($self, $pms, $fulltext, $min, $max) = @_;
+
+ return 0 if (!$pms->{conf}->{use_bayes} || !$pms->{conf}->{use_bayes_rules});
+
+ if (!exists ($self->{bayes_score})) {
+ $self->{bayes_score} = $self->{main}->{bayes_scanner}->scan ($pms, $pms->{msg});
+ }
+
+ if (defined $self->{bayes_score} &&
+ ($min == 0 || $self->{bayes_score} > $min) &&
+ ($max eq "undef" || $self->{bayes_score} <= $max))
+ {
+ if ($pms->{conf}->{detailed_bayes_score}) {
+ $pms->test_log(sprintf ("score: %3.4f, hits: %s",
+ $self->{bayes_score},
+ $self->{bayes_hits}));
+ }
+ else {
+ $pms->test_log(sprintf ("score: %3.4f", $self->{bayes_score}));
+ }
+ return 1;
+ }
+
+ return 0;
+}
+
+1;
Added: spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/BodyEval.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/BodyEval.pm?rev=356971&view=auto
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/BodyEval.pm (added)
+++ spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/BodyEval.pm Wed Dec 14 18:27:14 2005
@@ -0,0 +1,214 @@
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+
+package Mail::SpamAssassin::Plugin::BodyEval;
+
+use Mail::SpamAssassin::Plugin;
+use Mail::SpamAssassin::Constants qw(:sa);
+
+use strict;
+use warnings;
+use bytes;
+
+use vars qw(@ISA);
+@ISA = qw(Mail::SpamAssassin::Plugin);
+
+# constructor: register the eval rule
+sub new {
+ my $class = shift;
+ my $mailsaobject = shift;
+
+ # some boilerplate...
+ $class = ref($class) || $class;
+ my $self = $class->SUPER::new($mailsaobject);
+ bless ($self, $class);
+
+ # the important bit!
+ $self->register_eval_rule("check_unique_words");
+ $self->register_eval_rule("multipart_alternative_difference");
+ $self->register_eval_rule("multipart_alternative_difference_count");
+ $self->register_eval_rule("check_blank_line_ratio");
+
+ return $self;
+}
+
+sub check_unique_words {
+ my ($self, undef, $body, $m, $b) = @_;
+
+ if (!defined $self->{unique_words_repeat}) {
+ $self->{unique_words_repeat} = 0;
+ $self->{unique_words_unique} = 0;
+ my %count;
+ for (@$body) {
+ # copy to avoid changing @$body
+ my $line = $_;
+ # from tokenize_line in Bayes.pm
+ $line =~ tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs;
+ $line =~ s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs;
+ $line =~ s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs;
+ $line =~ s/(?:^|\.\s+)([A-Z])([^A-Z]+)(?:\s|$)/ ' '.(lc $1).$2.' '/ge;
+ for my $token (split(' ', $line)) {
+ $count{$token}++;
+ }
+ }
+ $self->{unique_words_unique} = scalar grep { $_ == 1 } values(%count);
+ $self->{unique_words_repeat} = scalar keys(%count) - $self->{unique_words_unique};
+ }
+
+ # y = mx+b where y is number of unique words needed
+ my $unique = $self->{unique_words_unique};
+ my $repeat = $self->{unique_words_repeat};
+ my $y = ($unique + $repeat) * $m + $b;
+ return ($unique > $y);
+}
+
+sub multipart_alternative_difference {
+ my ($self, $pms, $fulltext, $min, $max) = @_;
+
+ $self->_multipart_alternative_difference($pms->{msg}) unless (exists $self->{madiff});
+
+ if (($min == 0 || $self->{madiff} > $min) &&
+ ($max eq "undef" || $self->{madiff} <= $max)) {
+ return 1;
+ }
+ return 0;
+}
+
+sub multipart_alternative_difference_count {
+ my ($self, $pms, $fulltext, $ratio, $minhtml) = @_;
+ $self->_multipart_alternative_difference($pms->{msg}) unless (exists $self->{madiff});
+ return 0 unless $self->{madiff_html} > $minhtml;
+ return(($self->{madiff_text} / $self->{madiff_html}) > $ratio);
+}
+
+sub _multipart_alternative_difference {
+ my ($self, $msg) = @_;
+ $self->{madiff} = 0;
+ $self->{madiff_html} = 0;
+ $self->{madiff_text} = 0;
+
+ # Find all multipart/alternative parts in the message
+ my @ma = $msg->find_parts(qr@^multipart/alternative\b@i);
+
+ # If there are no multipart/alternative sections, skip this test.
+ return if (!@ma);
+
+ # Figure out what the MIME content of the message looks like
+ my @content = $msg->content_summary();
+
+ # Exchange meeting requests come in as m/a text/html text/calendar,
+ # which we want to ignore because of the high FP rate it would cause.
+ #
+ if (@content == 3 && $content[2] eq 'text/calendar' &&
+ $content[1] eq 'text/html' &&
+ $content[0] eq 'multipart/alternative') {
+ return;
+ }
+
+ # Go through each of the multipart parts
+ foreach my $part (@ma) {
+ my %html = ();
+ my %text = ();
+
+ # limit our search to text-based parts
+ my @txt = $part->find_parts(qr@^text\b@i);
+ foreach my $text (@txt) {
+ # we only care about the rendered version of the part
+ my ($type, $rnd) = $text->rendered();
+
+ # parse the rendered text into tokens. assume they are whitespace
+ # separated, and ignore anything that doesn't have a word-character
+ # in it (0-9a-zA-Z_) since those are probably things like bullet
+ # points, horizontal lines, etc. this assumes that punctuation
+ # in one part will be the same in other parts.
+ #
+ if ($type eq 'text/html') {
+ foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
+ #dbg("eval: HTML: $w");
+ $html{$w}++;
+ }
+
+ # If there are no words, mark if there's at least 1 image ...
+ if (keys %html == 0 && exists $self->{html}{inside}{img}) {
+ # Use "\n" as the mark since it can't ever occur normally
+ $html{"\n"}=1;
+ }
+ }
+ else {
+ foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
+ #dbg("eval: TEXT: $w");
+ $text{$w}++;
+ }
+ }
+ }
+
+ # How many HTML tokens do we have at the start?
+ my $orig = keys %html;
+ next if ($orig == 0);
+
+ $self->{madiff_html} = $orig;
+ $self->{madiff_text} = keys %text;
+ dbg('eval: text words: ' . $self->{madiff_text} . ', html words: ' . $self->{madiff_html});
+
+ # If the token appears at least as many times in the text part as
+ # in the html part, remove it from the list of html tokens.
+ while(my ($k,$v) = each %text) {
+ delete $html{$k} if (exists $html{$k} && $html{$k}-$text{$k} < 1);
+ }
+
+ #map { dbg("eval: LEFT: $_") } keys %html;
+
+ # In theory, the tokens should be the same in both text and html
+ # parts, so there would be 0 tokens left in the html token list, for
+ # a 0% difference rate. Calculate it here, and record the difference
+ # if it's been the highest so far in this message.
+ my $diff = scalar(keys %html)/$orig*100;
+ $self->{madiff} = $diff if ($diff > $self->{madiff});
+
+ dbg("eval: " . sprintf "madiff: left: %d, orig: %d, max-difference: %0.2f%%", scalar(keys %html), $orig, $self->{madiff});
+ }
+
+ return;
+}
+
+
+sub check_blank_line_ratio {
+ my ($self, $pms, $fulltext, $min, $max, $minlines) = @_;
+
+ if (!defined $minlines || $minlines < 1) {
+ $minlines = 1;
+ }
+
+ if (! exists $self->{blank_line_ratio}->{$minlines}) {
+ $fulltext = $pms->get_decoded_body_text_array();
+ my ($blank) = 0;
+ if (scalar @{$fulltext} >= $minlines) {
+ foreach my $line (@{$fulltext}) {
+ next if ($line =~ /\S/);
+ $blank++;
+ }
+ $self->{blank_line_ratio}->{$minlines} = 100 * $blank / scalar @{$fulltext};
+ }
+ else {
+ $self->{blank_line_ratio}->{$minlines} = -1; # don't report if it's a blank message ...
+ }
+ }
+
+ return (($min == 0 && $self->{blank_line_ratio}->{$minlines} <= $max) ||
+ ($self->{blank_line_ratio}->{$minlines} > $min &&
+ $self->{blank_line_ratio}->{$minlines} <= $max));
+}
+1;
Added: spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/DNSEval.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/DNSEval.pm?rev=356971&view=auto
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/DNSEval.pm (added)
+++ spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/DNSEval.pm Wed Dec 14 18:27:14 2005
@@ -0,0 +1,349 @@
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+
+package Mail::SpamAssassin::Plugin::DNSEval;
+
+use Mail::SpamAssassin::Plugin;
+use Mail::SpamAssassin::Constants qw(:ip);
+
+use strict;
+use warnings;
+use bytes;
+
+use vars qw(@ISA);
+@ISA = qw(Mail::SpamAssassin::Plugin);
+
+# constructor: register the eval rule
+sub new {
+ my $class = shift;
+ my $mailsaobject = shift;
+
+ # some boilerplate...
+ $class = ref($class) || $class;
+ my $self = $class->SUPER::new($mailsaobject);
+ bless ($self, $class);
+
+ # the important bit!
+ $self->register_eval_rule("check_rbl_accreditor");
+ $self->register_eval_rule("check_rbl");
+ $self->register_eval_rule("check_rbl_txt");
+ $self->register_eval_rule("check_rbl_sub");
+ $self->register_eval_rule("check_rbl_results_for");
+ $self->register_eval_rule("check_rbl_from_host");
+ $self->register_eval_rule("check_rbl_envfrom");
+ $self->register_eval_rule("check_dns_sender");
+
+ return $self;
+}
+
+sub ip_list_uniq_and_strip_private {
+ my ($self, @origips) = @_;
+ my @ips = ();
+ my %seen = ();
+ my $IP_PRIVATE = IP_PRIVATE;
+ foreach my $ip (@origips) {
+ next unless $ip;
+ next if (exists ($seen{$ip})); $seen{$ip} = 1;
+ next if ($ip =~ /$IP_PRIVATE/o);
+ push(@ips, $ip);
+ }
+ return @ips;
+}
+
+# check an RBL if the message contains an "accreditor assertion,"
+# that is, the message contains the name of a service that will vouch
+# for their practices.
+#
+sub check_rbl_accreditor {
+ my ($self, $pms, $rule, $set, $rbl_server, $subtest, $accreditor) = @_;
+
+ if (!defined $self->{accreditor_tag}) {
+ $self->message_accreditor_tag($pms);
+ }
+ if ($self->{accreditor_tag}->{$accreditor}) {
+ $self->check_rbl_backend($pms, $rule, $set, $rbl_server, 'A', $subtest);
+ }
+ return 0;
+}
+
+# Check for an Accreditor Assertion within the message, that is, the name of
+# a third-party who will vouch for the sender's practices. The accreditor
+# can be asserted in the EnvelopeFrom like this:
+#
+# listowner@a--accreditor.mail.example.com
+#
+# or in an 'Accreditor" Header field, like this:
+#
+# Accreditor: accreditor1, parm=value; accreditor2, parm-value
+#
+# This implementation supports multiple accreditors, but ignores any
+# parameters in the header field.
+#
+sub message_accreditor_tag {
+ my ($self, $pms) = @_;
+ my %acctags;
+
+ if ($pms->get('EnvelopeFrom:addr') =~ /[@.]a--([a-z0-9]{3,})\./i) {
+ (my $tag = $1) =~ tr/A-Z/a-z/;
+ $acctags{$tag} = -1;
+ }
+ my $accreditor_field = $pms->get('Accreditor');
+ if (defined($accreditor_field)) {
+ my @accreditors = split(/,/, $accreditor_field);
+ foreach my $accreditor (@accreditors) {
+ my @terms = split(' ', $accreditor);
+ if ($#terms >= 0) {
+ my $tag = $terms[0];
+ $tag =~ tr/A-Z/a-z/;
+ $acctags{$tag} = -1;
+ }
+ }
+ }
+ $self->{accreditor_tag} = \%acctags;
+}
+
+sub check_rbl_backend {
+ my ($self, $pms, $rule, $set, $rbl_server, $type, $subtest) = @_;
+ local ($_);
+
+ # First check that DNS is available, if not do not perform this check
+ return 0 if $self->{main}->{conf}->{skip_rbl_checks};
+ return 0 unless $pms->is_dns_available();
+ $pms->load_resolver();
+
+ if (($rbl_server !~ /^(\d+)\.(\d+)\.(\d+)\.(\d+)$/) &&
+ (index($rbl_server, '.') >= 0) &&
+ ($rbl_server !~ /\.$/)) {
+ $rbl_server .= ".";
+ }
+
+ dbg("dns: checking RBL $rbl_server, set $set");
+
+ # ok, make a list of all the IPs in the untrusted set
+ my @fullips = map { $_->{ip} } @{$pms->{relays_untrusted}};
+
+ # now, make a list of all the IPs in the external set, for use in
+ # notfirsthop testing. this will often be more IPs than found
+ # in @fullips. It includes the IPs that are trusted, but
+ # not in internal_networks.
+ my @fullexternal = map {
+ (!$_->{internal}) ? ($_->{ip}) : ()
+ } @{$pms->{relays_trusted}};
+ push (@fullexternal, @fullips); # add untrusted set too
+
+ # Make sure a header significantly improves results before adding here
+ # X-Sender-Ip: could be worth using (very low occurance for me)
+ # X-Sender: has a very low bang-for-buck for me
+ my $IP_ADDRESS = IP_ADDRESS;
+ my @originating = ();
+ for my $header ('X-Originating-IP', 'X-Apparently-From') {
+ my $str = $pms->get($header);
+ next unless $str;
+ push (@originating, ($str =~ m/($IP_ADDRESS)/g));
+ }
+
+ # Let's go ahead and trim away all private ips (KLC)
+ # also uniq the list and strip dups. (jm)
+ my @ips = $self->ip_list_uniq_and_strip_private(@fullips);
+
+ # if there's no untrusted IPs, it means we trust all the open-internet
+ # relays, so we can return right now.
+ return 0 unless (scalar @ips + scalar @originating > 0);
+
+ dbg("dns: IPs found: full-external: ".join(", ", @fullexternal).
+ " untrusted: ".join(", ", @ips).
+ " originating: ".join(", ", @originating));
+
+ my $trusted = $self->{main}->{conf}->{trusted_networks};
+
+ if (scalar @ips + scalar @originating > 0) {
+ # If name is foo-notfirsthop, check all addresses except for
+ # the originating one. Suitable for use with dialup lists, like the PDL.
+ # note that if there's only 1 IP in the untrusted set, do NOT pop the
+ # list, since it'd remove that one, and a legit user is supposed to
+ # use their SMTP server (ie. have at least 1 more hop)!
+ # If name is foo-lastexternal, check only the Received header just before
+ # it enters our internal networks; we can trust it and it's the one that
+ # passed mail between networks
+ if ($set =~ /-(notfirsthop|lastexternal)$/)
+ {
+ # use the external IP set, instead of the trusted set; the user may have
+ # specified some third-party relays as trusted. Also, don't use
+ # @originating; those headers are added by a phase of relaying through
+ # a server like Hotmail, which is not going to be in dialup lists anyway.
+ @ips = $self->ip_list_uniq_and_strip_private(@fullexternal);
+ if ($1 eq "lastexternal") {
+ @ips = (defined $ips[0]) ? ($ips[0]) : ();
+ } else {
+ pop @ips if (scalar @ips > 1);
+ }
+ }
+ # If name is foo-firsttrusted, check only the Received header just
+ # after it enters our trusted networks; that's the only one we can
+ # trust the IP address from (since our relay added that header).
+ # And if name is foo-untrusted, check any untrusted IP address.
+ elsif ($set =~ /-(first|un)trusted$/)
+ {
+ my @tips = ();
+ foreach my $ip (@originating) {
+ if ($ip && !$trusted->contains_ip($ip)) {
+ push(@tips, $ip);
+ }
+ }
+ @ips = $self->ip_list_uniq_and_strip_private (@ips, @tips);
+ if ($1 eq "first") {
+ @ips = (defined $ips[0]) ? ($ips[0]) : ();
+ } else {
+ shift @ips;
+ }
+ }
+ else
+ {
+ my @tips = ();
+ foreach my $ip (@originating) {
+ if ($ip && !$trusted->contains_ip($ip)) {
+ push(@tips, $ip);
+ }
+ }
+ # add originating IPs as untrusted IPs (if they are untrusted)
+ @ips = reverse $self->ip_list_uniq_and_strip_private (@ips, @tips);
+
+ # How many IPs max you check in the received lines
+ my $checklast=$self->{main}->{conf}->{num_check_received};
+
+ if (scalar @ips > $checklast) {
+ splice (@ips, $checklast); # remove all others
+ }
+ }
+ }
+ dbg("dns: only inspecting the following IPs: ".join(", ", @ips));
+
+ eval {
+ foreach my $ip (@ips) {
+ next unless ($ip =~ /(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})/);
+ $pms->do_rbl_lookup($rule, $set, $type, $rbl_server,
+ "$4.$3.$2.$1.$rbl_server", $subtest);
+ }
+ };
+
+ # note that results are not handled here, hits are handled directly
+ # as DNS responses are harvested
+ return 0;
+}
+
+sub check_rbl {
+ my ($self, $pms, $rule, $set, $rbl_server, $subtest) = @_;
+ $self->check_rbl_backend($pms, $rule, $set, $rbl_server, 'A', $subtest);
+}
+
+sub check_rbl_txt {
+ my ($self, $pms, $rule, $set, $rbl_server, $subtest) = @_;
+ $self->check_rbl_backend($pms, $rule, $set, $rbl_server, 'TXT', $subtest);
+}
+
+# run for first message
+sub check_rbl_sub {
+ my ($self, $pms, $rule, $set, $subtest) = @_;
+
+ return 0 if $self->{main}->{conf}->{skip_rbl_checks};
+ return 0 unless $pms->is_dns_available();
+
+ $pms->register_rbl_subtest($rule, $set, $subtest);
+}
+
+# backward compatibility
+sub check_rbl_results_for {
+ #warn "dns: check_rbl_results_for() is deprecated, use check_rbl_sub()\n";
+ check_rbl_sub(@_);
+}
+
+# this only checks the address host name and not the domain name because
+# using the domain name had much worse results for dsn.rfc-ignorant.org
+sub check_rbl_from_host {
+ _check_rbl_addresses(@_, $_[0]->all_from_addrs());
+}
+
+# this only checks the address host name and not the domain name because
+# using the domain name had much worse results for dsn.rfc-ignorant.org
+sub check_rbl_envfrom {
+ _check_rbl_addresses(@_, $_[0]->get('EnvelopeFrom:addr'));
+}
+
+sub _check_rbl_addresses {
+ my ($self, $pms, $rule, $set, $rbl_server, @addresses) = @_;
+
+ return 0 if $self->{main}->{conf}->{skip_rbl_checks};
+ return 0 unless $pms->is_dns_available();
+
+ my %hosts;
+ for my $address (@addresses) {
+ if ($address =~ m/\@(\S+\.\S+)/) {
+ $hosts{lc($1)} = 1;
+ }
+ }
+ return unless scalar keys %hosts;
+
+ $pms->load_resolver();
+
+ if (($rbl_server !~ /^(\d+)\.(\d+)\.(\d+)\.(\d+)$/) &&
+ (index($rbl_server, '.') >= 0) &&
+ ($rbl_server !~ /\.$/)) {
+ $rbl_server .= ".";
+ }
+ dbg("dns: _check_rbl_addresses RBL $rbl_server, set $set");
+
+ for my $host (keys %hosts) {
+ $pms->do_rbl_lookup($rule, $set, 'A', $rbl_server, "$host.$rbl_server");
+ }
+}
+
+sub check_dns_sender {
+ my ($self, $pms, $rule) = @_;
+
+ my $host;
+ for my $from ($pms->get('EnvelopeFrom:addr')) {
+ next unless defined $from;
+
+ $from =~ tr/././s; # bug 3366
+ if ($from =~ /\@(\S+\.\S+)/) {
+ $host = lc($1);
+ last;
+ }
+ }
+ return 0 unless defined $host;
+
+ # First check that DNS is available, if not do not perform this check
+ # TODO: need a way to skip DNS checks as a whole in configuration
+ return 0 unless $pms->is_dns_available();
+ $pms->load_resolver();
+
+ if ($host eq 'compiling.spamassassin.taint.org') {
+ # only used when compiling
+ return 0;
+ }
+
+ dbg("dns: checking A and MX for host $host");
+
+ $pms->do_dns_lookup($rule, 'A', $host);
+ $pms->do_dns_lookup($rule, 'MX', $host);
+
+ # cache name of host for later checking
+ $pms->{sender_host} = $host;
+
+ return 0;
+}
+
+1;
Added: spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/HTMLEval.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/HTMLEval.pm?rev=356971&view=auto
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/HTMLEval.pm (added)
+++ spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/HTMLEval.pm Wed Dec 14 18:27:14 2005
@@ -0,0 +1,188 @@
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+
+package Mail::SpamAssassin::Plugin::HTMLEval;
+
+use Mail::SpamAssassin::Plugin;
+use strict;
+use warnings;
+use bytes;
+
+use vars qw(@ISA);
+@ISA = qw(Mail::SpamAssassin::Plugin);
+
+# constructor: register the eval rule
+sub new {
+ my $class = shift;
+ my $mailsaobject = shift;
+
+ # some boilerplate...
+ $class = ref($class) || $class;
+ my $self = $class->SUPER::new($mailsaobject);
+ bless ($self, $class);
+
+ # the important bit!
+ $self->register_eval_rule("html_tag_balance");
+ $self->register_eval_rule("html_image_only");
+ $self->register_eval_rule("html_image_ratio");
+ $self->register_eval_rule("html_charset_faraway");
+ $self->register_eval_rule("html_tag_exists");
+ $self->register_eval_rule("html_test");
+ $self->register_eval_rule("html_eval");
+ $self->register_eval_rule("html_text_match");
+ $self->register_eval_rule("html_title_subject_ratio");
+ $self->register_eval_rule("html_text_not_match");
+ $self->register_eval_rule("html_range");
+
+ return $self;
+}
+
+sub html_tag_balance {
+ my ($self, $pms, undef, $rawtag, $rawexpr) = @_;
+ $rawtag =~ /^([a-zA-Z0-9]+)$/; my $tag = $1;
+ $rawexpr =~ /^([\<\>\=\!\-\+ 0-9]+)$/; my $expr = $1;
+
+ return 0 unless exists $pms->{html}{inside}{$tag};
+
+ $pms->{html}{inside}{$tag} =~ /^([\<\>\=\!\-\+ 0-9]+)$/;
+ my $val = $1;
+ return eval "\$val $expr";
+}
+
+sub html_image_only {
+ my ($self, $pms, undef, $min, $max) = @_;
+
+ return (exists $pms->{html}{inside}{img} &&
+ exists $pms->{html}{length} &&
+ $pms->{html}{length} > $min &&
+ $pms->{html}{length} <= $max);
+}
+
+sub html_image_ratio {
+ my ($self, $pms, undef, $min, $max) = @_;
+
+ return 0 unless (exists $pms->{html}{non_space_len} &&
+ exists $pms->{html}{image_area} &&
+ $pms->{html}{image_area} > 0);
+ my $ratio = $pms->{html}{non_space_len} / $pms->{html}{image_area};
+ return ($ratio > $min && $ratio <= $max);
+}
+
+sub html_charset_faraway {
+ my ($self, $pms) = @_;
+
+ return 0 unless exists $pms->{html}{charsets};
+
+ my @locales = Mail::SpamAssassin::Util::get_my_locales($pms->{conf}->{ok_locales});
+ return 0 if grep { $_ eq "all" } @locales;
+
+ my $okay = 0;
+ my $bad = 0;
+ for my $c (split(' ', $pms->{html}{charsets})) {
+ if (Mail::SpamAssassin::Locales::is_charset_ok_for_locales($c, @locales)) {
+ $okay++;
+ }
+ else {
+ $bad++;
+ }
+ }
+ return ($bad && ($bad >= $okay));
+}
+
+sub html_tag_exists {
+ my ($self, $pms, undef, $tag) = @_;
+ return exists $pms->{html}{inside}{$tag};
+}
+
+sub html_test {
+ my ($self, $pms, undef, $test) = @_;
+ return $pms->{html}{$test};
+}
+
+sub html_eval {
+ my ($self, $pms, undef, $test, $rawexpr) = @_;
+ $rawexpr =~ /^([\<\>\=\!\-\+ 0-9]+)$/; my $expr = $1;
+
+ # workaround bug 3320: wierd perl bug where additional, very explicit
+ # untainting into a new var is required.
+ my $tainted = $pms->{html}{$test};
+ return unless defined($tainted);
+ $tainted =~ /^(.*)$/; my $val = $1;
+
+ # just use the value in $val, don't copy it needlessly
+ return eval "\$val $expr";
+}
+
+sub html_text_match {
+ my ($self, $pms, undef, $text, $regexp) = @_;
+ for my $string (@{ $pms->{html}{$text} }) {
+ if (defined $string && $string =~ /${regexp}/) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+sub html_title_subject_ratio {
+ my ($self, $pms, undef, $ratio) = @_;
+
+ my $subject = $pms->get('Subject');
+ if (! $subject) {
+ return 0;
+ }
+ my $max = 0;
+ for my $string (@{ $pms->{html}{title} }) {
+ if ($string) {
+ my $ratio = length($string) / length($subject);
+ $max = $ratio if $ratio > $max;
+ }
+ }
+ return $max > $ratio;
+}
+
+sub html_text_not_match {
+ my ($self, $pms, undef, $text, $regexp) = @_;
+ for my $string (@{ $pms->{html}{$text} }) {
+ if (defined $string && $string !~ /${regexp}/) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+sub html_range {
+ my ($self, $pms, undef, $test, $min, $max) = @_;
+
+ return 0 unless exists $pms->{html}{$test};
+
+ $test = $pms->{html}{$test};
+
+ # not all perls understand what "inf" means, so we need to do
+ # non-numeric tests! urg!
+ if (!defined $max || $max eq "inf") {
+ return ($test eq "inf") ? 1 : ($test > $min);
+ }
+ elsif ($test eq "inf") {
+ # $max < inf, so $test == inf means $test > $max
+ return 0;
+ }
+ else {
+ # if we get here everything should be a number
+ return ($test > $min && $test <= $max);
+ }
+}
+
+1;
Added: spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm?rev=356971&view=auto
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm (added)
+++ spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm Wed Dec 14 18:27:14 2005
@@ -0,0 +1,1107 @@
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+
+package Mail::SpamAssassin::Plugin::HeaderEval;
+
+use Mail::SpamAssassin::Plugin;
+use Mail::SpamAssassin::Constants qw(:sa :ip);
+
+use strict;
+use warnings;
+use bytes;
+
+use vars qw(@ISA);
+@ISA = qw(Mail::SpamAssassin::Plugin);
+
+# constructor: register the eval rule
+sub new {
+ my $class = shift;
+ my $mailsaobject = shift;
+
+ # some boilerplate...
+ $class = ref($class) || $class;
+ my $self = $class->SUPER::new($mailsaobject);
+ bless ($self, $class);
+
+ # the important bit!
+ $self->register_eval_rule("check_for_fake_aol_relay_in_rcvd");
+ $self->register_eval_rule("check_for_faraway_charset_in_headers");
+ $self->register_eval_rule("check_for_unique_subject_id");
+ $self->register_eval_rule("word_is_in_dictionary");
+ $self->register_eval_rule("check_illegal_chars");
+ $self->register_eval_rule("check_for_forged_hotmail_received_headers");
+ $self->register_eval_rule("check_for_no_hotmail_received_headers");
+ $self->register_eval_rule("check_for_msn_groups_headers");
+ $self->register_eval_rule("check_for_forged_eudoramail_received_headers");
+ $self->register_eval_rule("check_for_forged_yahoo_received_headers");
+ $self->register_eval_rule("check_for_forged_juno_received_headers");
+ $self->register_eval_rule("check_for_from_to_same");
+ $self->register_eval_rule("check_for_matching_env_and_hdr_from");
+ $self->register_eval_rule("sorted_recipients");
+ $self->register_eval_rule("similar_recipients");
+ $self->register_eval_rule("check_for_missing_to_header");
+ $self->register_eval_rule("check_for_forged_gw05_received_headers");
+ $self->register_eval_rule("check_for_round_the_world_received_helo");
+ $self->register_eval_rule("check_for_round_the_world_received_revdns");
+ $self->register_eval_rule("check_for_shifted_date");
+ $self->register_eval_rule("received_within_months");
+ $self->register_eval_rule("subject_is_all_caps");
+ $self->register_eval_rule("check_for_to_in_subject");
+ $self->register_eval_rule("check_outlook_message_id");
+ $self->register_eval_rule("check_messageid_not_usable");
+ $self->register_eval_rule("check_header_count_range");
+ $self->register_eval_rule("check_unresolved_template");
+ $self->register_eval_rule("check_ratware_name_id");
+ $self->register_eval_rule("check_ratware_envelope_from");
+
+ return $self;
+}
+
+# sad but true. sort it out, sysadmins!
+my $CCTLDS_WITH_LOTS_OF_OPEN_RELAYS = qr{(?:kr|cn|cl|ar|hk|il|th|tw|sg|za|tr|ma|ua|in|pe|br)};
+my $ROUND_THE_WORLD_RELAYERS = qr{(?:net|com|ca)};
+
+sub check_for_fake_aol_relay_in_rcvd {
+ my ($self, $pms) = @_;
+ local ($_);
+
+ $_ = $pms->get('Received'); s/\s/ /gs;
+
+ # this is the hostname format used by AOL for their relays. Spammers love
+ # forging it. Don't make it more specific to match aol.com only, though --
+ # there's another set of spammers who generate fake hostnames to go with
+ # it!
+ if (/ rly-[a-z][a-z]\d\d\./i) {
+ return 0 if /\/AOL-\d+\.\d+\.\d+\)/; # via AOL mail relay
+ return 0 if /ESMTP id (?:RELAY|MAILRELAY|MAILIN)/; # AOLish
+ return 1;
+ }
+
+# spam: Received: from unknown (HELO mta05bw.bigpond.com) (80.71.176.130) by
+# rly-xw01.mx.aol.com with QMQP; Sat, 15 Jun 2002 23:37:16 -0000
+
+# non: Received: from rly-xj02.mx.aol.com (rly-xj02.mail.aol.com [172.20.116.39]) by
+# omr-r05.mx.aol.com (v83.35) with ESMTP id RELAYIN7-0501132011; Wed, 01
+# May 2002 13:20:11 -0400
+
+# non: Received: from logs-tr.proxy.aol.com (logs-tr.proxy.aol.com [152.163.201.132])
+# by rly-ip01.mx.aol.com (8.8.8/8.8.8/AOL-5.0.0)
+# with ESMTP id NAA08955 for <sa...@yahoogroups.com>;
+# Thu, 4 Apr 2002 13:11:20 -0500 (EST)
+
+ return 0;
+}
+
+sub check_for_faraway_charset_in_headers {
+ my ($self, $pms) = @_;
+ my $hdr;
+
+ my @locales = Mail::SpamAssassin::Util::get_my_locales($self->{main}->{conf}->{ok_locales});
+
+ return 0 if grep { $_ eq "all" } @locales;
+
+ for my $h (qw(From Subject)) {
+ my @hdrs = $pms->get("$h:raw");
+ if ($#hdrs >= 0) {
+ $hdr = join(" ", @hdrs);
+ } else {
+ $hdr = '';
+ }
+ while ($hdr =~ /=\?(.+?)\?.\?.*?\?=/g) {
+ Mail::SpamAssassin::Locales::is_charset_ok_for_locales($1, @locales)
+ or return 1;
+ }
+ }
+ 0;
+}
+
+sub check_for_unique_subject_id {
+ my ($self, $pms) = @_;
+ local ($_);
+ $_ = lc $pms->get('Subject');
+ study;
+
+ my $id = 0;
+ if (/[-_\.\s]{7,}([-a-z0-9]{4,})$/
+ || /\s{10,}(?:\S\s)?(\S+)$/
+ || /\s{3,}[-:\#\(\[]+([-a-z0-9]{4,})[\]\)]+$/
+ || /\s{3,}[:\#\(\[]*([a-f0-9]{4,})[\]\)]*$/
+ || /\s{3,}[-:\#]([a-z0-9]{5,})$/
+ || /[\s._]{3,}([^0\s._]\d{3,})$/
+ || /[\s._]{3,}\[(\S+)\]$/
+
+ # (7217vPhZ0-478TLdy5829qicU9-0@26) and similar
+ || /\(([-\w]{7,}\@\d+)\)$/
+
+ # Seven or more digits at the end of a subject is almost certainly a id
+ || /\b(\d{7,})\s*$/
+
+ # stuff at end of line after "!" or "?" is usually an id
+ || /[!\?]\s*(\d{4,}|\w+(-\w+)+)\s*$/
+
+ # 9095IPZK7-095wsvp8715rJgY8-286-28 and similar
+ # excluding 'Re:', etc and the first word
+ || /(?:\w{2,3}:\s)?\w+\s+(\w{7,}-\w{7,}(-\w+)*)\s*$/
+
+ # #30D7 and similar
+ || /\s#\s*([a-f0-9]{4,})\s*$/
+ )
+ {
+ $id = $1;
+ # exempt online purchases
+ if ($id =~ /\d{5,}/
+ && /(?:item|invoice|order|number|confirmation).{1,6}\Q$id\E\s*$/)
+ {
+ $id = 0;
+ }
+
+ # for the "foo-bar-baz" case, otherwise it won't
+ # be found in the dict:
+ $id =~ s/-//;
+ }
+
+ return ($id && !$self->word_is_in_dictionary($id));
+}
+
+# word_is_in_dictionary()
+#
+# See if the word looks like an English word, by checking if each triplet
+# of letters it contains is one that can be found in the English language.
+# Does not include triplets only found in proper names, or in the Latin
+# and Greek terms that might be found in a larger dictionary
+
+my %triplets = ();
+my $triplets_loaded = 0;
+
+sub word_is_in_dictionary {
+ my ($self, $word) = @_;
+ local ($_);
+ local $/ = "\n"; # Ensure $/ is set appropriately
+
+ # $word =~ tr/A-Z/a-z/; # already done by this stage
+ $word =~ s/^\s+//;
+ $word =~ s/\s+$//;
+
+ # If it contains a digit, dash, etc, it's not a valid word.
+ # Don't reject words like "can't" and "I'll"
+ return 0 if ($word =~ /[^a-z\']/);
+
+ # handle a few common "blah blah blah (comment)" styles
+ return 1 if ($word eq "ot"); # off-topic
+ return 1 if ($word =~ /(?:linux|nix|bsd)/); # not in most dicts
+ return 1 if ($word =~ /(?:whew|phew|attn|tha?nx)/); # not in most dicts
+
+ my $word_len = length($word);
+
+ # Unique IDs probably aren't going to be only one or two letters long
+ return 1 if ($word_len < 3);
+
+ if (!$triplets_loaded) {
+ # take a copy to avoid modifying the real one
+ my @default_triplets_path = @Mail::SpamAssassin::default_rules_path;
+ @default_triplets_path = map { s,$,/triplets.txt,; $_; }
+ @default_triplets_path;
+ my $filename = $self->{main}->first_existing_path (@default_triplets_path);
+
+ if (!defined $filename) {
+ dbg("eval: failed to locate the triplets.txt file");
+ return 1;
+ }
+
+ if (!open (TRIPLETS, "<$filename")) {
+ dbg("eval: failed to open '$filename', cannot check dictionary");
+ return 1;
+ }
+
+ while(<TRIPLETS>) {
+ chomp;
+ $triplets{$_} = 1;
+ }
+ close(TRIPLETS);
+
+ $triplets_loaded = 1;
+ } # if (!$triplets_loaded)
+
+
+ my $i;
+
+ for ($i = 0; $i < ($word_len - 2); $i++) {
+ my $triplet = substr($word, $i, 3);
+ if (!$triplets{$triplet}) {
+ dbg("eval: unique ID: letter triplet '$triplet' from word '$word' not valid");
+ return 0;
+ }
+ } # for ($i = 0; $i < ($word_len - 2); $i++)
+
+ # All letter triplets in word were found to be valid
+ return 1;
+}
+
+# look for 8-bit and other illegal characters that should be MIME
+# encoded, these might want to exempt languages that do not use
+# Latin-based alphabets, but only if the user wants it that way
+sub check_illegal_chars {
+ my ($self, $pms, $header, $ratio, $count) = @_;
+
+ $header .= ":raw" unless ($header eq "ALL" || $header =~ /:raw$/);
+ my $str = $pms->get($header);
+ return 0 unless $str;
+
+ # avoid overlap between tests
+ if ($header eq "ALL") {
+ # fix continuation lines, then remove Subject and From
+ $str =~ s/\n[ \t]+/ /gs;
+ $str =~ s/^(?:Subject|From):.*$//gm;
+ }
+
+ # count illegal substrings (RFC 2045)
+ my $illegal = () = ($str =~ /[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]/g);
+
+ # minor exemptions for Subject
+ if ($header eq "Subject:raw") {
+ # only exempt a single cent sign, pound sign, or registered sign
+ my $exempt = () = ($str =~ /[\xa2\xa3\xae]/g);
+ $illegal-- if $exempt == 1;
+ }
+
+ return 0 if (length($str) == 0);
+ return (($illegal / length($str)) >= $ratio && $illegal >= $count);
+}
+
+# ezmlm has a very bad habit of removing Received: headers! bad ezmlm.
+#
+sub gated_through_received_hdr_remover {
+ my ($self, $pms) = @_;
+
+ my $txt = $pms->get("Mailing-List");
+ if (defined $txt && $txt =~ /^contact \S+\@\S+\; run by ezmlm$/) {
+ my $dlto = $pms->get("Delivered-To");
+ my $rcvd = $pms->get("Received");
+
+ # ensure we have other indicative headers too
+ if ($dlto =~ /^mailing list \S+\@\S+/ &&
+ $rcvd =~ /qmail \d+ invoked (?:from network|by .{3,20})\); \d+ ... \d+/)
+ {
+ return 1;
+ }
+ }
+
+ if ($pms->get("Received") !~ /\S/) {
+ # we have no Received headers! These tests cannot run in that case
+ return 1;
+ }
+
+ # MSN groups removes Received lines. thanks MSN
+ if ($pms->get("Received") =~ /from groups\.msn\.com \(\S+\.msn\.com /) {
+ return 1;
+ }
+
+ return 0;
+}
+
+# FORGED_HOTMAIL_RCVD
+sub _check_for_forged_hotmail_received_headers {
+ my ($self, $pms) = @_;
+
+ if (defined $self->{hotmail_addr_but_no_hotmail_received}) { return; }
+
+ $self->{hotmail_addr_with_forged_hotmail_received} = 0;
+ $self->{hotmail_addr_but_no_hotmail_received} = 0;
+
+ my $rcvd = $pms->get('Received');
+ $rcvd =~ s/\s+/ /gs; # just spaces, simplify the regexp
+
+ return if ($rcvd =~
+ /from mail pickup service by hotmail\.com with Microsoft SMTPSVC;/);
+
+ # Microsoft passes Hotmail mail directly to MSN Group servers.
+ return if $self->check_for_msn_groups_headers($pms);
+
+ my $ip = $pms->get('X-Originating-Ip');
+ my $IP_ADDRESS = IP_ADDRESS;
+
+ if ($ip =~ /$IP_ADDRESS/) { $ip = 1; } else { $ip = 0; }
+
+ # Hotmail formats its received headers like this:
+ # Received: from hotmail.com (f135.law8.hotmail.com [216.33.241.135])
+ # spammers do not ;)
+
+ if ($self->gated_through_received_hdr_remover($pms)) { return; }
+
+ if ($rcvd =~ /from \S*hotmail.com \(\S+\.hotmail(?:\.msn)?\.com[ \)]/ && $ip)
+ { return; }
+ if ($rcvd =~ /from \S+ by \S+\.hotmail(?:\.msn)?\.com with HTTP\;/ && $ip)
+ { return; }
+ if ($rcvd =~ /from \[66\.218.\S+\] by \S+\.yahoo\.com/ && $ip)
+ { return; }
+
+ if ($rcvd =~ /(?:from |HELO |helo=)\S*hotmail\.com\b/) {
+ # HELO'd as hotmail.com, despite not being hotmail
+ $self->{hotmail_addr_with_forged_hotmail_received} = 1;
+ } else {
+ # check to see if From claimed to be @hotmail.com
+ my $from = $pms->get('From:addr');
+ if ($from !~ /hotmail.com/) { return; }
+ $self->{hotmail_addr_but_no_hotmail_received} = 1;
+ }
+}
+
+# FORGED_HOTMAIL_RCVD
+sub check_for_forged_hotmail_received_headers {
+ my ($self, $pms) = @_;
+ $self->_check_for_forged_hotmail_received_headers($pms);
+ return $self->{hotmail_addr_with_forged_hotmail_received};
+}
+
+# SEMIFORGED_HOTMAIL_RCVD
+sub check_for_no_hotmail_received_headers {
+ my ($self, $pms) = @_;
+ $self->_check_for_forged_hotmail_received_headers($pms);
+ return $self->{hotmail_addr_but_no_hotmail_received};
+}
+
+# MSN_GROUPS
+sub check_for_msn_groups_headers {
+ my ($self, $pms) = @_;
+
+ return 0 unless ($pms->get('To') =~ /<(\S+)\@groups\.msn\.com>/i);
+ my $listname = $1;
+
+ # from Theo Van Dinter, see
+ # http://www.hughes-family.org/bugzilla/show_bug.cgi?id=591
+ # Updated by DOS, based on messages from Bob Menschel, bug 4301
+
+ return 0 unless $pms->get('Received') =~ /from mail pickup service by ((?:p\d\d\.)groups\.msn\.com)\b/;
+ my $server = $1;
+
+ if ($listname =~ /^notifications$/) {
+ return 0 unless $pms->get('Message-Id') =~ /^<\S+\@$server>/;
+ } else {
+ return 0 unless $pms->get('Message-Id') =~ /^<$listname-\S+\@groups\.msn\.com>/;
+ return 0 unless $pms->get('EnvelopeFrom') =~ /$listname-bounce\@groups\.msn\.com/;
+ }
+ return 1;
+
+# MSN Groups
+# Return-path: <Li...@groups.msn.com>
+# Received: from groups.msn.com (tk2dcpuba02.msn.com [65.54.195.210]) by
+# dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g72K35v10457 for
+# <zz...@jmason.org>; Fri, 2 Aug 2002 21:03:05 +0100
+# Received: from mail pickup service by groups.msn.com with Microsoft
+# SMTPSVC; Fri, 2 Aug 2002 13:01:30 -0700
+# Message-id: <Li...@groups.msn.com>
+# X-loop: notifications@groups.msn.com
+# Reply-to: "List Full Name" <Li...@groups.msn.com>
+# To: "List Full Name" <Li...@groups.msn.com>
+
+# Return-path: <Li...@groups.msn.com>
+# Received: from p04.groups.msn.com ([65.54.195.216]) etc...
+# Received: from mail pickup service by p04.groups.msn.com with Microsoft SMTPSVC;
+# Thu, 5 May 2005 20:30:37 -0700
+# X-Originating-Ip: 207.68.170.30
+# From: =?iso-8859-1?B?IqSj4/D9pEbzeN9s9vLw6qQiIA==?=<zz...@hotmail.com>
+# To: "Managers of List Name" <no...@groups.msn.com>
+# Subject: =?iso-8859-1?Q?APPROVAL_NEEDED:_=A4=A3=E3=F0=FD=A4F=F3x=DFl?=
+# =?iso-8859-1?Q?=F6=F2=F0=EA=A4_applied_to_join_List_Name=2C?=
+# =?iso-8859-1?Q?_an_MSN_Group?=
+# Date: Thu, 5 May 2005 20:30:37 -0700
+# MIME-Version: 1.0
+# Content-Type: multipart/alternative;
+# boundary="----=_NextPart_000_333944_01C551B1.4BBA02B0"
+# X-MimeOLE: Produced By Microsoft MimeOLE V5.50.4927.1200
+# Message-ID: <TK...@p04.groups.msn.com>
+
+# Return-path: <Li...@groups.msn.com>
+# Received: from [65.54.208.83] (helo=p05.groups.msn.com) etc...
+# Received: from mail pickup service by p05.groups.msn.com with Microsoft SMTPSVC;
+# Fri, 6 May 2005 14:59:25 -0700
+# X-Originating-Ip: 207.68.170.30
+# Message-Id: <Li...@groups.msn.com>
+# Reply-To: "List Name" <Li...@groups.msn.com>
+# From: "whoever" <zz...@hotmail.com>
+# To: "List Name" <Li...@groups.msn.com>
+# Subject: whatever
+# Date: Fri, 6 May 2005 14:59:25 -0700
+
+}
+
+###########################################################################
+
+sub check_for_forged_eudoramail_received_headers {
+ my ($self, $pms) = @_;
+
+ my $from = $pms->get('From:addr');
+ if ($from !~ /eudoramail.com/) { return 0; }
+
+ my $rcvd = $pms->get('Received');
+ $rcvd =~ s/\s+/ /gs; # just spaces, simplify the regexp
+
+ my $ip = $pms->get('X-Sender-Ip');
+ my $IP_ADDRESS = IP_ADDRESS;
+ if ($ip =~ /$IP_ADDRESS/) { $ip = 1; } else { $ip = 0; }
+
+ # Eudoramail formats its received headers like this:
+ # Received: from Unknown/Local ([?.?.?.?]) by shared1-mail.whowhere.com;
+ # Thu Nov 29 13:44:25 2001
+ # Message-Id: <JG...@shared1-mail.whowhere.com>
+ # Organization: QUALCOMM Eudora Web-Mail (http://www.eudoramail.com:80)
+ # X-Sender-Ip: 192.175.21.146
+ # X-Mailer: MailCity Service
+
+ if ($self->gated_through_received_hdr_remover($pms)) { return 0; }
+
+ if ($rcvd =~ /by \S*whowhere.com\;/ && $ip) { return 0; }
+
+ return 1;
+}
+
+###########################################################################
+
+sub check_for_forged_yahoo_received_headers {
+ my ($self, $pms) = @_;
+
+ my $from = $pms->get('From:addr');
+ if ($from !~ /yahoo\.com$/) { return 0; }
+
+ my $rcvd = $pms->get('Received');
+
+ if ($pms->get("Resent-From") && $pms->get("Resent-To")) {
+ my $xrcvd = $pms->get("X-Received");
+ $rcvd = $xrcvd if $xrcvd;
+ }
+ $rcvd =~ s/\s+/ /gs; # just spaces, simplify the regexp
+
+ # not sure about this
+ #if ($rcvd !~ /from \S*yahoo\.com/) { return 0; }
+
+ if ($self->gated_through_received_hdr_remover($pms)) { return 0; }
+
+ # bug 3740: ignore bounces from Yahoo!. only honoured if the
+ # correct rDNS shows up in the trusted relay list, or first untrusted relay
+ if ($from eq 'MAILER-DAEMON@yahoo.com' &&
+ ($pms->{relays_trusted_str} =~ / rdns=\S+\.yahoo\.com /
+ || $pms->{relays_untrusted_str} =~ /^[^\]]+ rdns=\S+\.yahoo\.com /))
+ { return 0; }
+
+ if ($rcvd =~ /by web\S+\.mail\S*\.yahoo\.com via HTTP/) { return 0; }
+ if ($rcvd =~ /by smtp\S+\.yahoo\.com with SMTP/) { return 0; }
+ my $IP_ADDRESS = IP_ADDRESS;
+ if ($rcvd =~
+ /from \[$IP_ADDRESS\] by \S+\.(?:groups|scd|dcn)\.yahoo\.com with NNFMP/) {
+ return 0;
+ }
+
+ # used in "forward this news item to a friend" links. There's no better
+ # received hdrs to match on, unfortunately. I'm not sure if the next test is
+ # still useful, as a result.
+ #
+ # search for msgid <20...@xent.com>, subject "Yahoo!
+ # News Story - Top Stories", date Sep 29 2002 on
+ # <http://xent.com/pipermail/fork/> for an example.
+ #
+ if ($rcvd =~ /\bmailer\d+\.bulk\.scd\.yahoo\.com\b/
+ && $from =~ /\@reply\.yahoo\.com$/) { return 0; }
+
+ if ($rcvd =~ /by \w+\.\w+\.yahoo\.com \(\d+\.\d+\.\d+\/\d+\.\d+\.\d+\)(?: with ESMTP)? id \w+/) {
+ # possibly sent from "mail this story to a friend"
+ return 0;
+ }
+
+ return 1;
+}
+
+sub check_for_forged_juno_received_headers {
+ my ($self, $pms) = @_;
+
+ my $from = $pms->get('From:addr');
+ if($from !~ /\bjuno.com/) { return 0; }
+
+ if($self->gated_through_received_hdr_remover($pms)) { return 0; }
+
+ my $xmailer = $pms->get('X-Mailer');
+ my $xorig = $pms->get('X-Originating-IP');
+ my $rcvd = $pms->get('Received');
+ my $IP_ADDRESS = IP_ADDRESS;
+
+ if (!$xorig) { # New style Juno has no X-Originating-IP header, and other changes
+ if($rcvd !~ /from.*\b(?:juno|untd)\.com.*[\[\(]$IP_ADDRESS[\]\)].*by/
+ && $rcvd !~ / cookie\.(?:juno|untd)\.com /) { return 1; }
+ if($xmailer !~ /Juno /) { return 1; }
+ } else {
+ if($rcvd !~ /from.*\bmail\.com.*\[$IP_ADDRESS\].*by/) { return 1; }
+ if($xorig !~ /$IP_ADDRESS/) { return 1; }
+ if($xmailer !~ /\bmail\.com/) { return 1; }
+ }
+
+ return 0;
+}
+
+# From and To have same address, but are not exactly the same and
+# neither contains intermediate spaces.
+sub check_for_from_to_same {
+ my ($self, $pms) = @_;
+
+ my $hdr_from = $pms->get('From');
+ my $hdr_to = $pms->get('To');
+ return 0 if (!length($hdr_from) || !length($hdr_to) ||
+ $hdr_from eq $hdr_to);
+
+ my $addr_from = $pms->get('From:addr');
+ my $addr_to = $pms->get('To:addr');
+ # BUG: From:addr and To:addr sometimes contain whitespace
+ $addr_from =~ s/\s+//g;
+ $addr_to =~ s/\s+//g;
+ return 0 if (!length($addr_from) || !length($addr_to) ||
+ $addr_from ne $addr_to);
+
+ if ($hdr_from =~ /^\s*\S+\s*$/ && $hdr_to =~ /^\s*\S+\s*$/) {
+ return 1;
+ }
+}
+
+sub check_for_matching_env_and_hdr_from {
+ my ($self, $pms) =@_;
+ # two blank headers match so don't bother checking
+ return (lc $pms->get('EnvelopeFrom:addr') eq lc $pms->get('From:addr'));
+}
+
+sub sorted_recipients {
+ my ($self, $pms) = @_;
+
+ if (!exists $self->{tocc_sorted}) {
+ $self->_check_recipients($pms);
+ }
+ return $self->{tocc_sorted};
+}
+
+sub similar_recipients {
+ my ($self, $pms, $min, $max) = @_;
+
+ if (!exists $self->{tocc_similar}) {
+ $self->_check_recipients($pms);
+ }
+ return (($min eq 'undef' || $self->{tocc_similar} >= $min) &&
+ ($max eq 'undef' || $self->{tocc_similar} < $max));
+}
+
+# best experimentally derived values
+use constant TOCC_SORTED_COUNT => 7;
+use constant TOCC_SIMILAR_COUNT => 5;
+use constant TOCC_SIMILAR_LENGTH => 2;
+
+sub _check_recipients {
+ my ($self, $pms) = @_;
+
+ my @inputs;
+
+ # ToCc: pseudo-header works best, but sometimes Bcc: is better
+ for ('ToCc', 'Bcc') {
+ my $to = $pms->get($_); # get recipients
+ $to =~ s/\(.*?\)//g; # strip out the (comments)
+ push(@inputs, ($to =~ m/([\w.=-]+\@\w+(?:[\w.-]+\.)+\w+)/g));
+ last if scalar(@inputs) >= TOCC_SIMILAR_COUNT;
+ }
+
+ # remove duplicate addresses only when they appear next to each other
+ my @address;
+ my $previous = '';
+ while (my $current = shift @inputs) {
+ push(@address, ($previous = $current)) if lc($current) ne lc($previous);
+ last if @address == 256;
+ }
+
+ # ideas that had both poor S/O ratios and poor hit rates:
+ # - testing for reverse sorted recipient lists
+ # - testing To: and Cc: headers separately
+ $self->{tocc_sorted} = (scalar(@address) >= TOCC_SORTED_COUNT &&
+ join(',', @address) eq (join(',', sort @address)));
+
+ # a good S/O ratio and hit rate is achieved by comparing 2-byte
+ # substrings and requiring 5 or more addresses
+ $self->{tocc_similar} = 0;
+ if (scalar (@address) >= TOCC_SIMILAR_COUNT) {
+ my @user = map { substr($_,0,TOCC_SIMILAR_LENGTH) } @address;
+ my @fqhn = map { m/\@(.*)/ } @address;
+ my @host = map { substr($_,0,TOCC_SIMILAR_LENGTH) } @fqhn;
+ my $hits = 0;
+ my $combinations = 0;
+ for (my $i = 0; $i <= $#address; $i++) {
+ for (my $j = $i+1; $j <= $#address; $j++) {
+ $hits++ if $user[$i] eq $user[$j];
+ $hits++ if $host[$i] eq $host[$j] && $fqhn[$i] ne $fqhn[$j];
+ $combinations++;
+ }
+ }
+ $self->{tocc_similar} = $hits / $combinations;
+ }
+}
+
+sub check_for_missing_to_header {
+ my ($self, $pms) = @_;
+
+ my $hdr = $pms->get('To');
+ $hdr ||= $pms->get('Apparently-To');
+ return 1 if ($hdr eq '');
+
+ return 0;
+}
+
+sub check_for_forged_gw05_received_headers {
+ my ($self, $pms) = @_;
+ local ($_);
+
+ my $rcv = $pms->get('Received');
+
+ # e.g.
+ # Received: from mail3.icytundra.com by gw05 with ESMTP; Thu, 21 Jun 2001 02:28:32 -0400
+ my ($h1, $h2) = ($rcv =~
+ m/\nfrom\s(\S+)\sby\s(\S+)\swith\sESMTP\;\s+\S\S\S,\s+\d+\s+\S\S\S\s+
+ \d{4}\s+\d\d:\d\d:\d\d\s+[-+]*\d{4}\n$/xs);
+
+ if (defined ($h1) && defined ($h2) && $h2 !~ /\./) {
+ return 1;
+ }
+
+ 0;
+}
+
+sub _check_for_round_the_world_received {
+ my ($self, $pms) = @_;
+ my ($relayer, $relayerip, $relay);
+
+ $self->{round_the_world_revdns} = 0;
+ $self->{round_the_world_helo} = 0;
+ my $rcvd = $pms->get('Received');
+ my $IPV4_ADDRESS = IPV4_ADDRESS;
+
+ # TODO: use new Received header parser
+
+ # trad sendmail/postfix fmt:
+ # Received: from hitower.parkgroup.ru (unknown [212.107.207.26]) by
+ # mail.netnoteinc.com (Postfix) with ESMTP id B8CAC11410E for
+ # <me...@netnoteinc.com>; Fri, 30 Nov 2001 02:42:05 +0000 (Eire)
+ # Received: from fmx1.freemail.hu ([212.46.197.200]) by hitower.parkgroup.ru
+ # (Lotus Domino Release 5.0.8) with ESMTP id 2001113008574773:260 ;
+ # Fri, 30 Nov 2001 08:57:47 +1000
+ if ($rcvd =~ /
+ \nfrom\b.{0,20}\s(\S+\.${CCTLDS_WITH_LOTS_OF_OPEN_RELAYS})\s\(.{0,200}
+ \nfrom\b.{0,20}\s([-_A-Za-z0-9.]+)\s.{0,30}\[($IPV4_ADDRESS)\]
+ /osix) { $relay = $1; $relayer = $2; $relayerip = $3; goto gotone; }
+
+ return 0;
+
+gotone:
+ my $revdns = $pms->lookup_ptr ($relayerip);
+ if (!defined $revdns) { $revdns = '(unknown)'; }
+
+ dbg("eval: round-the-world: mail relayed through $relay by ".
+ "$relayerip (HELO $relayer, rev DNS says $revdns)");
+
+ if ($revdns =~ /\.${ROUND_THE_WORLD_RELAYERS}$/oi) {
+ dbg("eval: round-the-world: yep, I think so (from rev dns)");
+ $self->{round_the_world_revdns} = 1;
+ return;
+ }
+
+ if ($relayer =~ /\.${ROUND_THE_WORLD_RELAYERS}$/oi) {
+ dbg("eval: round-the-world: yep, I think so (from HELO)");
+ $self->{round_the_world_helo} = 1;
+ return;
+ }
+
+ dbg("eval: round-the-world: probably not");
+ return;
+}
+
+sub check_for_round_the_world_received_helo {
+ my ($self, $pms) = @_;
+ if (!defined $self->{round_the_world_helo}) {
+ $self->_check_for_round_the_world_received($pms);
+ }
+ if ($self->{round_the_world_helo}) { return 1; }
+ return 0;
+}
+
+sub check_for_round_the_world_received_revdns {
+ my ($self, $pms) = @_;
+ if (!defined $self->{round_the_world_revdns}) {
+ $self->_check_for_round_the_world_received($pms);
+ }
+ if ($self->{round_the_world_revdns}) { return 1; }
+ return 0;
+}
+
+###########################################################################
+
+sub check_for_shifted_date {
+ my ($self, $pms, $min, $max) = @_;
+
+ if (!exists $self->{date_diff}) {
+ $self->_check_date_diff($pms);
+ }
+ return (($min eq 'undef' || $self->{date_diff} >= (3600 * $min)) &&
+ ($max eq 'undef' || $self->{date_diff} < (3600 * $max)));
+}
+
+# filters out some false positives in old corpus mail - Allen
+sub received_within_months {
+ my ($self,$pms,$min,$max) = @_;
+
+ if (!exists($self->{date_received})) {
+ $self->_check_date_received($pms);
+ }
+ my $diff = time() - $self->{date_received};
+
+ # 365.2425 * 24 * 60 * 60 = 31556952 = seconds in year (including leap)
+
+ if (((! defined($min)) || ($min eq 'undef') ||
+ ($diff >= (31556952 * ($min/12)))) &&
+ ((! defined($max)) || ($max eq 'undef') ||
+ ($diff < (31556952 * ($max/12))))) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+sub _get_date_header_time {
+ my ($self, $pms) = @_;
+
+ my $time;
+ # a Resent-Date: header takes precedence over any Date: header
+ for my $header ('Resent-Date', 'Date') {
+ my $date = $pms->get($header);
+ if (defined($date) && length($date)) {
+ chomp($date);
+ $time = Mail::SpamAssassin::Util::parse_rfc822_date($date);
+ }
+ last if defined($time);
+ }
+ if (defined($time)) {
+ $self->{date_header_time} = $time;
+ }
+ else {
+ $self->{date_header_time} = undef;
+ }
+}
+
+sub _get_received_header_times {
+ my ($self, $pms) = @_;
+
+ $self->{received_header_times} = [ () ];
+ $self->{received_fetchmail_time} = undef;
+
+ my (@received);
+ my $received = $pms->get('Received');
+ if (defined($received) && length($received)) {
+ @received = grep {$_ =~ m/\S/} (split(/\n/,$received));
+ }
+ # if we have no Received: headers, chances are we're archived mail
+ # with a limited set of headers
+ if (!scalar(@received)) {
+ return;
+ }
+
+ # handle fetchmail headers
+ my (@local);
+ if (($received[0] =~
+ m/\bfrom (?:localhost\s|(?:\S+ ){1,2}\S*\b127\.0\.0\.1\b)/) ||
+ ($received[0] =~ m/qmail \d+ invoked by uid \d+/)) {
+ push @local, (shift @received);
+ }
+ if (scalar(@received) &&
+ ($received[0] =~ m/\bby localhost with \w+ \(fetchmail-[\d.]+/)) {
+ push @local, (shift @received);
+ }
+ elsif (scalar(@local)) {
+ unshift @received, (shift @local);
+ }
+
+ my $rcvd;
+
+ if (scalar(@local)) {
+ my (@fetchmail_times);
+ foreach $rcvd (@local) {
+ if ($rcvd =~ m/(\s.?\d+ \S\S\S \d+ \d+:\d+:\d+ \S+)/) {
+ my $date = $1;
+ dbg("eval: trying Received fetchmail header date for real time: $date");
+ my $time = Mail::SpamAssassin::Util::parse_rfc822_date($date);
+ if (defined($time) && (time() >= $time)) {
+ dbg("eval: time_t from date=$time, rcvd=$date");
+ push @fetchmail_times, $time;
+ }
+ }
+ }
+ if (scalar(@fetchmail_times) > 1) {
+ $self->{received_fetchmail_time} =
+ (sort {$b <=> $a} (@fetchmail_times))[0];
+ } elsif (scalar(@fetchmail_times)) {
+ $self->{received_fetchmail_time} = $fetchmail_times[0];
+ }
+ }
+
+ my (@header_times);
+ foreach $rcvd (@received) {
+ if ($rcvd =~ m/(\s.?\d+ \S\S\S \d+ \d+:\d+:\d+ \S+)/) {
+ my $date = $1;
+ dbg("eval: trying Received header date for real time: $date");
+ my $time = Mail::SpamAssassin::Util::parse_rfc822_date($date);
+ if (defined($time)) {
+ dbg("eval: time_t from date=$time, rcvd=$date");
+ push @header_times, $time;
+ }
+ }
+ }
+
+ if (scalar(@header_times)) {
+ $self->{received_header_times} = [ @header_times ];
+ } else {
+ dbg("eval: no dates found in Received headers");
+ }
+}
+
+sub _check_date_received {
+ my ($self, $pms) = @_;
+
+ my (@dates_poss);
+
+ $self->{date_received} = 0;
+
+ if (!exists($self->{date_header_time})) {
+ $self->_get_date_header_time($pms);
+ }
+
+ if (defined($self->{date_header_time})) {
+ push @dates_poss, $self->{date_header_time};
+ }
+
+ if (!exists($self->{received_header_times})) {
+ $self->_get_received_header_times($pms);
+ }
+ my (@received_header_times) = @{ $self->{received_header_times} };
+ if (scalar(@received_header_times)) {
+ push @dates_poss, $received_header_times[0];
+ }
+ if (defined($self->{received_fetchmail_time})) {
+ push @dates_poss, $self->{received_fetchmail_time};
+ }
+
+ if (defined($self->{date_header_time}) && scalar(@received_header_times)) {
+ if (!exists($self->{date_diff})) {
+ $self->_check_date_diff($pms);
+ }
+ push @dates_poss, $self->{date_header_time} - $self->{date_diff};
+ }
+
+ if (scalar(@dates_poss)) { # use median
+ $self->{date_received} = (sort {$b <=> $a}
+ (@dates_poss))[int($#dates_poss/2)];
+ dbg("eval: date chosen from message: " .
+ scalar(localtime($self->{date_received})));
+ } else {
+ dbg("eval: no dates found in message");
+ }
+}
+
+sub _check_date_diff {
+ my ($self, $pms) = @_;
+
+ $self->{date_diff} = 0;
+
+ if (!exists($self->{date_header_time})) {
+ $self->_get_date_header_time($pms);
+ }
+
+ if (!defined($self->{date_header_time})) {
+ return; # already have tests for this
+ }
+
+ if (!exists($self->{received_header_times})) {
+ $self->_get_received_header_times($pms);
+ }
+ my (@header_times) = @{ $self->{received_header_times} };
+
+ if (!scalar(@header_times)) {
+ return; # archived mail?
+ }
+
+ my (@diffs) = map {$self->{date_header_time} - $_} (@header_times);
+
+ # if the last Received: header has no difference, then we choose to
+ # exclude it
+ if ($#diffs > 0 && $diffs[$#diffs] == 0) {
+ pop(@diffs);
+ }
+
+ # use the date with the smallest absolute difference
+ # (experimentally, this results in the fewest false positives)
+ @diffs = sort { abs($a) <=> abs($b) } @diffs;
+ $self->{date_diff} = $diffs[0];
+}
+
+
+sub subject_is_all_caps {
+ my ($self, $pms) = @_;
+ my $subject = $pms->get('Subject');
+
+ $subject =~ s/^\s+//;
+ $subject =~ s/\s+$//;
+ return 0 if $subject !~ /\s/; # don't match one word subjects
+ return 0 if (length $subject < 10); # don't match short subjects
+ $subject =~ s/[^a-zA-Z]//g; # only look at letters
+
+ # now, check to see if the subject is encoded using a non-ASCII charset.
+ # If so, punt on this test to avoid FPs. We just list the known charsets
+ # this test will FP on, here.
+ my $subjraw = $pms->get('Subject:raw');
+ if ($subjraw =~ /=\?${Mail::SpamAssassin::Constants::CHARSETS_LIKELY_TO_FP_AS_CAPS}\?/i) {
+ return 0;
+ }
+
+ return length($subject) && ($subject eq uc($subject));
+}
+
+sub check_for_to_in_subject {
+ my ($self, $pms, $test) = @_;
+
+ my $full_to = $pms->get('To:addr');
+ return 0 unless $full_to;
+
+ my $subject = $pms->get('Subject');
+
+ if ($test eq "address") {
+ return $subject =~ /\b\Q$full_to\E\b/i; # "user@domain.com"
+ }
+ elsif ($test eq "user") {
+ my $to = $full_to;
+ $to =~ s/\@.*//;
+ return $subject =~ /^\s*\Q$to\E,\s/i; # "user,\s" case insensitive
+ }
+ return 0;
+}
+
+sub check_outlook_message_id {
+ my ($self, $pms) = @_;
+ local ($_);
+
+ my $id = $pms->get('MESSAGEID');
+ return 0 if $id !~ /^<[0-9a-f]{4}([0-9a-f]{8})\$[0-9a-f]{8}\$[0-9a-f]{8}\@/;
+
+ my $timetoken = hex($1);
+ my $x = 0.0023283064365387;
+ my $y = 27111902.8329849;
+
+ my $fudge = 250;
+
+ $_ = $pms->get('Date');
+ $_ = Mail::SpamAssassin::Util::parse_rfc822_date($_) || 0;
+ my $expected = int (($_ * $x) + $y);
+ my $diff = $timetoken - $expected;
+ return 0 if (abs($diff) < $fudge);
+
+ $_ = $self->get('Received');
+ /(\s.?\d+ \S\S\S \d+ \d+:\d+:\d+ \S+).*?$/;
+ $_ = Mail::SpamAssassin::Util::parse_rfc822_date($_) || 0;
+ $expected = int(($_ * $x) + $y);
+ $diff = $timetoken - $expected;
+
+ return (abs($diff) >= $fudge);
+}
+
+sub check_messageid_not_usable {
+ my ($self, $pms) = @_;
+ local ($_);
+
+ # Lyris eats message-ids. also some ezmlm, I think :(
+ $_ = $pms->get("List-Unsubscribe");
+ return 1 if (/<mailto:(?:leave-\S+|\S+-unsubscribe)\@\S+>$/);
+
+ # ezmlm again
+ if($self->gated_through_received_hdr_remover($pms)) { return 1; }
+
+ # Allen notes this as 'Wacky sendmail version?'
+ $_ = $pms->get("Received");
+ return 1 if /\/CWT\/DCE\)/;
+
+ # Apr 2 2003 jm: iPlanet rewrites lots of stuff, including Message-IDs
+ return 1 if /iPlanet Messaging Server/;
+
+ # too old; older versions of clients used different formats
+ return 1 if ($self->received_within_months('6','undef'));
+
+ return 0;
+}
+
+# Return true if the count of $hdr headers are within the given range
+sub check_header_count_range {
+ my ($self, $pms, $hdr, $min, $max) = @_;
+ my %uniq = ();
+ my @hdrs = grep(!$uniq{$_}++, $pms->{msg}->get_header ($hdr));
+ return (scalar @hdrs >= $min && scalar @hdrs <= $max);
+}
+
+sub check_unresolved_template {
+ my ($self, $pms) = @_;
+
+ my $all = $pms->get('ALL'); # cached access
+ $all =~ s/\n[ \t]+/ /gs; # fix continuation lines
+
+ for my $header (split(/\n/, $all)) {
+ # slightly faster to test in this order
+ if ($header =~ /%[A-Z][A-Z_-]/ &&
+ $header !~ /^(?:X-UIDL|X-Face|To|Cc|From|Subject|References|In-Reply-To|(?:X-|Resent-|X-Original-)?Message-Id):/i)
+ {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+sub check_ratware_name_id {
+ my ($self, $pms) = @_;
+
+ my $mid = $pms->get('MESSAGEID');
+ my $from = $pms->get('From');
+ if ($mid =~ m/<[A-Z]{28}\.([^>]+?)>/) {
+ if ($from =~ m/\"[^\"]+\"\s*<\Q$1\E>/) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+sub check_ratware_envelope_from {
+ my ($self, $pms) = @_;
+
+ my $to = $pms->get('To:addr');
+ my $from = $pms->get('EnvelopeFrom');
+
+ return 0 unless ($to && $from);
+ return 0 if ($from =~ /^SRS\d=/);
+
+ if ($to =~ /^([^@]+)@(.+)$/) {
+ my($user,$dom) = ($1,$2);
+ $dom = Mail::SpamAssassin::Util::RegistrarBoundaries::trim_domain($dom);
+ return unless
+ (Mail::SpamAssassin::Util::RegistrarBoundaries::is_domain_valid($dom));
+
+ return 1 if ($from =~ /\b\Q$dom\E.\Q$user\E@/i);
+ }
+
+ return 0;
+}
+
+1;