You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2005/12/15 03:27:20 UTC

svn commit: r356971 [1/2] - in /spamassassin/branches/tvd-evaltoplugin: ./ lib/Mail/SpamAssassin/ lib/Mail/SpamAssassin/Plugin/ rules/

Author: felicity
Date: Wed Dec 14 18:27:14 2005
New Revision: 356971

URL: http://svn.apache.org/viewcvs?rev=356971&view=rev
Log:
first pass at moving EvalTests.pm into various different plugins

Added:
    spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/Bayes.pm
    spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/BodyEval.pm
    spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/DNSEval.pm
    spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/HTMLEval.pm
    spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm
    spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/MIMEEval.pm
    spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/RelayEval.pm
    spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/URIEval.pm
    spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/WLBLEval.pm
    spamassassin/branches/tvd-evaltoplugin/rules/v320.pre
Removed:
    spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/EvalTests.pm
Modified:
    spamassassin/branches/tvd-evaltoplugin/MANIFEST
    spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Bayes.pm
    spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Conf.pm
    spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Constants.pm
    spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Dns.pm
    spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/PerMsgStatus.pm
    spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Util.pm

Modified: spamassassin/branches/tvd-evaltoplugin/MANIFEST
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/MANIFEST?rev=356971&r1=356970&r2=356971&view=diff
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/MANIFEST (original)
+++ spamassassin/branches/tvd-evaltoplugin/MANIFEST Wed Dec 14 18:27:14 2005
@@ -46,7 +46,6 @@
 lib/Mail/SpamAssassin/DBBasedAddrList.pm
 lib/Mail/SpamAssassin/Dns.pm
 lib/Mail/SpamAssassin/DnsResolver.pm
-lib/Mail/SpamAssassin/EvalTests.pm
 lib/Mail/SpamAssassin/HTML.pm
 lib/Mail/SpamAssassin/Locales.pm
 lib/Mail/SpamAssassin/Locker.pm
@@ -71,19 +70,28 @@
 lib/Mail/SpamAssassin/Plugin/AccessDB.pm
 lib/Mail/SpamAssassin/Plugin/AntiVirus.pm
 lib/Mail/SpamAssassin/Plugin/AutoLearnThreshold.pm
+lib/Mail/SpamAssassin/Plugin/Bayes.pm
+lib/Mail/SpamAssassin/Plugin/BodyEval.pm
 lib/Mail/SpamAssassin/Plugin/DCC.pm
+lib/Mail/SpamAssassin/Plugin/DNSEval.pm
 lib/Mail/SpamAssassin/Plugin/DomainKeys.pm
+lib/Mail/SpamAssassin/Plugin/HTMLEval.pm
 lib/Mail/SpamAssassin/Plugin/Hashcash.pm
+lib/Mail/SpamAssassin/Plugin/HeaderEval.pm
+lib/Mail/SpamAssassin/Plugin/MIMEEval.pm
 lib/Mail/SpamAssassin/Plugin/MIMEHeader.pm
 lib/Mail/SpamAssassin/Plugin/Pyzor.pm
 lib/Mail/SpamAssassin/Plugin/Razor2.pm
 lib/Mail/SpamAssassin/Plugin/RelayCountry.pm
+lib/Mail/SpamAssassin/Plugin/RelayEval.pm
 lib/Mail/SpamAssassin/Plugin/ReplaceTags.pm
 lib/Mail/SpamAssassin/Plugin/SPF.pm
 lib/Mail/SpamAssassin/Plugin/SpamCop.pm
 lib/Mail/SpamAssassin/Plugin/Test.pm
 lib/Mail/SpamAssassin/Plugin/TextCat.pm
 lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
+lib/Mail/SpamAssassin/Plugin/URIEval.pm
+lib/Mail/SpamAssassin/Plugin/WLBLEval.pm
 lib/Mail/SpamAssassin/Plugin/WhiteListSubject.pm
 lib/Mail/SpamAssassin/PluginHandler.pm
 lib/Mail/SpamAssassin/Reporter.pm

Modified: spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Bayes.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Bayes.pm?rev=356971&r1=356970&r2=356971&view=diff
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Bayes.pm (original)
+++ spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Bayes.pm Wed Dec 14 18:27:14 2005
@@ -670,8 +670,10 @@
 
   return 0 unless $self->{use_ignores};
 
-  my $ignore = $PMS->check_from_in_list('bayes_ignore_from')
-    		|| $PMS->check_to_in_list('bayes_ignore_to');
+  my $ig_from = $self->{main}->call_plugins ("check_wb_list", { permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' });
+  my $ig_to = $self->{main}->call_plugins ("check_wb_list", { permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' });
+
+  my $ignore = $ig_from || $ig_to;
 
   dbg("bayes: not using bayes, bayes_ignore_from or _to rule") if $ignore;
 

Modified: spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Conf.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Conf.pm?rev=356971&r1=356970&r2=356971&view=diff
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Conf.pm (original)
+++ spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Conf.pm Wed Dec 14 18:27:14 2005
@@ -254,300 +254,6 @@
 
 =back
 
-=head2 WHITELIST AND BLACKLIST OPTIONS
-
-=over 4
-
-=item whitelist_from add@ress.com
-
-Used to specify addresses which send mail that is often tagged (incorrectly) as
-spam. If you want to whitelist your own domain, be aware that spammers will
-often impersonate the domain of the recipient.  The recommended solution is to
-instead use C<whitelist_from_rcvd> as explained below.
-
-Whitelist and blacklist addresses are now file-glob-style patterns, so
-C<fr...@somewhere.com>, C<*...@isp.com>, or C<*.domain.net> will all work.
-Specifically, C<*> and C<?> are allowed, but all other metacharacters are not.
-Regular expressions are not used for security reasons.
-
-Multiple addresses per line, separated by spaces, is OK.  Multiple
-C<whitelist_from> lines is also OK.
-
-The headers checked for whitelist addresses are as follows: if C<Resent-From>
-is set, use that; otherwise check all addresses taken from the following
-set of headers:
-
-	Envelope-Sender
-	Resent-Sender
-	X-Envelope-From
-	From
-
-In addition, the "envelope sender" data, taken from the SMTP envelope
-data where this is available, is looked up.
-
-e.g.
-
-  whitelist_from joe@example.com fred@example.com
-  whitelist_from *@example.com
-
-=cut
-
-  push (@cmds, {
-    setting => 'whitelist_from',
-    type => $CONF_TYPE_ADDRLIST
-  });
-
-=item unwhitelist_from add@ress.com
-
-Used to override a default whitelist_from entry, so for example a distribution
-whitelist_from can be overridden in a local.cf file, or an individual user can
-override a whitelist_from entry in their own C<user_prefs> file.
-The specified email address has to match exactly the address previously
-used in a whitelist_from line.
-
-e.g.
-
-  unwhitelist_from joe@example.com fred@example.com
-  unwhitelist_from *@example.com
-
-=cut
-
-  push (@cmds, {
-    command => 'unwhitelist_from',
-    setting => 'whitelist_from',
-    code => \&Mail::SpamAssassin::Conf::Parser::remove_addrlist_value
-  });
-
-=item whitelist_from_rcvd addr@lists.sourceforge.net sourceforge.net
-
-Use this to supplement the whitelist_from addresses with a check against the
-Received headers. The first parameter is the address to whitelist, and the
-second is a string to match the relay's rDNS.
-
-This string is matched against the reverse DNS lookup used during the handover
-from the internet to your internal network's mail exchangers.  It can
-either be the full hostname, or the domain component of that hostname.  In
-other words, if the host that connected to your MX had an IP address that
-mapped to 'sendinghost.spamassassin.org', you should specify
-C<sendinghost.spamassassin.org> or just C<spamassassin.org> here.
-
-Note that this requires that C<internal_networks> be correct.  For simple cases,
-it will be, but for a complex network, or running with DNS checks off
-or with C<-L>, you may get better results by setting that parameter.
-
-e.g.
-
-  whitelist_from_rcvd joe@example.com  example.com
-  whitelist_from_rcvd *@axkit.org      sergeant.org
-
-=item def_whitelist_from_rcvd addr@lists.sourceforge.net sourceforge.net
-
-Same as C<whitelist_from_rcvd>, but used for the default whitelist entries
-in the SpamAssassin distribution.  The whitelist score is lower, because
-these are often targets for spammer spoofing.
-
-=cut
-
-  push (@cmds, {
-    setting => 'whitelist_from_rcvd',
-    code => sub {
-      my ($self, $key, $value, $line) = @_;
-      unless (defined $value && $value !~ /^$/) {
-	return $MISSING_REQUIRED_VALUE;
-      }
-      unless ($value =~ /^\S+\s+\S+$/) {
-	return $INVALID_VALUE;
-      }
-      $self->{parser}->add_to_addrlist_rcvd ('whitelist_from_rcvd',
-                                        split(/\s+/, $value));
-    }
-  });
-
-  push (@cmds, {
-    setting => 'def_whitelist_from_rcvd',
-    code => sub {
-      my ($self, $key, $value, $line) = @_;
-      unless (defined $value && $value !~ /^$/) {
-	return $MISSING_REQUIRED_VALUE;
-      }
-      unless ($value =~ /^\S+\s+\S+$/) {
-	return $INVALID_VALUE;
-      }
-      $self->{parser}->add_to_addrlist_rcvd ('def_whitelist_from_rcvd',
-                                        split(/\s+/, $value));
-    }
-  });
-
-=item whitelist_allows_relays add@ress.com
-
-Specify addresses which are in C<whitelist_from_rcvd> that sometimes
-send through a mail relay other than the listed ones. By default mail
-with a From address that is in C<whitelist_from_rcvd> that does not match
-the relay will trigger a forgery rule. Including the address in
-C<whitelist_allows_relay> prevents that.
-
-Whitelist and blacklist addresses are now file-glob-style patterns, so
-C<fr...@somewhere.com>, C<*...@isp.com>, or C<*.domain.net> will all work.
-Specifically, C<*> and C<?> are allowed, but all other metacharacters are not.
-Regular expressions are not used for security reasons.
-
-Multiple addresses per line, separated by spaces, is OK.  Multiple
-C<whitelist_allows_relays> lines is also OK.
-
-The specified email address does not have to match exactly the address
-previously used in a whitelist_from_rcvd line as it is compared to the
-address in the header.
-
-e.g.
-
-  whitelist_allows_relays joe@example.com fred@example.com
-  whitelist_allows_relays *@example.com
-
-=cut
-
-  push (@cmds, {
-    setting => 'whitelist_allows_relays',
-    type => $CONF_TYPE_ADDRLIST
-  });
-
-=item unwhitelist_from_rcvd add@ress.com
-
-Used to override a default whitelist_from_rcvd entry, so for example a
-distribution whitelist_from_rcvd can be overridden in a local.cf file,
-or an individual user can override a whitelist_from_rcvd entry in
-their own C<user_prefs> file.
-
-The specified email address has to match exactly the address previously
-used in a whitelist_from_rcvd line.
-
-e.g.
-
-  unwhitelist_from_rcvd joe@example.com fred@example.com
-  unwhitelist_from_rcvd *@axkit.org
-
-=cut
-
-  push (@cmds, {
-    setting => 'unwhitelist_from_rcvd',
-    code => sub {
-      my ($self, $key, $value, $line) = @_;
-      unless (defined $value && $value !~ /^$/) {
-	return $MISSING_REQUIRED_VALUE;
-      }
-      unless ($value =~ /^(?:\S+(?:\s+\S+)*)$/) {
-	return $INVALID_VALUE;
-      }
-      $self->{parser}->remove_from_addrlist_rcvd('whitelist_from_rcvd',
-                                        split (/\s+/, $value));
-      $self->{parser}->remove_from_addrlist_rcvd('def_whitelist_from_rcvd',
-                                        split (/\s+/, $value));
-    }
-  });
-
-=item blacklist_from add@ress.com
-
-Used to specify addresses which send mail that is often tagged (incorrectly) as
-non-spam, but which the user doesn't want.  Same format as C<whitelist_from>.
-
-=cut
-
-  push (@cmds, {
-    setting => 'blacklist_from',
-    type => $CONF_TYPE_ADDRLIST
-  });
-
-=item unblacklist_from add@ress.com
-
-Used to override a default blacklist_from entry, so for example a
-distribution blacklist_from can be overridden in a local.cf file, or
-an individual user can override a blacklist_from entry in their own
-C<user_prefs> file. The specified email address has to match exactly
-the address previously used in a blacklist_from line.
-
-
-e.g.
-
-  unblacklist_from joe@example.com fred@example.com
-  unblacklist_from *@spammer.com
-
-=cut
-
-
-  push (@cmds, {
-    command => 'unblacklist_from',
-    setting => 'blacklist_from',
-    code => \&Mail::SpamAssassin::Conf::Parser::remove_addrlist_value
-  });
-
-
-=item whitelist_to add@ress.com
-
-If the given address appears as a recipient in the message headers
-(Resent-To, To, Cc, obvious envelope recipient, etc.) the mail will
-be whitelisted.  Useful if you're deploying SpamAssassin system-wide,
-and don't want some users to have their mail filtered.  Same format
-as C<whitelist_from>.
-
-There are three levels of To-whitelisting, C<whitelist_to>, C<more_spam_to>
-and C<all_spam_to>.  Users in the first level may still get some spammish
-mails blocked, but users in C<all_spam_to> should never get mail blocked.
-
-The headers checked for whitelist addresses are as follows: if C<Resent-To> or
-C<Resent-Cc> are set, use those; otherwise check all addresses taken from the
-following set of headers:
-
-        To
-        Cc
-        Apparently-To
-        Delivered-To
-        Envelope-Recipients
-        Apparently-Resent-To
-        X-Envelope-To
-        Envelope-To
-        X-Delivered-To
-        X-Original-To
-        X-Rcpt-To
-        X-Real-To
-
-=item more_spam_to add@ress.com
-
-See above.
-
-=item all_spam_to add@ress.com
-
-See above.
-
-=cut
-
-  push (@cmds, {
-    setting => 'whitelist_to',
-    type => $CONF_TYPE_ADDRLIST
-  });
-  push (@cmds, {
-    setting => 'more_spam_to',
-    type => $CONF_TYPE_ADDRLIST
-  });
-  push (@cmds, {
-    setting => 'all_spam_to',
-    type => $CONF_TYPE_ADDRLIST
-  });
-
-=item blacklist_to add@ress.com
-
-If the given address appears as a recipient in the message headers
-(Resent-To, To, Cc, obvious envelope recipient, etc.) the mail will
-be blacklisted.  Same format as C<blacklist_from>.
-
-=cut
-
-
-  push (@cmds, {
-    setting => 'blacklist_to',
-    type => $CONF_TYPE_ADDRLIST
-  });
-
-=back
-
 =head2 BASIC MESSAGE TAGGING OPTIONS
 
 =over 4

Modified: spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Constants.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Constants.pm?rev=356971&r1=356970&r2=356971&view=diff
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Constants.pm (original)
+++ spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Constants.pm Wed Dec 14 18:27:14 2005
@@ -37,6 +37,7 @@
 	META_TEST_MIN_PRIORITY HARVEST_DNSBL_PRIORITY MBX_SEPARATOR
 	MAX_BODY_LINE_LENGTH MAX_HEADER_KEY_LENGTH MAX_HEADER_VALUE_LENGTH
 	MAX_HEADER_LENGTH ARITH_EXPRESSION_LEXER AI_TIME_UNKNOWN
+	CHARSETS_LIKELY_TO_FP_AS_CAPS
 );
 
 %EXPORT_TAGS = (
@@ -196,5 +197,10 @@
 # second pass (when the message is actually read + processed) the received
 # date is calculated.  this value signifies "unknown" from the first pass.
 use constant AI_TIME_UNKNOWN => 0;
+
+# Charsets which use capital letters heavily in their encoded representation.
+use constant CHARSETS_LIKELY_TO_FP_AS_CAPS => qr{[-_a-z0-9]*(?:
+	  koi|jp|jis|euc|gb|big5|isoir|cp1251|georgianps|pt154|tis
+	)[-_a-z0-9]*}ix;
 
 1;

Modified: spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Dns.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Dns.pm?rev=356971&r1=356970&r2=356971&view=diff
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Dns.pm (original)
+++ spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Dns.pm Wed Dec 14 18:27:14 2005
@@ -735,4 +735,12 @@
 
 ###########################################################################
 
+# interface called by SPF plugin
+sub check_for_from_dns {
+  my ($self, $pms) = @_;
+  if (defined $pms->{sender_host_fail}) {
+    return ($pms->{sender_host_fail} == 2); # both MX and A need to fail
+  }
+}
+
 1;

Modified: spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=356971&r1=356970&r2=356971&view=diff
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/PerMsgStatus.pm Wed Dec 14 18:27:14 2005
@@ -51,7 +51,6 @@
 use Carp;
 
 use Mail::SpamAssassin::Constants qw(:sa);
-use Mail::SpamAssassin::EvalTests;
 use Mail::SpamAssassin::Conf;
 use Mail::SpamAssassin::Util;
 use Mail::SpamAssassin::Logger;

Added: spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/Bayes.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/Bayes.pm?rev=356971&view=auto
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/Bayes.pm (added)
+++ spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/Bayes.pm Wed Dec 14 18:27:14 2005
@@ -0,0 +1,70 @@
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+
+package Mail::SpamAssassin::Plugin::Bayes;
+
+use Mail::SpamAssassin::Plugin;
+use strict;
+use warnings;
+use bytes;
+
+use vars qw(@ISA);
+@ISA = qw(Mail::SpamAssassin::Plugin);
+
+# constructor: register the eval rule
+sub new {
+  my $class = shift;
+  my $mailsaobject = shift;
+
+  # some boilerplate...
+  $class = ref($class) || $class;
+  my $self = $class->SUPER::new($mailsaobject);
+  bless ($self, $class);
+
+  # the important bit!
+  $self->register_eval_rule("check_bayes");
+
+  return $self;
+}
+
+sub check_bayes {
+  my ($self, $pms, $fulltext, $min, $max) = @_;
+
+  return 0 if (!$pms->{conf}->{use_bayes} || !$pms->{conf}->{use_bayes_rules});
+
+  if (!exists ($self->{bayes_score})) {
+    $self->{bayes_score} = $self->{main}->{bayes_scanner}->scan ($pms, $pms->{msg});
+  }
+
+  if (defined $self->{bayes_score} &&
+      ($min == 0 || $self->{bayes_score} > $min) &&
+      ($max eq "undef" || $self->{bayes_score} <= $max))
+  {
+      if ($pms->{conf}->{detailed_bayes_score}) {
+        $pms->test_log(sprintf ("score: %3.4f, hits: %s",
+                                 $self->{bayes_score},
+                                 $self->{bayes_hits}));
+      }
+      else {
+        $pms->test_log(sprintf ("score: %3.4f", $self->{bayes_score}));
+      }
+      return 1;
+  }
+
+  return 0;
+}
+
+1;

Added: spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/BodyEval.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/BodyEval.pm?rev=356971&view=auto
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/BodyEval.pm (added)
+++ spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/BodyEval.pm Wed Dec 14 18:27:14 2005
@@ -0,0 +1,214 @@
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+
+package Mail::SpamAssassin::Plugin::BodyEval;
+
+use Mail::SpamAssassin::Plugin;
+use Mail::SpamAssassin::Constants qw(:sa);
+
+use strict;
+use warnings;
+use bytes;
+
+use vars qw(@ISA);
+@ISA = qw(Mail::SpamAssassin::Plugin);
+
+# constructor: register the eval rule
+sub new {
+  my $class = shift;
+  my $mailsaobject = shift;
+
+  # some boilerplate...
+  $class = ref($class) || $class;
+  my $self = $class->SUPER::new($mailsaobject);
+  bless ($self, $class);
+
+  # the important bit!
+  $self->register_eval_rule("check_unique_words");
+  $self->register_eval_rule("multipart_alternative_difference");
+  $self->register_eval_rule("multipart_alternative_difference_count");
+  $self->register_eval_rule("check_blank_line_ratio");
+
+  return $self;
+}
+
+sub check_unique_words {
+  my ($self, undef, $body, $m, $b) = @_;
+
+  if (!defined $self->{unique_words_repeat}) {
+    $self->{unique_words_repeat} = 0;
+    $self->{unique_words_unique} = 0;
+    my %count;
+    for (@$body) {
+      # copy to avoid changing @$body
+      my $line = $_;
+      # from tokenize_line in Bayes.pm
+      $line =~ tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs;
+      $line =~ s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs;
+      $line =~ s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs;
+      $line =~ s/(?:^|\.\s+)([A-Z])([^A-Z]+)(?:\s|$)/ ' '.(lc $1).$2.' '/ge;
+      for my $token (split(' ', $line)) {
+        $count{$token}++;
+      }
+    }
+    $self->{unique_words_unique} = scalar grep { $_ == 1 } values(%count);
+    $self->{unique_words_repeat} = scalar keys(%count) - $self->{unique_words_unique};
+  }
+
+  # y = mx+b where y is number of unique words needed
+  my $unique = $self->{unique_words_unique};
+  my $repeat = $self->{unique_words_repeat};
+  my $y = ($unique + $repeat) * $m + $b;
+  return ($unique > $y);
+}
+
+sub multipart_alternative_difference {
+  my ($self, $pms, $fulltext, $min, $max) = @_;
+
+  $self->_multipart_alternative_difference($pms->{msg}) unless (exists $self->{madiff});
+
+  if (($min == 0 || $self->{madiff} > $min) &&
+      ($max eq "undef" || $self->{madiff} <= $max)) {
+      return 1;
+  }
+  return 0;
+}
+
+sub multipart_alternative_difference_count {
+  my ($self, $pms, $fulltext, $ratio, $minhtml) = @_;
+  $self->_multipart_alternative_difference($pms->{msg}) unless (exists $self->{madiff});
+  return 0 unless $self->{madiff_html} > $minhtml;
+  return(($self->{madiff_text} / $self->{madiff_html}) > $ratio);
+}
+
+sub _multipart_alternative_difference {
+  my ($self, $msg) = @_;
+  $self->{madiff} = 0;
+  $self->{madiff_html} = 0;
+  $self->{madiff_text} = 0;
+
+  # Find all multipart/alternative parts in the message
+  my @ma = $msg->find_parts(qr@^multipart/alternative\b@i);
+
+  # If there are no multipart/alternative sections, skip this test.
+  return if (!@ma);
+
+  # Figure out what the MIME content of the message looks like
+  my @content = $msg->content_summary();
+
+  # Exchange meeting requests come in as m/a text/html text/calendar,
+  # which we want to ignore because of the high FP rate it would cause.
+  # 
+  if (@content == 3 && $content[2] eq 'text/calendar' &&
+  	$content[1] eq 'text/html' &&
+  	$content[0] eq 'multipart/alternative') {
+    return;
+  }
+
+  # Go through each of the multipart parts
+  foreach my $part (@ma) {
+    my %html = ();
+    my %text = ();
+
+    # limit our search to text-based parts
+    my @txt = $part->find_parts(qr@^text\b@i);
+    foreach my $text (@txt) {
+      # we only care about the rendered version of the part
+      my ($type, $rnd) = $text->rendered();
+
+      # parse the rendered text into tokens.  assume they are whitespace
+      # separated, and ignore anything that doesn't have a word-character
+      # in it (0-9a-zA-Z_) since those are probably things like bullet
+      # points, horizontal lines, etc.  this assumes that punctuation
+      # in one part will be the same in other parts.
+      #
+      if ($type eq 'text/html') {
+        foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
+	  #dbg("eval: HTML: $w");
+          $html{$w}++;
+        }
+
+	# If there are no words, mark if there's at least 1 image ...
+	if (keys %html == 0 && exists $self->{html}{inside}{img}) {
+	  # Use "\n" as the mark since it can't ever occur normally
+	  $html{"\n"}=1;
+	}
+      }
+      else {
+        foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
+	  #dbg("eval: TEXT: $w");
+          $text{$w}++;
+        }
+      }
+    }
+
+    # How many HTML tokens do we have at the start?
+    my $orig = keys %html;
+    next if ($orig == 0);
+
+    $self->{madiff_html} = $orig;
+    $self->{madiff_text} = keys %text;
+    dbg('eval: text words: ' . $self->{madiff_text} . ', html words: ' . $self->{madiff_html});
+
+    # If the token appears at least as many times in the text part as
+    # in the html part, remove it from the list of html tokens.
+    while(my ($k,$v) = each %text) {
+      delete $html{$k} if (exists $html{$k} && $html{$k}-$text{$k} < 1);
+    }
+
+    #map { dbg("eval: LEFT: $_") } keys %html;
+
+    # In theory, the tokens should be the same in both text and html
+    # parts, so there would be 0 tokens left in the html token list, for
+    # a 0% difference rate.  Calculate it here, and record the difference
+    # if it's been the highest so far in this message.
+    my $diff = scalar(keys %html)/$orig*100;
+    $self->{madiff} = $diff if ($diff > $self->{madiff});
+
+    dbg("eval: " . sprintf "madiff: left: %d, orig: %d, max-difference: %0.2f%%", scalar(keys %html), $orig, $self->{madiff});
+  }
+
+  return;
+}
+
+
+sub check_blank_line_ratio {
+  my ($self, $pms, $fulltext, $min, $max, $minlines) = @_;
+
+  if (!defined $minlines || $minlines < 1) {
+    $minlines = 1;
+  }
+
+  if (! exists $self->{blank_line_ratio}->{$minlines}) {
+    $fulltext = $pms->get_decoded_body_text_array();
+    my ($blank) = 0;
+    if (scalar @{$fulltext} >= $minlines) {
+      foreach my $line (@{$fulltext}) {
+        next if ($line =~ /\S/);
+        $blank++;
+      }
+      $self->{blank_line_ratio}->{$minlines} = 100 * $blank / scalar @{$fulltext};
+    }
+    else {
+      $self->{blank_line_ratio}->{$minlines} = -1; # don't report if it's a blank message ...
+    }
+  }
+
+  return (($min == 0 && $self->{blank_line_ratio}->{$minlines} <= $max) ||
+	  ($self->{blank_line_ratio}->{$minlines} > $min &&
+	   $self->{blank_line_ratio}->{$minlines} <= $max));
+}
+1;

Added: spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/DNSEval.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/DNSEval.pm?rev=356971&view=auto
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/DNSEval.pm (added)
+++ spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/DNSEval.pm Wed Dec 14 18:27:14 2005
@@ -0,0 +1,349 @@
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+
+package Mail::SpamAssassin::Plugin::DNSEval;
+
+use Mail::SpamAssassin::Plugin;
+use Mail::SpamAssassin::Constants qw(:ip);
+
+use strict;
+use warnings;
+use bytes;
+
+use vars qw(@ISA);
+@ISA = qw(Mail::SpamAssassin::Plugin);
+
+# constructor: register the eval rule
+sub new {
+  my $class = shift;
+  my $mailsaobject = shift;
+
+  # some boilerplate...
+  $class = ref($class) || $class;
+  my $self = $class->SUPER::new($mailsaobject);
+  bless ($self, $class);
+
+  # the important bit!
+  $self->register_eval_rule("check_rbl_accreditor");
+  $self->register_eval_rule("check_rbl");
+  $self->register_eval_rule("check_rbl_txt");
+  $self->register_eval_rule("check_rbl_sub");
+  $self->register_eval_rule("check_rbl_results_for");
+  $self->register_eval_rule("check_rbl_from_host");
+  $self->register_eval_rule("check_rbl_envfrom");
+  $self->register_eval_rule("check_dns_sender");
+
+  return $self;
+}
+
+sub ip_list_uniq_and_strip_private {
+  my ($self, @origips) = @_;
+  my @ips = ();
+  my %seen = ();
+  my $IP_PRIVATE = IP_PRIVATE;
+  foreach my $ip (@origips) {
+    next unless $ip;
+    next if (exists ($seen{$ip})); $seen{$ip} = 1;
+    next if ($ip =~ /$IP_PRIVATE/o);
+    push(@ips, $ip);
+  }
+  return @ips;
+}
+
+# check an RBL if the message contains an "accreditor assertion,"
+# that is, the message contains the name of a service that will vouch
+# for their practices.
+#
+sub check_rbl_accreditor {
+  my ($self, $pms, $rule, $set, $rbl_server, $subtest, $accreditor) = @_;
+
+  if (!defined $self->{accreditor_tag}) {
+    $self->message_accreditor_tag($pms);
+  }
+  if ($self->{accreditor_tag}->{$accreditor}) {
+    $self->check_rbl_backend($pms, $rule, $set, $rbl_server, 'A', $subtest);
+  }
+  return 0;
+}
+
+# Check for an Accreditor Assertion within the message, that is, the name of
+#	a third-party who will vouch for the sender's practices. The accreditor
+#	can be asserted in the EnvelopeFrom like this:
+#
+#	    listowner@a--accreditor.mail.example.com
+#
+#	or in an 'Accreditor" Header field, like this:
+#
+#	    Accreditor: accreditor1, parm=value; accreditor2, parm-value
+#
+#	This implementation supports multiple accreditors, but ignores any
+#	parameters in the header field.
+#
+sub message_accreditor_tag {
+  my ($self, $pms) = @_;
+  my %acctags;
+
+  if ($pms->get('EnvelopeFrom:addr') =~ /[@.]a--([a-z0-9]{3,})\./i) {
+    (my $tag = $1) =~ tr/A-Z/a-z/;
+    $acctags{$tag} = -1;
+  }
+  my $accreditor_field = $pms->get('Accreditor');
+  if (defined($accreditor_field)) {
+    my @accreditors = split(/,/, $accreditor_field);
+    foreach my $accreditor (@accreditors) {
+      my @terms = split(' ', $accreditor);
+      if ($#terms >= 0) {
+	  my $tag = $terms[0];
+	  $tag =~ tr/A-Z/a-z/;
+	  $acctags{$tag} = -1;
+      }
+    }
+  }
+  $self->{accreditor_tag} = \%acctags;
+}
+
+sub check_rbl_backend {
+  my ($self, $pms, $rule, $set, $rbl_server, $type, $subtest) = @_;
+  local ($_);
+
+  # First check that DNS is available, if not do not perform this check
+  return 0 if $self->{main}->{conf}->{skip_rbl_checks};
+  return 0 unless $pms->is_dns_available();
+  $pms->load_resolver();
+
+  if (($rbl_server !~ /^(\d+)\.(\d+)\.(\d+)\.(\d+)$/) &&
+      (index($rbl_server, '.') >= 0) &&
+      ($rbl_server !~ /\.$/)) {
+    $rbl_server .= ".";
+  }
+
+  dbg("dns: checking RBL $rbl_server, set $set");
+
+  # ok, make a list of all the IPs in the untrusted set
+  my @fullips = map { $_->{ip} } @{$pms->{relays_untrusted}};
+
+  # now, make a list of all the IPs in the external set, for use in
+  # notfirsthop testing.  this will often be more IPs than found
+  # in @fullips.  It includes the IPs that are trusted, but
+  # not in internal_networks.
+  my @fullexternal = map {
+	(!$_->{internal}) ? ($_->{ip}) : ()
+      } @{$pms->{relays_trusted}};
+  push (@fullexternal, @fullips);	# add untrusted set too
+
+  # Make sure a header significantly improves results before adding here
+  # X-Sender-Ip: could be worth using (very low occurance for me)
+  # X-Sender: has a very low bang-for-buck for me
+  my $IP_ADDRESS = IP_ADDRESS;
+  my @originating = ();
+  for my $header ('X-Originating-IP', 'X-Apparently-From') {
+    my $str = $pms->get($header);
+    next unless $str;
+    push (@originating, ($str =~ m/($IP_ADDRESS)/g));
+  }
+
+  # Let's go ahead and trim away all private ips (KLC)
+  # also uniq the list and strip dups. (jm)
+  my @ips = $self->ip_list_uniq_and_strip_private(@fullips);
+
+  # if there's no untrusted IPs, it means we trust all the open-internet
+  # relays, so we can return right now.
+  return 0 unless (scalar @ips + scalar @originating > 0);
+
+  dbg("dns: IPs found: full-external: ".join(", ", @fullexternal).
+	" untrusted: ".join(", ", @ips).
+	" originating: ".join(", ", @originating));
+
+  my $trusted = $self->{main}->{conf}->{trusted_networks};
+
+  if (scalar @ips + scalar @originating > 0) {
+    # If name is foo-notfirsthop, check all addresses except for
+    # the originating one.  Suitable for use with dialup lists, like the PDL.
+    # note that if there's only 1 IP in the untrusted set, do NOT pop the
+    # list, since it'd remove that one, and a legit user is supposed to
+    # use their SMTP server (ie. have at least 1 more hop)!
+    # If name is foo-lastexternal, check only the Received header just before
+    # it enters our internal networks; we can trust it and it's the one that
+    # passed mail between networks
+    if ($set =~ /-(notfirsthop|lastexternal)$/)
+    {
+      # use the external IP set, instead of the trusted set; the user may have
+      # specified some third-party relays as trusted.  Also, don't use
+      # @originating; those headers are added by a phase of relaying through
+      # a server like Hotmail, which is not going to be in dialup lists anyway.
+      @ips = $self->ip_list_uniq_and_strip_private(@fullexternal);
+      if ($1 eq "lastexternal") {
+        @ips = (defined $ips[0]) ? ($ips[0]) : ();
+      } else {
+	pop @ips if (scalar @ips > 1);
+      }
+    }
+    # If name is foo-firsttrusted, check only the Received header just
+    # after it enters our trusted networks; that's the only one we can
+    # trust the IP address from (since our relay added that header).
+    # And if name is foo-untrusted, check any untrusted IP address.
+    elsif ($set =~ /-(first|un)trusted$/)
+    {
+      my @tips = ();
+      foreach my $ip (@originating) {
+        if ($ip && !$trusted->contains_ip($ip)) {
+          push(@tips, $ip);
+        }
+      }
+      @ips = $self->ip_list_uniq_and_strip_private (@ips, @tips);
+      if ($1 eq "first") {
+        @ips = (defined $ips[0]) ? ($ips[0]) : ();
+      } else {
+        shift @ips;
+      }
+    }
+    else
+    {
+      my @tips = ();
+      foreach my $ip (@originating) {
+        if ($ip && !$trusted->contains_ip($ip)) {
+          push(@tips, $ip);
+        }
+      }
+      # add originating IPs as untrusted IPs (if they are untrusted)
+      @ips = reverse $self->ip_list_uniq_and_strip_private (@ips, @tips);
+
+      # How many IPs max you check in the received lines
+      my $checklast=$self->{main}->{conf}->{num_check_received};
+
+      if (scalar @ips > $checklast) {
+	splice (@ips, $checklast);	# remove all others
+      }
+    }
+  }
+  dbg("dns: only inspecting the following IPs: ".join(", ", @ips));
+
+  eval {
+    foreach my $ip (@ips) {
+      next unless ($ip =~ /(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})/);
+      $pms->do_rbl_lookup($rule, $set, $type, $rbl_server,
+			   "$4.$3.$2.$1.$rbl_server", $subtest);
+    }
+  };
+
+  # note that results are not handled here, hits are handled directly
+  # as DNS responses are harvested
+  return 0;
+}
+
+sub check_rbl {
+  my ($self, $pms, $rule, $set, $rbl_server, $subtest) = @_;
+  $self->check_rbl_backend($pms, $rule, $set, $rbl_server, 'A', $subtest);
+}
+
+sub check_rbl_txt {
+  my ($self, $pms, $rule, $set, $rbl_server, $subtest) = @_;
+  $self->check_rbl_backend($pms, $rule, $set, $rbl_server, 'TXT', $subtest);
+}
+
+# run for first message 
+sub check_rbl_sub {
+  my ($self, $pms, $rule, $set, $subtest) = @_;
+
+  return 0 if $self->{main}->{conf}->{skip_rbl_checks};
+  return 0 unless $pms->is_dns_available();
+
+  $pms->register_rbl_subtest($rule, $set, $subtest);
+}
+
+# backward compatibility
+sub check_rbl_results_for {
+  #warn "dns: check_rbl_results_for() is deprecated, use check_rbl_sub()\n";
+  check_rbl_sub(@_);
+}
+
+# this only checks the address host name and not the domain name because
+# using the domain name had much worse results for dsn.rfc-ignorant.org
+sub check_rbl_from_host {
+  _check_rbl_addresses(@_, $_[0]->all_from_addrs());
+}
+
+# this only checks the address host name and not the domain name because
+# using the domain name had much worse results for dsn.rfc-ignorant.org
+sub check_rbl_envfrom {
+  _check_rbl_addresses(@_, $_[0]->get('EnvelopeFrom:addr'));
+}
+
+sub _check_rbl_addresses {
+  my ($self, $pms, $rule, $set, $rbl_server, @addresses) = @_;
+  
+  return 0 if $self->{main}->{conf}->{skip_rbl_checks};
+  return 0 unless $pms->is_dns_available();
+
+  my %hosts;
+  for my $address (@addresses) {
+    if ($address =~ m/\@(\S+\.\S+)/) {
+      $hosts{lc($1)} = 1;
+    }
+  }
+  return unless scalar keys %hosts;
+
+  $pms->load_resolver();
+
+  if (($rbl_server !~ /^(\d+)\.(\d+)\.(\d+)\.(\d+)$/) &&
+      (index($rbl_server, '.') >= 0) &&
+      ($rbl_server !~ /\.$/)) {
+    $rbl_server .= ".";
+  }
+  dbg("dns: _check_rbl_addresses RBL $rbl_server, set $set");
+
+  for my $host (keys %hosts) {
+    $pms->do_rbl_lookup($rule, $set, 'A', $rbl_server, "$host.$rbl_server");
+  }
+}
+
+sub check_dns_sender {
+  my ($self, $pms, $rule) = @_;
+
+  my $host;
+  for my $from ($pms->get('EnvelopeFrom:addr')) {
+    next unless defined $from;
+
+    $from =~ tr/././s;		# bug 3366
+    if ($from =~ /\@(\S+\.\S+)/) {
+      $host = lc($1);
+      last;
+    }
+  }
+  return 0 unless defined $host;
+
+  # First check that DNS is available, if not do not perform this check
+  # TODO: need a way to skip DNS checks as a whole in configuration
+  return 0 unless $pms->is_dns_available();
+  $pms->load_resolver();
+
+  if ($host eq 'compiling.spamassassin.taint.org') {
+    # only used when compiling
+    return 0;
+  }
+
+  dbg("dns: checking A and MX for host $host");
+
+  $pms->do_dns_lookup($rule, 'A', $host);
+  $pms->do_dns_lookup($rule, 'MX', $host);
+
+  # cache name of host for later checking
+  $pms->{sender_host} = $host;
+
+  return 0;
+}
+
+1;

Added: spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/HTMLEval.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/HTMLEval.pm?rev=356971&view=auto
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/HTMLEval.pm (added)
+++ spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/HTMLEval.pm Wed Dec 14 18:27:14 2005
@@ -0,0 +1,188 @@
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+
+package Mail::SpamAssassin::Plugin::HTMLEval;
+
+use Mail::SpamAssassin::Plugin;
+use strict;
+use warnings;
+use bytes;
+
+use vars qw(@ISA);
+@ISA = qw(Mail::SpamAssassin::Plugin);
+
+# constructor: register the eval rule
+sub new {
+  my $class = shift;
+  my $mailsaobject = shift;
+
+  # some boilerplate...
+  $class = ref($class) || $class;
+  my $self = $class->SUPER::new($mailsaobject);
+  bless ($self, $class);
+
+  # the important bit!
+  $self->register_eval_rule("html_tag_balance");
+  $self->register_eval_rule("html_image_only");
+  $self->register_eval_rule("html_image_ratio");
+  $self->register_eval_rule("html_charset_faraway");
+  $self->register_eval_rule("html_tag_exists");
+  $self->register_eval_rule("html_test");
+  $self->register_eval_rule("html_eval");
+  $self->register_eval_rule("html_text_match");
+  $self->register_eval_rule("html_title_subject_ratio");
+  $self->register_eval_rule("html_text_not_match");
+  $self->register_eval_rule("html_range");
+
+  return $self;
+}
+
+sub html_tag_balance {
+  my ($self, $pms, undef, $rawtag, $rawexpr) = @_;
+  $rawtag =~ /^([a-zA-Z0-9]+)$/; my $tag = $1;
+  $rawexpr =~ /^([\<\>\=\!\-\+ 0-9]+)$/; my $expr = $1;
+
+  return 0 unless exists $pms->{html}{inside}{$tag};
+
+  $pms->{html}{inside}{$tag} =~ /^([\<\>\=\!\-\+ 0-9]+)$/;
+  my $val = $1;
+  return eval "\$val $expr";
+}
+
+sub html_image_only {
+  my ($self, $pms, undef, $min, $max) = @_;
+
+  return (exists $pms->{html}{inside}{img} &&
+	  exists $pms->{html}{length} &&
+	  $pms->{html}{length} > $min &&
+	  $pms->{html}{length} <= $max);
+}
+
+sub html_image_ratio {
+  my ($self, $pms, undef, $min, $max) = @_;
+
+  return 0 unless (exists $pms->{html}{non_space_len} &&
+		   exists $pms->{html}{image_area} &&
+		   $pms->{html}{image_area} > 0);
+  my $ratio = $pms->{html}{non_space_len} / $pms->{html}{image_area};
+  return ($ratio > $min && $ratio <= $max);
+}
+
+sub html_charset_faraway {
+  my ($self, $pms) = @_;
+
+  return 0 unless exists $pms->{html}{charsets};
+
+  my @locales = Mail::SpamAssassin::Util::get_my_locales($pms->{conf}->{ok_locales});
+  return 0 if grep { $_ eq "all" } @locales;
+
+  my $okay = 0;
+  my $bad = 0;
+  for my $c (split(' ', $pms->{html}{charsets})) {
+    if (Mail::SpamAssassin::Locales::is_charset_ok_for_locales($c, @locales)) {
+      $okay++;
+    }
+    else {
+      $bad++;
+    }
+  }
+  return ($bad && ($bad >= $okay));
+}
+
+sub html_tag_exists {
+  my ($self, $pms, undef, $tag) = @_;
+  return exists $pms->{html}{inside}{$tag};
+}
+
+sub html_test {
+  my ($self, $pms, undef, $test) = @_;
+  return $pms->{html}{$test};
+}
+
+sub html_eval {
+  my ($self, $pms, undef, $test, $rawexpr) = @_;
+  $rawexpr =~ /^([\<\>\=\!\-\+ 0-9]+)$/; my $expr = $1;
+
+  # workaround bug 3320: wierd perl bug where additional, very explicit
+  # untainting into a new var is required.
+  my $tainted = $pms->{html}{$test};
+  return unless defined($tainted);
+  $tainted =~ /^(.*)$/; my $val = $1;
+
+  # just use the value in $val, don't copy it needlessly
+  return eval "\$val $expr";
+}
+
+sub html_text_match {
+  my ($self, $pms, undef, $text, $regexp) = @_;
+  for my $string (@{ $pms->{html}{$text} }) {
+    if (defined $string && $string =~ /${regexp}/) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+sub html_title_subject_ratio {
+  my ($self, $pms, undef, $ratio) = @_;
+
+  my $subject = $pms->get('Subject');
+  if (! $subject) {
+    return 0;
+  }
+  my $max = 0;
+  for my $string (@{ $pms->{html}{title} }) {
+    if ($string) {
+      my $ratio = length($string) / length($subject);
+      $max = $ratio if $ratio > $max;
+    }
+  }
+  return $max > $ratio;
+}
+
+sub html_text_not_match {
+  my ($self, $pms, undef, $text, $regexp) = @_;
+  for my $string (@{ $pms->{html}{$text} }) {
+    if (defined $string && $string !~ /${regexp}/) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+sub html_range {
+  my ($self, $pms, undef, $test, $min, $max) = @_;
+
+  return 0 unless exists $pms->{html}{$test};
+
+  $test = $pms->{html}{$test};
+
+  # not all perls understand what "inf" means, so we need to do
+  # non-numeric tests!  urg!
+  if (!defined $max || $max eq "inf") {
+    return ($test eq "inf") ? 1 : ($test > $min);
+  }
+  elsif ($test eq "inf") {
+    # $max < inf, so $test == inf means $test > $max
+    return 0;
+  }
+  else {
+    # if we get here everything should be a number
+    return ($test > $min && $test <= $max);
+  }
+}
+
+1;

Added: spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm?rev=356971&view=auto
==============================================================================
--- spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm (added)
+++ spamassassin/branches/tvd-evaltoplugin/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm Wed Dec 14 18:27:14 2005
@@ -0,0 +1,1107 @@
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+
+package Mail::SpamAssassin::Plugin::HeaderEval;
+
+use Mail::SpamAssassin::Plugin;
+use Mail::SpamAssassin::Constants qw(:sa :ip);
+
+use strict;
+use warnings;
+use bytes;
+
+use vars qw(@ISA);
+@ISA = qw(Mail::SpamAssassin::Plugin);
+
+# constructor: register the eval rule
+sub new {
+  my $class = shift;
+  my $mailsaobject = shift;
+
+  # some boilerplate...
+  $class = ref($class) || $class;
+  my $self = $class->SUPER::new($mailsaobject);
+  bless ($self, $class);
+
+  # the important bit!
+  $self->register_eval_rule("check_for_fake_aol_relay_in_rcvd");
+  $self->register_eval_rule("check_for_faraway_charset_in_headers");
+  $self->register_eval_rule("check_for_unique_subject_id");
+  $self->register_eval_rule("word_is_in_dictionary");
+  $self->register_eval_rule("check_illegal_chars");
+  $self->register_eval_rule("check_for_forged_hotmail_received_headers");
+  $self->register_eval_rule("check_for_no_hotmail_received_headers");
+  $self->register_eval_rule("check_for_msn_groups_headers");
+  $self->register_eval_rule("check_for_forged_eudoramail_received_headers");
+  $self->register_eval_rule("check_for_forged_yahoo_received_headers");
+  $self->register_eval_rule("check_for_forged_juno_received_headers");
+  $self->register_eval_rule("check_for_from_to_same");
+  $self->register_eval_rule("check_for_matching_env_and_hdr_from");
+  $self->register_eval_rule("sorted_recipients");
+  $self->register_eval_rule("similar_recipients");
+  $self->register_eval_rule("check_for_missing_to_header");
+  $self->register_eval_rule("check_for_forged_gw05_received_headers");
+  $self->register_eval_rule("check_for_round_the_world_received_helo");
+  $self->register_eval_rule("check_for_round_the_world_received_revdns");
+  $self->register_eval_rule("check_for_shifted_date");
+  $self->register_eval_rule("received_within_months");
+  $self->register_eval_rule("subject_is_all_caps");
+  $self->register_eval_rule("check_for_to_in_subject");
+  $self->register_eval_rule("check_outlook_message_id");
+  $self->register_eval_rule("check_messageid_not_usable");
+  $self->register_eval_rule("check_header_count_range");
+  $self->register_eval_rule("check_unresolved_template");
+  $self->register_eval_rule("check_ratware_name_id");
+  $self->register_eval_rule("check_ratware_envelope_from");
+
+  return $self;
+}
+
+# sad but true. sort it out, sysadmins!
+my $CCTLDS_WITH_LOTS_OF_OPEN_RELAYS = qr{(?:kr|cn|cl|ar|hk|il|th|tw|sg|za|tr|ma|ua|in|pe|br)};
+my $ROUND_THE_WORLD_RELAYERS = qr{(?:net|com|ca)};
+
+sub check_for_fake_aol_relay_in_rcvd {
+  my ($self, $pms) = @_;
+  local ($_);
+
+  $_ = $pms->get('Received'); s/\s/ /gs;
+
+  # this is the hostname format used by AOL for their relays. Spammers love 
+  # forging it.  Don't make it more specific to match aol.com only, though --
+  # there's another set of spammers who generate fake hostnames to go with
+  # it!
+  if (/ rly-[a-z][a-z]\d\d\./i) {
+    return 0 if /\/AOL-\d+\.\d+\.\d+\)/;    # via AOL mail relay
+    return 0 if /ESMTP id (?:RELAY|MAILRELAY|MAILIN)/; # AOLish
+    return 1;
+  }
+
+# spam: Received: from unknown (HELO mta05bw.bigpond.com) (80.71.176.130) by
+#    rly-xw01.mx.aol.com with QMQP; Sat, 15 Jun 2002 23:37:16 -0000
+
+# non: Received: from  rly-xj02.mx.aol.com (rly-xj02.mail.aol.com [172.20.116.39]) by
+#    omr-r05.mx.aol.com (v83.35) with ESMTP id RELAYIN7-0501132011; Wed, 01
+#    May 2002 13:20:11 -0400
+
+# non: Received: from logs-tr.proxy.aol.com (logs-tr.proxy.aol.com [152.163.201.132])
+#    by rly-ip01.mx.aol.com (8.8.8/8.8.8/AOL-5.0.0)
+#    with ESMTP id NAA08955 for <sa...@yahoogroups.com>;
+#    Thu, 4 Apr 2002 13:11:20 -0500 (EST)
+
+  return 0;
+}
+
+sub check_for_faraway_charset_in_headers {
+  my ($self, $pms) = @_;
+  my $hdr;
+
+  my @locales = Mail::SpamAssassin::Util::get_my_locales($self->{main}->{conf}->{ok_locales});
+
+  return 0 if grep { $_ eq "all" } @locales;
+
+  for my $h (qw(From Subject)) {
+    my @hdrs = $pms->get("$h:raw");
+    if ($#hdrs >= 0) {
+      $hdr = join(" ", @hdrs);
+    } else {
+      $hdr = '';
+    }
+    while ($hdr =~ /=\?(.+?)\?.\?.*?\?=/g) {
+      Mail::SpamAssassin::Locales::is_charset_ok_for_locales($1, @locales)
+	  or return 1;
+    }
+  }
+  0;
+}
+
+sub check_for_unique_subject_id {
+  my ($self, $pms) = @_;
+  local ($_);
+  $_ = lc $pms->get('Subject');
+  study;
+
+  my $id = 0;
+  if (/[-_\.\s]{7,}([-a-z0-9]{4,})$/
+	|| /\s{10,}(?:\S\s)?(\S+)$/
+	|| /\s{3,}[-:\#\(\[]+([-a-z0-9]{4,})[\]\)]+$/
+	|| /\s{3,}[:\#\(\[]*([a-f0-9]{4,})[\]\)]*$/
+	|| /\s{3,}[-:\#]([a-z0-9]{5,})$/
+	|| /[\s._]{3,}([^0\s._]\d{3,})$/
+	|| /[\s._]{3,}\[(\S+)\]$/
+
+        # (7217vPhZ0-478TLdy5829qicU9-0@26) and similar
+        || /\(([-\w]{7,}\@\d+)\)$/
+
+        # Seven or more digits at the end of a subject is almost certainly a id
+        || /\b(\d{7,})\s*$/
+
+        # stuff at end of line after "!" or "?" is usually an id
+        || /[!\?]\s*(\d{4,}|\w+(-\w+)+)\s*$/
+
+        # 9095IPZK7-095wsvp8715rJgY8-286-28 and similar
+	# excluding 'Re:', etc and the first word
+        || /(?:\w{2,3}:\s)?\w+\s+(\w{7,}-\w{7,}(-\w+)*)\s*$/
+
+        # #30D7 and similar
+        || /\s#\s*([a-f0-9]{4,})\s*$/
+     )
+  {
+    $id = $1;
+    # exempt online purchases
+    if ($id =~ /\d{5,}/
+	&& /(?:item|invoice|order|number|confirmation).{1,6}\Q$id\E\s*$/)
+    {
+      $id = 0;
+    }
+
+    # for the "foo-bar-baz" case, otherwise it won't
+    # be found in the dict:
+    $id =~ s/-//;
+  }
+
+  return ($id && !$self->word_is_in_dictionary($id));
+}
+
+# word_is_in_dictionary()
+#
+# See if the word looks like an English word, by checking if each triplet
+# of letters it contains is one that can be found in the English language.
+# Does not include triplets only found in proper names, or in the Latin
+# and Greek terms that might be found in a larger dictionary
+
+my %triplets = ();
+my $triplets_loaded = 0;
+
+sub word_is_in_dictionary {
+  my ($self, $word) = @_;
+  local ($_);
+  local $/ = "\n";		# Ensure $/ is set appropriately
+
+  # $word =~ tr/A-Z/a-z/;	# already done by this stage
+  $word =~ s/^\s+//;
+  $word =~ s/\s+$//;
+
+  # If it contains a digit, dash, etc, it's not a valid word.
+  # Don't reject words like "can't" and "I'll"
+  return 0 if ($word =~ /[^a-z\']/);
+
+  # handle a few common "blah blah blah (comment)" styles
+  return 1 if ($word eq "ot");	# off-topic
+  return 1 if ($word =~ /(?:linux|nix|bsd)/); # not in most dicts
+  return 1 if ($word =~ /(?:whew|phew|attn|tha?nx)/);  # not in most dicts
+
+  my $word_len = length($word);
+
+  # Unique IDs probably aren't going to be only one or two letters long
+  return 1 if ($word_len < 3);
+
+  if (!$triplets_loaded) {
+    # take a copy to avoid modifying the real one
+    my @default_triplets_path = @Mail::SpamAssassin::default_rules_path;
+    @default_triplets_path = map { s,$,/triplets.txt,; $_; }
+				    @default_triplets_path;
+    my $filename = $self->{main}->first_existing_path (@default_triplets_path);
+
+    if (!defined $filename) {
+      dbg("eval: failed to locate the triplets.txt file");
+      return 1;
+    }
+
+    if (!open (TRIPLETS, "<$filename")) {
+      dbg("eval: failed to open '$filename', cannot check dictionary");
+      return 1;
+    }
+
+    while(<TRIPLETS>) {
+      chomp;
+      $triplets{$_} = 1;
+    }
+    close(TRIPLETS);
+
+    $triplets_loaded = 1;
+  } # if (!$triplets_loaded)
+
+
+  my $i;
+
+  for ($i = 0; $i < ($word_len - 2); $i++) {
+    my $triplet = substr($word, $i, 3);
+    if (!$triplets{$triplet}) {
+      dbg("eval: unique ID: letter triplet '$triplet' from word '$word' not valid");
+      return 0;
+    }
+  } # for ($i = 0; $i < ($word_len - 2); $i++)
+
+  # All letter triplets in word were found to be valid
+  return 1;
+}
+
+# look for 8-bit and other illegal characters that should be MIME
+# encoded, these might want to exempt languages that do not use
+# Latin-based alphabets, but only if the user wants it that way
+sub check_illegal_chars {
+  my ($self, $pms, $header, $ratio, $count) = @_;
+
+  $header .= ":raw" unless ($header eq "ALL" || $header =~ /:raw$/);
+  my $str = $pms->get($header);
+  return 0 unless $str;
+
+  # avoid overlap between tests
+  if ($header eq "ALL") {
+    # fix continuation lines, then remove Subject and From
+    $str =~ s/\n[ \t]+/  /gs;
+    $str =~ s/^(?:Subject|From):.*$//gm;
+  }
+
+  # count illegal substrings (RFC 2045)
+  my $illegal = () = ($str =~ /[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]/g);
+
+  # minor exemptions for Subject
+  if ($header eq "Subject:raw") {
+    # only exempt a single cent sign, pound sign, or registered sign
+    my $exempt = () = ($str =~ /[\xa2\xa3\xae]/g);
+    $illegal-- if $exempt == 1;
+  }
+
+  return 0 if (length($str) == 0);
+  return (($illegal / length($str)) >= $ratio && $illegal >= $count);
+}
+
+# ezmlm has a very bad habit of removing Received: headers! bad ezmlm.
+#
+sub gated_through_received_hdr_remover {
+  my ($self, $pms) = @_;
+
+  my $txt = $pms->get("Mailing-List");
+  if (defined $txt && $txt =~ /^contact \S+\@\S+\; run by ezmlm$/) {
+    my $dlto = $pms->get("Delivered-To");
+    my $rcvd = $pms->get("Received");
+
+    # ensure we have other indicative headers too
+    if ($dlto =~ /^mailing list \S+\@\S+/ &&
+        $rcvd =~ /qmail \d+ invoked (?:from network|by .{3,20})\); \d+ ... \d+/)
+    {
+      return 1;
+    }
+  }
+
+  if ($pms->get("Received") !~ /\S/) {
+    # we have no Received headers!  These tests cannot run in that case
+    return 1;
+  }
+
+  # MSN groups removes Received lines. thanks MSN
+  if ($pms->get("Received") =~ /from groups\.msn\.com \(\S+\.msn\.com /) {
+    return 1;
+  }
+
+  return 0;
+}
+
+# FORGED_HOTMAIL_RCVD
+sub _check_for_forged_hotmail_received_headers {
+  my ($self, $pms) = @_;
+
+  if (defined $self->{hotmail_addr_but_no_hotmail_received}) { return; }
+
+  $self->{hotmail_addr_with_forged_hotmail_received} = 0;
+  $self->{hotmail_addr_but_no_hotmail_received} = 0;
+
+  my $rcvd = $pms->get('Received');
+  $rcvd =~ s/\s+/ /gs;		# just spaces, simplify the regexp
+
+  return if ($rcvd =~
+	/from mail pickup service by hotmail\.com with Microsoft SMTPSVC;/);
+
+  # Microsoft passes Hotmail mail directly to MSN Group servers.
+  return if $self->check_for_msn_groups_headers($pms);
+
+  my $ip = $pms->get('X-Originating-Ip');
+  my $IP_ADDRESS = IP_ADDRESS;
+
+  if ($ip =~ /$IP_ADDRESS/) { $ip = 1; } else { $ip = 0; }
+
+  # Hotmail formats its received headers like this:
+  # Received: from hotmail.com (f135.law8.hotmail.com [216.33.241.135])
+  # spammers do not ;)
+
+  if ($self->gated_through_received_hdr_remover($pms)) { return; }
+
+  if ($rcvd =~ /from \S*hotmail.com \(\S+\.hotmail(?:\.msn)?\.com[ \)]/ && $ip)
+                { return; }
+  if ($rcvd =~ /from \S+ by \S+\.hotmail(?:\.msn)?\.com with HTTP\;/ && $ip)
+                { return; }
+  if ($rcvd =~ /from \[66\.218.\S+\] by \S+\.yahoo\.com/ && $ip)
+                { return; }
+
+  if ($rcvd =~ /(?:from |HELO |helo=)\S*hotmail\.com\b/) {
+    # HELO'd as hotmail.com, despite not being hotmail
+    $self->{hotmail_addr_with_forged_hotmail_received} = 1;
+  } else {
+    # check to see if From claimed to be @hotmail.com
+    my $from = $pms->get('From:addr');
+    if ($from !~ /hotmail.com/) { return; }
+    $self->{hotmail_addr_but_no_hotmail_received} = 1;
+  }
+}
+
+# FORGED_HOTMAIL_RCVD
+sub check_for_forged_hotmail_received_headers {
+  my ($self, $pms) = @_;
+  $self->_check_for_forged_hotmail_received_headers($pms);
+  return $self->{hotmail_addr_with_forged_hotmail_received};
+}
+
+# SEMIFORGED_HOTMAIL_RCVD
+sub check_for_no_hotmail_received_headers {
+  my ($self, $pms) = @_;
+  $self->_check_for_forged_hotmail_received_headers($pms);
+  return $self->{hotmail_addr_but_no_hotmail_received};
+}
+
+# MSN_GROUPS
+sub check_for_msn_groups_headers {
+  my ($self, $pms) = @_;
+
+  return 0 unless ($pms->get('To') =~ /<(\S+)\@groups\.msn\.com>/i);
+  my $listname = $1;
+
+  # from Theo Van Dinter, see
+  # http://www.hughes-family.org/bugzilla/show_bug.cgi?id=591
+  # Updated by DOS, based on messages from Bob Menschel, bug 4301
+
+  return 0 unless $pms->get('Received') =~ /from mail pickup service by ((?:p\d\d\.)groups\.msn\.com)\b/;
+  my $server = $1;
+
+  if ($listname =~ /^notifications$/) {
+    return 0 unless $pms->get('Message-Id') =~ /^<\S+\@$server>/;
+  } else {
+    return 0 unless $pms->get('Message-Id') =~ /^<$listname-\S+\@groups\.msn\.com>/;
+    return 0 unless $pms->get('EnvelopeFrom') =~ /$listname-bounce\@groups\.msn\.com/;
+  }
+  return 1;
+
+# MSN Groups
+# Return-path: <Li...@groups.msn.com>
+# Received: from groups.msn.com (tk2dcpuba02.msn.com [65.54.195.210]) by
+#    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g72K35v10457 for
+#    <zz...@jmason.org>; Fri, 2 Aug 2002 21:03:05 +0100
+# Received: from mail pickup service by groups.msn.com with Microsoft
+#    SMTPSVC; Fri, 2 Aug 2002 13:01:30 -0700
+# Message-id: <Li...@groups.msn.com>
+# X-loop: notifications@groups.msn.com
+# Reply-to: "List Full Name" <Li...@groups.msn.com>
+# To: "List Full Name" <Li...@groups.msn.com>
+
+# Return-path: <Li...@groups.msn.com>
+# Received: from p04.groups.msn.com ([65.54.195.216]) etc...
+# Received: from mail pickup service by p04.groups.msn.com with Microsoft SMTPSVC;
+#          Thu, 5 May 2005 20:30:37 -0700
+# X-Originating-Ip: 207.68.170.30
+# From: =?iso-8859-1?B?IqSj4/D9pEbzeN9s9vLw6qQiIA==?=<zz...@hotmail.com>
+# To: "Managers of List Name" <no...@groups.msn.com>
+# Subject: =?iso-8859-1?Q?APPROVAL_NEEDED:_=A4=A3=E3=F0=FD=A4F=F3x=DFl?=
+#         =?iso-8859-1?Q?=F6=F2=F0=EA=A4_applied_to_join_List_Name=2C?=
+#         =?iso-8859-1?Q?_an_MSN_Group?=
+# Date: Thu, 5 May 2005 20:30:37 -0700
+# MIME-Version: 1.0
+# Content-Type: multipart/alternative;
+#         boundary="----=_NextPart_000_333944_01C551B1.4BBA02B0"
+# X-MimeOLE: Produced By Microsoft MimeOLE V5.50.4927.1200
+# Message-ID: <TK...@p04.groups.msn.com>
+
+# Return-path: <Li...@groups.msn.com>
+# Received: from [65.54.208.83] (helo=p05.groups.msn.com) etc...
+# Received: from mail pickup service by p05.groups.msn.com with Microsoft SMTPSVC;
+#          Fri, 6 May 2005 14:59:25 -0700
+# X-Originating-Ip: 207.68.170.30
+# Message-Id: <Li...@groups.msn.com>
+# Reply-To: "List Name" <Li...@groups.msn.com>
+# From: "whoever" <zz...@hotmail.com>
+# To: "List Name" <Li...@groups.msn.com>
+# Subject: whatever
+# Date: Fri, 6 May 2005 14:59:25 -0700
+
+}
+
+###########################################################################
+
+sub check_for_forged_eudoramail_received_headers {
+  my ($self, $pms) = @_;
+
+  my $from = $pms->get('From:addr');
+  if ($from !~ /eudoramail.com/) { return 0; }
+
+  my $rcvd = $pms->get('Received');
+  $rcvd =~ s/\s+/ /gs;		# just spaces, simplify the regexp
+
+  my $ip = $pms->get('X-Sender-Ip');
+  my $IP_ADDRESS = IP_ADDRESS;
+  if ($ip =~ /$IP_ADDRESS/) { $ip = 1; } else { $ip = 0; }
+
+  # Eudoramail formats its received headers like this:
+  # Received: from Unknown/Local ([?.?.?.?]) by shared1-mail.whowhere.com;
+  #      Thu Nov 29 13:44:25 2001
+  # Message-Id: <JG...@shared1-mail.whowhere.com>
+  # Organization: QUALCOMM Eudora Web-Mail  (http://www.eudoramail.com:80)
+  # X-Sender-Ip: 192.175.21.146
+  # X-Mailer: MailCity Service
+
+  if ($self->gated_through_received_hdr_remover($pms)) { return 0; }
+
+  if ($rcvd =~ /by \S*whowhere.com\;/ && $ip) { return 0; }
+  
+  return 1;
+}
+
+###########################################################################
+
+sub check_for_forged_yahoo_received_headers {
+  my ($self, $pms) = @_;
+
+  my $from = $pms->get('From:addr');
+  if ($from !~ /yahoo\.com$/) { return 0; }
+
+  my $rcvd = $pms->get('Received');
+  
+  if ($pms->get("Resent-From") && $pms->get("Resent-To")) {
+    my $xrcvd = $pms->get("X-Received");
+    $rcvd = $xrcvd if $xrcvd;
+  }
+  $rcvd =~ s/\s+/ /gs;		# just spaces, simplify the regexp
+
+  # not sure about this
+  #if ($rcvd !~ /from \S*yahoo\.com/) { return 0; }
+
+  if ($self->gated_through_received_hdr_remover($pms)) { return 0; }
+
+  # bug 3740: ignore bounces from Yahoo!.   only honoured if the
+  # correct rDNS shows up in the trusted relay list, or first untrusted relay
+  if ($from eq 'MAILER-DAEMON@yahoo.com' &&
+      ($pms->{relays_trusted_str} =~ / rdns=\S+\.yahoo\.com /
+        || $pms->{relays_untrusted_str} =~ /^[^\]]+ rdns=\S+\.yahoo\.com /))
+            { return 0; }
+
+  if ($rcvd =~ /by web\S+\.mail\S*\.yahoo\.com via HTTP/) { return 0; }
+  if ($rcvd =~ /by smtp\S+\.yahoo\.com with SMTP/) { return 0; }
+  my $IP_ADDRESS = IP_ADDRESS;
+  if ($rcvd =~
+      /from \[$IP_ADDRESS\] by \S+\.(?:groups|scd|dcn)\.yahoo\.com with NNFMP/) {
+    return 0;
+  }
+
+  # used in "forward this news item to a friend" links.  There's no better
+  # received hdrs to match on, unfortunately.  I'm not sure if the next test is
+  # still useful, as a result.
+  #
+  # search for msgid <20...@xent.com>, subject "Yahoo!
+  # News Story - Top Stories", date Sep 29 2002 on
+  # <http://xent.com/pipermail/fork/> for an example.
+  #
+  if ($rcvd =~ /\bmailer\d+\.bulk\.scd\.yahoo\.com\b/
+                && $from =~ /\@reply\.yahoo\.com$/) { return 0; }
+
+  if ($rcvd =~ /by \w+\.\w+\.yahoo\.com \(\d+\.\d+\.\d+\/\d+\.\d+\.\d+\)(?: with ESMTP)? id \w+/) {
+      # possibly sent from "mail this story to a friend"
+      return 0;
+  }
+
+  return 1;
+}
+
+sub check_for_forged_juno_received_headers {
+  my ($self, $pms) = @_;
+
+  my $from = $pms->get('From:addr');
+  if($from !~ /\bjuno.com/) { return 0; }
+
+  if($self->gated_through_received_hdr_remover($pms)) { return 0; }
+
+  my $xmailer = $pms->get('X-Mailer');
+  my $xorig = $pms->get('X-Originating-IP');
+  my $rcvd = $pms->get('Received');
+  my $IP_ADDRESS = IP_ADDRESS;
+
+  if (!$xorig) {  # New style Juno has no X-Originating-IP header, and other changes
+    if($rcvd !~ /from.*\b(?:juno|untd)\.com.*[\[\(]$IP_ADDRESS[\]\)].*by/
+        && $rcvd !~ / cookie\.(?:juno|untd)\.com /) { return 1; }
+    if($xmailer !~ /Juno /) { return 1; }
+  } else {
+    if($rcvd !~ /from.*\bmail\.com.*\[$IP_ADDRESS\].*by/) { return 1; }
+    if($xorig !~ /$IP_ADDRESS/) { return 1; }
+    if($xmailer !~ /\bmail\.com/) { return 1; }
+  }
+
+  return 0;   
+}
+
+# From and To have same address, but are not exactly the same and
+# neither contains intermediate spaces.
+sub check_for_from_to_same {
+  my ($self, $pms) = @_;
+
+  my $hdr_from = $pms->get('From');
+  my $hdr_to = $pms->get('To');
+  return 0 if (!length($hdr_from) || !length($hdr_to) ||
+	       $hdr_from eq $hdr_to);
+
+  my $addr_from = $pms->get('From:addr');
+  my $addr_to = $pms->get('To:addr');
+  # BUG: From:addr and To:addr sometimes contain whitespace
+  $addr_from =~ s/\s+//g;
+  $addr_to =~ s/\s+//g;
+  return 0 if (!length($addr_from) || !length($addr_to) ||
+	       $addr_from ne $addr_to);
+
+  if ($hdr_from =~ /^\s*\S+\s*$/ && $hdr_to =~ /^\s*\S+\s*$/) {
+    return 1;
+  }
+}
+
+sub check_for_matching_env_and_hdr_from {
+  my ($self, $pms) =@_;
+  # two blank headers match so don't bother checking
+  return (lc $pms->get('EnvelopeFrom:addr') eq lc $pms->get('From:addr'));
+}
+
+sub sorted_recipients {
+  my ($self, $pms) = @_;
+
+  if (!exists $self->{tocc_sorted}) {
+    $self->_check_recipients($pms);
+  }
+  return $self->{tocc_sorted};
+}
+
+sub similar_recipients {
+  my ($self, $pms, $min, $max) = @_;
+
+  if (!exists $self->{tocc_similar}) {
+    $self->_check_recipients($pms);
+  }
+  return (($min eq 'undef' || $self->{tocc_similar} >= $min) &&
+	  ($max eq 'undef' || $self->{tocc_similar} < $max));
+}
+
+# best experimentally derived values
+use constant TOCC_SORTED_COUNT => 7;
+use constant TOCC_SIMILAR_COUNT => 5;
+use constant TOCC_SIMILAR_LENGTH => 2;
+
+sub _check_recipients {
+  my ($self, $pms) = @_;
+
+  my @inputs;
+
+  # ToCc: pseudo-header works best, but sometimes Bcc: is better
+  for ('ToCc', 'Bcc') {
+    my $to = $pms->get($_);	# get recipients
+    $to =~ s/\(.*?\)//g;	# strip out the (comments)
+    push(@inputs, ($to =~ m/([\w.=-]+\@\w+(?:[\w.-]+\.)+\w+)/g));
+    last if scalar(@inputs) >= TOCC_SIMILAR_COUNT;
+  }
+
+  # remove duplicate addresses only when they appear next to each other
+  my @address;
+  my $previous = '';
+  while (my $current = shift @inputs) {
+    push(@address, ($previous = $current)) if lc($current) ne lc($previous);
+    last if @address == 256;
+  }
+
+  # ideas that had both poor S/O ratios and poor hit rates:
+  # - testing for reverse sorted recipient lists
+  # - testing To: and Cc: headers separately
+  $self->{tocc_sorted} = (scalar(@address) >= TOCC_SORTED_COUNT &&
+			  join(',', @address) eq (join(',', sort @address)));
+
+  # a good S/O ratio and hit rate is achieved by comparing 2-byte
+  # substrings and requiring 5 or more addresses
+  $self->{tocc_similar} = 0;
+  if (scalar (@address) >= TOCC_SIMILAR_COUNT) {
+    my @user = map { substr($_,0,TOCC_SIMILAR_LENGTH) } @address;
+    my @fqhn = map { m/\@(.*)/ } @address;
+    my @host = map { substr($_,0,TOCC_SIMILAR_LENGTH) } @fqhn;
+    my $hits = 0;
+    my $combinations = 0;
+    for (my $i = 0; $i <= $#address; $i++) {
+      for (my $j = $i+1; $j <= $#address; $j++) {
+	$hits++ if $user[$i] eq $user[$j];
+	$hits++ if $host[$i] eq $host[$j] && $fqhn[$i] ne $fqhn[$j];
+	$combinations++;
+      }
+    }
+    $self->{tocc_similar} = $hits / $combinations;
+  }
+}
+
+sub check_for_missing_to_header {
+  my ($self, $pms) = @_;
+
+  my $hdr = $pms->get('To');
+  $hdr ||= $pms->get('Apparently-To');
+  return 1 if ($hdr eq '');
+
+  return 0;
+}
+
+sub check_for_forged_gw05_received_headers {
+  my ($self, $pms) = @_;
+  local ($_);
+
+  my $rcv = $pms->get('Received');
+
+  # e.g.
+  # Received: from mail3.icytundra.com by gw05 with ESMTP; Thu, 21 Jun 2001 02:28:32 -0400
+  my ($h1, $h2) = ($rcv =~ 
+  	m/\nfrom\s(\S+)\sby\s(\S+)\swith\sESMTP\;\s+\S\S\S,\s+\d+\s+\S\S\S\s+
+			\d{4}\s+\d\d:\d\d:\d\d\s+[-+]*\d{4}\n$/xs);
+
+  if (defined ($h1) && defined ($h2) && $h2 !~ /\./) {
+    return 1;
+  }
+
+  0;
+}
+
+sub _check_for_round_the_world_received {
+  my ($self, $pms) = @_;
+  my ($relayer, $relayerip, $relay);
+
+  $self->{round_the_world_revdns} = 0;
+  $self->{round_the_world_helo} = 0;
+  my $rcvd = $pms->get('Received');
+  my $IPV4_ADDRESS = IPV4_ADDRESS;
+
+  # TODO: use new Received header parser
+
+  # trad sendmail/postfix fmt:
+  # Received: from hitower.parkgroup.ru (unknown [212.107.207.26]) by
+  #     mail.netnoteinc.com (Postfix) with ESMTP id B8CAC11410E for
+  #     <me...@netnoteinc.com>; Fri, 30 Nov 2001 02:42:05 +0000 (Eire)
+  # Received: from fmx1.freemail.hu ([212.46.197.200]) by hitower.parkgroup.ru
+  #     (Lotus Domino Release 5.0.8) with ESMTP id 2001113008574773:260 ;
+  #     Fri, 30 Nov 2001 08:57:47 +1000
+  if ($rcvd =~ /
+  	\nfrom\b.{0,20}\s(\S+\.${CCTLDS_WITH_LOTS_OF_OPEN_RELAYS})\s\(.{0,200}
+  	\nfrom\b.{0,20}\s([-_A-Za-z0-9.]+)\s.{0,30}\[($IPV4_ADDRESS)\]
+  /osix) { $relay = $1; $relayer = $2; $relayerip = $3; goto gotone; }
+
+  return 0;
+
+gotone:
+  my $revdns = $pms->lookup_ptr ($relayerip);
+  if (!defined $revdns) { $revdns = '(unknown)'; }
+
+  dbg("eval: round-the-world: mail relayed through $relay by ".	
+  	"$relayerip (HELO $relayer, rev DNS says $revdns)");
+
+  if ($revdns =~ /\.${ROUND_THE_WORLD_RELAYERS}$/oi) {
+    dbg("eval: round-the-world: yep, I think so (from rev dns)");
+    $self->{round_the_world_revdns} = 1;
+    return;
+  }
+
+  if ($relayer =~ /\.${ROUND_THE_WORLD_RELAYERS}$/oi) {
+    dbg("eval: round-the-world: yep, I think so (from HELO)");
+    $self->{round_the_world_helo} = 1;
+    return;
+  }
+
+  dbg("eval: round-the-world: probably not");
+  return;
+}
+
+sub check_for_round_the_world_received_helo {
+  my ($self, $pms) = @_;
+  if (!defined $self->{round_the_world_helo}) {
+    $self->_check_for_round_the_world_received($pms);
+  }
+  if ($self->{round_the_world_helo}) { return 1; }
+  return 0;
+}
+
+sub check_for_round_the_world_received_revdns {
+  my ($self, $pms) = @_;
+  if (!defined $self->{round_the_world_revdns}) {
+    $self->_check_for_round_the_world_received($pms);
+  }
+  if ($self->{round_the_world_revdns}) { return 1; }
+  return 0;
+}
+
+###########################################################################
+
+sub check_for_shifted_date {
+  my ($self, $pms, $min, $max) = @_;
+
+  if (!exists $self->{date_diff}) {
+    $self->_check_date_diff($pms);
+  }
+  return (($min eq 'undef' || $self->{date_diff} >= (3600 * $min)) &&
+	  ($max eq 'undef' || $self->{date_diff} < (3600 * $max)));
+}
+
+# filters out some false positives in old corpus mail - Allen
+sub received_within_months {
+  my ($self,$pms,$min,$max) = @_;
+
+  if (!exists($self->{date_received})) {
+    $self->_check_date_received($pms);
+  }
+  my $diff = time() - $self->{date_received};
+
+  # 365.2425 * 24 * 60 * 60 = 31556952 = seconds in year (including leap)
+
+  if (((! defined($min)) || ($min eq 'undef') ||
+       ($diff >= (31556952 * ($min/12)))) &&
+      ((! defined($max)) || ($max eq 'undef') ||
+       ($diff < (31556952 * ($max/12))))) {
+    return 1;
+  } else {
+    return 0;
+  }
+}
+
+sub _get_date_header_time {
+  my ($self, $pms) = @_;
+
+  my $time;
+  # a Resent-Date: header takes precedence over any Date: header
+  for my $header ('Resent-Date', 'Date') {
+    my $date = $pms->get($header);
+    if (defined($date) && length($date)) {
+      chomp($date);
+      $time = Mail::SpamAssassin::Util::parse_rfc822_date($date);
+    }
+    last if defined($time);
+  }
+  if (defined($time)) {
+    $self->{date_header_time} = $time;
+  }
+  else {
+    $self->{date_header_time} = undef;
+  }
+}
+
+sub _get_received_header_times {
+  my ($self, $pms) = @_;
+
+  $self->{received_header_times} = [ () ];
+  $self->{received_fetchmail_time} = undef;
+
+  my (@received);
+  my $received = $pms->get('Received');
+  if (defined($received) && length($received)) {
+    @received = grep {$_ =~ m/\S/} (split(/\n/,$received));
+  }
+  # if we have no Received: headers, chances are we're archived mail
+  # with a limited set of headers
+  if (!scalar(@received)) {
+    return;
+  }
+
+  # handle fetchmail headers
+  my (@local);
+  if (($received[0] =~
+      m/\bfrom (?:localhost\s|(?:\S+ ){1,2}\S*\b127\.0\.0\.1\b)/) ||
+      ($received[0] =~ m/qmail \d+ invoked by uid \d+/)) {
+    push @local, (shift @received);
+  }
+  if (scalar(@received) &&
+      ($received[0] =~ m/\bby localhost with \w+ \(fetchmail-[\d.]+/)) {
+    push @local, (shift @received);
+  }
+  elsif (scalar(@local)) {
+    unshift @received, (shift @local);
+  }
+
+  my $rcvd;
+
+  if (scalar(@local)) {
+    my (@fetchmail_times);
+    foreach $rcvd (@local) {
+      if ($rcvd =~ m/(\s.?\d+ \S\S\S \d+ \d+:\d+:\d+ \S+)/) {
+	my $date = $1;
+	dbg("eval: trying Received fetchmail header date for real time: $date");
+	my $time = Mail::SpamAssassin::Util::parse_rfc822_date($date);
+	if (defined($time) && (time() >= $time)) {
+	  dbg("eval: time_t from date=$time, rcvd=$date");
+	  push @fetchmail_times, $time;
+	}
+      }
+    }
+    if (scalar(@fetchmail_times) > 1) {
+      $self->{received_fetchmail_time} =
+       (sort {$b <=> $a} (@fetchmail_times))[0];
+    } elsif (scalar(@fetchmail_times)) {
+      $self->{received_fetchmail_time} = $fetchmail_times[0];
+    }
+  }
+
+  my (@header_times);
+  foreach $rcvd (@received) {
+    if ($rcvd =~ m/(\s.?\d+ \S\S\S \d+ \d+:\d+:\d+ \S+)/) {
+      my $date = $1;
+      dbg("eval: trying Received header date for real time: $date");
+      my $time = Mail::SpamAssassin::Util::parse_rfc822_date($date);
+      if (defined($time)) {
+	dbg("eval: time_t from date=$time, rcvd=$date");
+	push @header_times, $time;
+      }
+    }
+  }
+
+  if (scalar(@header_times)) {
+    $self->{received_header_times} = [ @header_times ];
+  } else {
+    dbg("eval: no dates found in Received headers");
+  }
+}
+
+sub _check_date_received {
+  my ($self, $pms) = @_;
+
+  my (@dates_poss);
+
+  $self->{date_received} = 0;
+
+  if (!exists($self->{date_header_time})) {
+    $self->_get_date_header_time($pms);
+  }
+
+  if (defined($self->{date_header_time})) {
+    push @dates_poss, $self->{date_header_time};
+  }
+
+  if (!exists($self->{received_header_times})) {
+    $self->_get_received_header_times($pms);
+  }
+  my (@received_header_times) = @{ $self->{received_header_times} };
+  if (scalar(@received_header_times)) {
+    push @dates_poss, $received_header_times[0];
+  }
+  if (defined($self->{received_fetchmail_time})) {
+    push @dates_poss, $self->{received_fetchmail_time};
+  }
+
+  if (defined($self->{date_header_time}) && scalar(@received_header_times)) {
+    if (!exists($self->{date_diff})) {
+      $self->_check_date_diff($pms);
+    }
+    push @dates_poss, $self->{date_header_time} - $self->{date_diff};
+  }
+
+  if (scalar(@dates_poss)) {	# use median
+    $self->{date_received} = (sort {$b <=> $a}
+			      (@dates_poss))[int($#dates_poss/2)];
+    dbg("eval: date chosen from message: " .
+	scalar(localtime($self->{date_received})));
+  } else {
+    dbg("eval: no dates found in message");
+  }
+}
+
+sub _check_date_diff {
+  my ($self, $pms) = @_;
+
+  $self->{date_diff} = 0;
+
+  if (!exists($self->{date_header_time})) {
+    $self->_get_date_header_time($pms);
+  }
+
+  if (!defined($self->{date_header_time})) {
+    return;			# already have tests for this
+  }
+
+  if (!exists($self->{received_header_times})) {
+    $self->_get_received_header_times($pms);
+  }
+  my (@header_times) = @{ $self->{received_header_times} };
+
+  if (!scalar(@header_times)) {
+    return;			# archived mail?
+  }
+
+  my (@diffs) = map {$self->{date_header_time} - $_} (@header_times);
+
+  # if the last Received: header has no difference, then we choose to
+  # exclude it
+  if ($#diffs > 0 && $diffs[$#diffs] == 0) {
+    pop(@diffs);
+  }
+
+  # use the date with the smallest absolute difference
+  # (experimentally, this results in the fewest false positives)
+  @diffs = sort { abs($a) <=> abs($b) } @diffs;
+  $self->{date_diff} = $diffs[0];
+}
+
+
+sub subject_is_all_caps {
+   my ($self, $pms) = @_;
+   my $subject = $pms->get('Subject');
+
+   $subject =~ s/^\s+//;
+   $subject =~ s/\s+$//;
+   return 0 if $subject !~ /\s/;	# don't match one word subjects
+   return 0 if (length $subject < 10);  # don't match short subjects
+   $subject =~ s/[^a-zA-Z]//g;		# only look at letters
+
+   # now, check to see if the subject is encoded using a non-ASCII charset.
+   # If so, punt on this test to avoid FPs.  We just list the known charsets
+   # this test will FP on, here.
+   my $subjraw = $pms->get('Subject:raw');
+   if ($subjraw =~ /=\?${Mail::SpamAssassin::Constants::CHARSETS_LIKELY_TO_FP_AS_CAPS}\?/i) {
+     return 0;
+   }
+
+   return length($subject) && ($subject eq uc($subject));
+}
+
+sub check_for_to_in_subject {
+  my ($self, $pms, $test) = @_;
+
+  my $full_to = $pms->get('To:addr');
+  return 0 unless $full_to;
+
+  my $subject = $pms->get('Subject');
+
+  if ($test eq "address") {
+    return $subject =~ /\b\Q$full_to\E\b/i;	# "user@domain.com"
+  }
+  elsif ($test eq "user") {
+    my $to = $full_to;
+    $to =~ s/\@.*//;
+    return $subject =~ /^\s*\Q$to\E,\s/i;	# "user,\s" case insensitive
+  }
+  return 0;
+}
+
+sub check_outlook_message_id {
+  my ($self, $pms) = @_;
+  local ($_);
+
+  my $id = $pms->get('MESSAGEID');
+  return 0 if $id !~ /^<[0-9a-f]{4}([0-9a-f]{8})\$[0-9a-f]{8}\$[0-9a-f]{8}\@/;
+
+  my $timetoken = hex($1);
+  my $x = 0.0023283064365387;
+  my $y = 27111902.8329849;
+
+  my $fudge = 250;
+
+  $_ = $pms->get('Date');
+  $_ = Mail::SpamAssassin::Util::parse_rfc822_date($_) || 0;
+  my $expected = int (($_ * $x) + $y);
+  my $diff = $timetoken - $expected;
+  return 0 if (abs($diff) < $fudge);
+
+  $_ = $self->get('Received');
+  /(\s.?\d+ \S\S\S \d+ \d+:\d+:\d+ \S+).*?$/;
+  $_ = Mail::SpamAssassin::Util::parse_rfc822_date($_) || 0;
+  $expected = int(($_ * $x) + $y);
+  $diff = $timetoken - $expected;
+
+  return (abs($diff) >= $fudge);
+}
+
+sub check_messageid_not_usable {
+  my ($self, $pms) = @_;
+  local ($_);
+
+  # Lyris eats message-ids.  also some ezmlm, I think :(
+  $_ = $pms->get("List-Unsubscribe");
+  return 1 if (/<mailto:(?:leave-\S+|\S+-unsubscribe)\@\S+>$/);
+
+  # ezmlm again
+  if($self->gated_through_received_hdr_remover($pms)) { return 1; }
+
+  # Allen notes this as 'Wacky sendmail version?'
+  $_ = $pms->get("Received");
+  return 1 if /\/CWT\/DCE\)/;
+
+  # Apr  2 2003 jm: iPlanet rewrites lots of stuff, including Message-IDs
+  return 1 if /iPlanet Messaging Server/;
+
+  # too old; older versions of clients used different formats
+  return 1 if ($self->received_within_months('6','undef'));
+
+  return 0;
+}
+
+# Return true if the count of $hdr headers are within the given range
+sub check_header_count_range {
+  my ($self, $pms, $hdr, $min, $max) = @_;
+  my %uniq = ();
+  my @hdrs = grep(!$uniq{$_}++, $pms->{msg}->get_header ($hdr));
+  return (scalar @hdrs >= $min && scalar @hdrs <= $max);
+}
+
+sub check_unresolved_template {
+  my ($self, $pms) = @_;
+
+  my $all = $pms->get('ALL');	# cached access
+  $all =~ s/\n[ \t]+/ /gs;	# fix continuation lines
+  
+  for my $header (split(/\n/, $all)) {
+    # slightly faster to test in this order
+    if ($header =~ /%[A-Z][A-Z_-]/ &&
+	$header !~ /^(?:X-UIDL|X-Face|To|Cc|From|Subject|References|In-Reply-To|(?:X-|Resent-|X-Original-)?Message-Id):/i)
+    {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+sub check_ratware_name_id {
+  my ($self, $pms) = @_;
+
+  my $mid = $pms->get('MESSAGEID');
+  my $from = $pms->get('From');
+  if ($mid =~ m/<[A-Z]{28}\.([^>]+?)>/) {
+     if ($from =~ m/\"[^\"]+\"\s*<\Q$1\E>/) {
+       return 1;
+     }
+  }
+  return 0;
+}
+
+sub check_ratware_envelope_from {
+  my ($self, $pms) = @_;
+
+  my $to = $pms->get('To:addr');
+  my $from = $pms->get('EnvelopeFrom');
+
+  return 0 unless ($to && $from);
+  return 0 if ($from =~ /^SRS\d=/);
+
+  if ($to =~ /^([^@]+)@(.+)$/) {
+    my($user,$dom) = ($1,$2);
+    $dom = Mail::SpamAssassin::Util::RegistrarBoundaries::trim_domain($dom);
+    return unless
+        (Mail::SpamAssassin::Util::RegistrarBoundaries::is_domain_valid($dom));
+
+    return 1 if ($from =~ /\b\Q$dom\E.\Q$user\E@/i);
+  }
+
+  return 0;
+}
+
+1;