You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2004/09/27 21:15:08 UTC
svn commit: rev 47325 - spamassassin/trunk/lib/Mail/SpamAssassin/Util
Author: felicity
Date: Mon Sep 27 12:15:06 2004
New Revision: 47325
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm
Log:
bug 3831: RegistrarBoundaries was mishitting on a number of hostnames because the RE was too loose (www.3com.com matched com.co and was left as www.3com.com). Also added a performance boost by not trying RE matches when it's known they can't possibly match (fourth level TLDs require at least 4 parts in the domain, etc.). Also moved the third_level TLD RE into order between fourth and second.
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm Mon Sep 27 12:15:06 2004
@@ -65,15 +65,6 @@
vt|wa|wi|wv|wy )
}ix;
-# updated: 2004-04-30: first rev
-#
-$THREE_LEVEL_DOMAINS = qr( (?:
- demon\.co\.uk |
-
- # http://www.neustar.us/policies/docs/rfc_1480.txt
- [^\.]+\.${US_STATES}\.us )
-)ix;
-
$FOUR_LEVEL_DOMAINS = qr( (?:
# http://www.neustar.us/policies/docs/rfc_1480.txt
# "Fire-Dept.CI.Los-Angeles.CA.US"
@@ -84,7 +75,13 @@
)
)ix;
-# updated: 2004-04-30: first rev
+$THREE_LEVEL_DOMAINS = qr( (?:
+ demon\.co\.uk |
+
+ # http://www.neustar.us/policies/docs/rfc_1480.txt
+ [^\.]+\.${US_STATES}\.us )
+)ix;
+
$TWO_LEVEL_DOMAINS = qr{ (?:
# http://www.neustar.us/policies/docs/rfc_1480.txt
@@ -615,12 +612,10 @@
sub split_domain {
my ($domain) = @_;
-
- # turn "host.dom.ain" into "dom.ain".
my $hostname = '';
if ($domain) {
- my $partsreqd;
+ my $partsreqd = 2; # default to domain.tld
# www..spamassassin.org -> www.spamassassin.org
$domain =~ tr/././s;
@@ -629,17 +624,19 @@
$domain =~ s/^\.+//;
$domain =~ s/\.+$//;
- if ($domain =~ /${FOUR_LEVEL_DOMAINS}/io) # Fire-Dept.CI.Los-Angeles.CA.US
+ # Split scalar domain into components
+ my @domparts = split (/\./, $domain);
+
+ # Look for a lower level TLD
+ # use $#domparts to skip trying to match on TLDs that can't possibly
+ # match, but keep in mind that the hostname can be blank.
+ #
+ if ($#domparts >= 4 && $domain =~ /(?:\.|^)${FOUR_LEVEL_DOMAINS}$/io) # Fire-Dept.CI.Los-Angeles.CA.US
{ $partsreqd = 5; }
- elsif ($domain =~ /${THREE_LEVEL_DOMAINS}/io) # demon.co.uk
+ elsif ($#domparts >= 3 && $domain =~ /(?:\.|^)${THREE_LEVEL_DOMAINS}$/io) # demon.co.uk
{ $partsreqd = 4; }
- elsif ($domain =~ /${TWO_LEVEL_DOMAINS}/io) # co.uk
+ elsif ($#domparts >= 2 && $domain =~ /(?:\.|^)${TWO_LEVEL_DOMAINS}$/io) # co.uk
{ $partsreqd = 3; }
- else # com
- { $partsreqd = 2; }
-
- # drop any hostname parts, if we can.
- my @domparts = split (/\./, $domain);
if (@domparts >= $partsreqd) {
# reset the domain to the last $partsreqd parts