You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2004/09/27 21:15:08 UTC

svn commit: rev 47325 - spamassassin/trunk/lib/Mail/SpamAssassin/Util

Author: felicity
Date: Mon Sep 27 12:15:06 2004
New Revision: 47325

Modified:
   spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm
Log:
bug 3831: RegistrarBoundaries was mismatching a number of hostnames because the RE was too loose (www.3com.com matched "com.co" and was left as www.3com.com). Also added a performance boost by not trying RE matches when it's known they can't possibly match (fourth-level TLDs require at least 4 parts in the domain, etc.). Also moved the third-level TLD RE so that it sits in order between the fourth-level and second-level ones.

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm	(original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm	Mon Sep 27 12:15:06 2004
@@ -65,15 +65,6 @@
   vt|wa|wi|wv|wy )
 }ix;
 
-# updated: 2004-04-30: first rev
-#
-$THREE_LEVEL_DOMAINS = qr( (?:
-  demon\.co\.uk |
-
-  # http://www.neustar.us/policies/docs/rfc_1480.txt
-  [^\.]+\.${US_STATES}\.us )
-)ix;
-
 $FOUR_LEVEL_DOMAINS = qr( (?:
   # http://www.neustar.us/policies/docs/rfc_1480.txt
   # "Fire-Dept.CI.Los-Angeles.CA.US"
@@ -84,7 +75,13 @@
 )
 )ix;
 
-# updated: 2004-04-30: first rev
+$THREE_LEVEL_DOMAINS = qr( (?:
+  demon\.co\.uk |
+
+  # http://www.neustar.us/policies/docs/rfc_1480.txt
+  [^\.]+\.${US_STATES}\.us )
+)ix;
+
 $TWO_LEVEL_DOMAINS = qr{ (?:
 
   # http://www.neustar.us/policies/docs/rfc_1480.txt
@@ -615,12 +612,10 @@
 
 sub split_domain {
   my ($domain) = @_;
-
-  # turn "host.dom.ain" into "dom.ain".
   my $hostname = '';
 
   if ($domain) {
-    my $partsreqd;
+    my $partsreqd = 2;	# default to domain.tld
 
     # www..spamassassin.org -> www.spamassassin.org
     $domain =~ tr/././s;
@@ -629,17 +624,19 @@
     $domain =~ s/^\.+//;
     $domain =~ s/\.+$//;
 
-    if ($domain =~ /${FOUR_LEVEL_DOMAINS}/io)     # Fire-Dept.CI.Los-Angeles.CA.US
+    # Split scalar domain into components
+    my @domparts = split (/\./, $domain);
+
+    # Look for a lower level TLD
+    # use $#domparts to skip trying to match on TLDs that can't possibly
+    # match, but keep in mind that the hostname can be blank.
+    #
+    if ($#domparts >= 4 && $domain =~ /(?:\.|^)${FOUR_LEVEL_DOMAINS}$/io)     # Fire-Dept.CI.Los-Angeles.CA.US
     { $partsreqd = 5; }
-    elsif ($domain =~ /${THREE_LEVEL_DOMAINS}/io) # demon.co.uk
+    elsif ($#domparts >= 3 && $domain =~ /(?:\.|^)${THREE_LEVEL_DOMAINS}$/io) # demon.co.uk
     { $partsreqd = 4; }
-    elsif ($domain =~ /${TWO_LEVEL_DOMAINS}/io)   # co.uk
+    elsif ($#domparts >= 2 && $domain =~ /(?:\.|^)${TWO_LEVEL_DOMAINS}$/io)   # co.uk
     { $partsreqd = 3; }
-    else                                          # com
-    { $partsreqd = 2; }
-
-    # drop any hostname parts, if we can.
-    my @domparts = split (/\./, $domain);
 
     if (@domparts >= $partsreqd) {
       # reset the domain to the last $partsreqd parts