You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2011/05/01 19:43:46 UTC

svn commit: r1098376 - in /spamassassin/trunk/lib/Mail/SpamAssassin: PerMsgStatus.pm Plugin/FreeMail.pm Util/RegistrarBoundaries.pm

Author: hege
Date: Sun May  1 17:43:45 2011
New Revision: 1098376

URL: http://svn.apache.org/viewvc?rev=1098376&view=rev
Log:
Bug #6578: Move TLD regexp to RegistrarBoundaries and make FreeMail use it

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/FreeMail.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=1098376&r1=1098375&r2=1098376&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Sun May  1 17:43:45 2011
@@ -1833,20 +1833,8 @@ my $nonASCII    = '\x80-\xff';
 my $tbirdenddelimemail = $tbirdenddelim . '(\'' . $nonASCII;  # tbird ignores non-ASCII mail addresses for now, until RFC changes
 my $tbirdenddelimplusat = $tbirdenddelimemail . '@';
 
-# regexps for finding plain text non-scheme hostnames with valid TLDs.
-
-# the list from %VALID_TLDS in Util/RegistrarBoundaries.pm, as a
-# Regexp::List optimized regexp ;)  accurate as of 2010-04-15
-my $tldsRE = qr/
-  (?=[a-wyz])
-  (?:a(?:e(?:ro)?|r(?:pa)?|s(?:ia)?|[cdfgilmnoqtuwxz])|b(?:iz?|[abdefghjmnorstwyz])
-    |c(?:at?|o(?:m|op)?|[cdfghiklmnruvxyz])|d[ejkmoz]|e(?:[cegrst]|d?u)|f[ijkmor]
-    |g(?:[adefghilmnpqrstuwy]|ov)|h[kmnrtu]|i(?:n(?:fo|t)?|[delmoqrst])
-    |j(?:o(?:bs)?|[emp])|k[eghimnprwyz]|l[abcikrstuvy]
-    |m(?:o(?:bi)?|u(?:seum)?|[acdeghkmnpqrstvwxyz]|i?l)|n(?:a(?:me)?|et?|[cfgilopruz])
-    |o(?:m|rg)|p(?:ro?|[aefghklnstwy])|r[eosuw]|s[abcdeghiklmnrtuvyz]
-    |t(?:r(?:avel)?|[cdfghjkmnoptvwz]|e?l)|u[agksyz]|v[aceginu]|w[fs]|ye|z[amw]|qa
-  )/ix;
+# valid TLDs
+my $tldsRE = $Mail::SpamAssassin::Util::RegistrarBoundaries::VALID_TLDS_RE;
 
 # knownscheme regexp looks for either a https?: or ftp: scheme, or www\d*\. or ftp\. prefix, i.e., likely to start a URL
 # schemeless regexp looks for a valid TLD at the end of what may be a FQDN, followed by optional ., optional :portnum, optional /rest_of_uri

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/FreeMail.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/FreeMail.pm?rev=1098376&r1=1098375&r2=1098376&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/FreeMail.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/FreeMail.pm Sun May  1 17:43:45 2011
@@ -107,12 +107,13 @@ my $VERSION = 2.001;
 
 use Mail::SpamAssassin::Plugin;
 use Mail::SpamAssassin::PerMsgStatus;
+use Mail::SpamAssassin::Util::RegistrarBoundaries;
 use strict;
 use vars qw(@ISA);
 @ISA = qw(Mail::SpamAssassin::Plugin);
 
-# TLDs generated from RegistrarBoundaries.pm VALID_TLDS 2008020601
-my $tlds = '(?:m(?:[acdeghkmnpqrstvwxyz]|u(?:seum)?|o(?:bi)?|i?l)|a(?:[cdfgilmnoqtuwxz]|e(?:ro)?|r(?:pa)?|s(?:ia)?)|c(?:[cdfghiklmnruvxyz]|o(?:op|m)?|at?)|t(?:[cdfghjkmnoptvwz]|r(?:avel)?|e?l)|n(?:[cfgilopruz]|a(?:me)?|et?)|b(?:[abdefghjmnorstwyz]|iz?)|g(?:[adefghilmnpqrstuwy]|ov)|i(?:[delmoqrst]|n(?:fo|t)?)|p(?:[aefghklnstwy]|ro?)|s[abcdeghiklmnrtuvyz]|j(?:[emp]|o(?:bs)?)|e(?:[cegrst]|d?u)|k[eghimnprwyz]|l[abcikrstuvy]|v[aceginu]|d[ejkmoz]|f[ijkmor]|h[kmnrtu]|o(?:rg|m)|u[agksyz]|r[eosuw]|z[amw]|w[fs]|y[eu]|qa)';
+# List of TLDs from RegistrarBoundaries.pm
+my $tlds = $Mail::SpamAssassin::Util::RegistrarBoundaries::VALID_TLDS_RE;
 
 ### Some regexp tips courtesy of http://www.regular-expressions.info/email.html
 ### v 0.02

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm?rev=1098376&r1=1098375&r2=1098376&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm Sun May  1 17:43:45 2011
@@ -26,7 +26,7 @@ use bytes;
 use re 'taint';
 
 use vars qw (
-  @ISA %TWO_LEVEL_DOMAINS %THREE_LEVEL_DOMAINS %US_STATES %VALID_TLDS
+  @ISA %TWO_LEVEL_DOMAINS %THREE_LEVEL_DOMAINS %US_STATES %VALID_TLDS $VALID_TLDS_RE
 );
 
 # The list of currently-valid TLDs for the DNS system.
@@ -37,6 +37,8 @@ use vars qw (
 # inactive, as can be seen in the Wikipedia articles about them
 # as of 2008-02-08, e.g. http://en.wikipedia.org/wiki/.so_%28domain_name%29
 #     bv gb pm sj so um yt
+#
+# Remember to also change regexp below when updating!
 
 foreach (qw/
   ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
@@ -56,6 +58,23 @@ foreach (qw/
   $VALID_TLDS{$_} = 1;
 }
 
+# %VALID_TLDS as Regexp::List optimized regexp, for use in Plugins etc
+# Paste above list to:
+#  perl -MRegexp::List -e '$/=undef; $_=<>; $r = Regexp::List->new; push @l, $_ for (split); print $r->list2re(@l)'
+# Verified up to date 20110501
+$VALID_TLDS_RE = qr/
+  (?=[abcdefghijklmnopqrstuvwyz])
+  (?:a(?:e(?:ro)?|r(?:pa)?|s(?:ia)?|[cdfgilmnoqtuwxz])|b(?:iz?|[abdefghjmnorstwyz])
+    |c(?:at?|o(?:m|op)?|[cdfghiklmnruvxyz])|d[ejkmoz]|e(?:[cegrst]|d?u)|f[ijkmor]
+    |g(?:[adefghilmnpqrstuwy]|ov)|h[kmnrtu]|i(?:n(?:fo|t)?|[delmoqrst])|j(?:o(?:bs)?|[emp])
+    |k[eghimnprwyz]|l[abcikrstuvy]|m(?:o(?:bi)?|u(?:seum)?|[acdeghkmnpqrstvwxyz]|i?l)
+    |n(?:a(?:me)?|et?|[cfgilopruz])|o(?:m|rg)|p(?:ro?|[aefghklnstwy])|r[eosuw]
+    |s[abcdeghiklmnrtuvyz]|t(?:r(?:avel)?|[cdfghjkmnoptvwz]|e?l)|u[agksyz]
+    |v[aceginu]|w[fs]|z[amw]|qa|ye
+  )/ix;
+
+# Two-Level TLDs
+#
 # to resort this, pump the whole list through:
 #  perl -e '$/=undef; $_=<>; foreach(split) { ($a,$b) = split(/\./, $_, 2); $t{$b}->{$_}=1; } foreach (sort keys %t) { print "  ",join(" ", sort keys %{$t{$_}}),"\n" }'
 #