You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2011/05/01 19:43:46 UTC
svn commit: r1098376 - in /spamassassin/trunk/lib/Mail/SpamAssassin:
PerMsgStatus.pm Plugin/FreeMail.pm Util/RegistrarBoundaries.pm
Author: hege
Date: Sun May 1 17:43:45 2011
New Revision: 1098376
URL: http://svn.apache.org/viewvc?rev=1098376&view=rev
Log:
Bug #6578: Move TLD regexp to RegistrarBoundaries and make FreeMail use it
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/FreeMail.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=1098376&r1=1098375&r2=1098376&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Sun May 1 17:43:45 2011
@@ -1833,20 +1833,8 @@ my $nonASCII = '\x80-\xff';
my $tbirdenddelimemail = $tbirdenddelim . '(\'' . $nonASCII; # tbird ignores non-ASCII mail addresses for now, until RFC changes
my $tbirdenddelimplusat = $tbirdenddelimemail . '@';
-# regexps for finding plain text non-scheme hostnames with valid TLDs.
-
-# the list from %VALID_TLDS in Util/RegistrarBoundaries.pm, as a
-# Regexp::List optimized regexp ;) accurate as of 2010-04-15
-my $tldsRE = qr/
- (?=[a-wyz])
- (?:a(?:e(?:ro)?|r(?:pa)?|s(?:ia)?|[cdfgilmnoqtuwxz])|b(?:iz?|[abdefghjmnorstwyz])
- |c(?:at?|o(?:m|op)?|[cdfghiklmnruvxyz])|d[ejkmoz]|e(?:[cegrst]|d?u)|f[ijkmor]
- |g(?:[adefghilmnpqrstuwy]|ov)|h[kmnrtu]|i(?:n(?:fo|t)?|[delmoqrst])
- |j(?:o(?:bs)?|[emp])|k[eghimnprwyz]|l[abcikrstuvy]
- |m(?:o(?:bi)?|u(?:seum)?|[acdeghkmnpqrstvwxyz]|i?l)|n(?:a(?:me)?|et?|[cfgilopruz])
- |o(?:m|rg)|p(?:ro?|[aefghklnstwy])|r[eosuw]|s[abcdeghiklmnrtuvyz]
- |t(?:r(?:avel)?|[cdfghjkmnoptvwz]|e?l)|u[agksyz]|v[aceginu]|w[fs]|ye|z[amw]|qa
- )/ix;
+# valid TLDs
+my $tldsRE = $Mail::SpamAssassin::Util::RegistrarBoundaries::VALID_TLDS_RE;
# knownscheme regexp looks for either a https?: or ftp: scheme, or www\d*\. or ftp\. prefix, i.e., likely to start a URL
# schemeless regexp looks for a valid TLD at the end of what may be a FQDN, followed by optional ., optional :portnum, optional /rest_of_uri
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/FreeMail.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/FreeMail.pm?rev=1098376&r1=1098375&r2=1098376&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/FreeMail.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/FreeMail.pm Sun May 1 17:43:45 2011
@@ -107,12 +107,13 @@ my $VERSION = 2.001;
use Mail::SpamAssassin::Plugin;
use Mail::SpamAssassin::PerMsgStatus;
+use Mail::SpamAssassin::Util::RegistrarBoundaries;
use strict;
use vars qw(@ISA);
@ISA = qw(Mail::SpamAssassin::Plugin);
-# TLDs generated from RegistrarBoundaries.pm VALID_TLDS 2008020601
-my $tlds = '(?:m(?:[acdeghkmnpqrstvwxyz]|u(?:seum)?|o(?:bi)?|i?l)|a(?:[cdfgilmnoqtuwxz]|e(?:ro)?|r(?:pa)?|s(?:ia)?)|c(?:[cdfghiklmnruvxyz]|o(?:op|m)?|at?)|t(?:[cdfghjkmnoptvwz]|r(?:avel)?|e?l)|n(?:[cfgilopruz]|a(?:me)?|et?)|b(?:[abdefghjmnorstwyz]|iz?)|g(?:[adefghilmnpqrstuwy]|ov)|i(?:[delmoqrst]|n(?:fo|t)?)|p(?:[aefghklnstwy]|ro?)|s[abcdeghiklmnrtuvyz]|j(?:[emp]|o(?:bs)?)|e(?:[cegrst]|d?u)|k[eghimnprwyz]|l[abcikrstuvy]|v[aceginu]|d[ejkmoz]|f[ijkmor]|h[kmnrtu]|o(?:rg|m)|u[agksyz]|r[eosuw]|z[amw]|w[fs]|y[eu]|qa)';
+# List of TLDs from RegistrarBoundaries.pm
+my $tlds = $Mail::SpamAssassin::Util::RegistrarBoundaries::VALID_TLDS_RE;
### Some regexp tips courtesy of http://www.regular-expressions.info/email.html
### v 0.02
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm?rev=1098376&r1=1098375&r2=1098376&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm Sun May 1 17:43:45 2011
@@ -26,7 +26,7 @@ use bytes;
use re 'taint';
use vars qw (
- @ISA %TWO_LEVEL_DOMAINS %THREE_LEVEL_DOMAINS %US_STATES %VALID_TLDS
+ @ISA %TWO_LEVEL_DOMAINS %THREE_LEVEL_DOMAINS %US_STATES %VALID_TLDS $VALID_TLDS_RE
);
# The list of currently-valid TLDs for the DNS system.
@@ -37,6 +37,8 @@ use vars qw (
# inactive, as can be seen in the Wikipedia articles about them
# as of 2008-02-08, e.g. http://en.wikipedia.org/wiki/.so_%28domain_name%29
# bv gb pm sj so um yt
+#
+# Remember to also change regexp below when updating!
foreach (qw/
ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
@@ -56,6 +58,23 @@ foreach (qw/
$VALID_TLDS{$_} = 1;
}
+# %VALID_TLDS as Regexp::List optimized regexp, for use in Plugins etc
+# Paste above list to:
+# perl -MRegexp::List -e '$/=undef; $_=<>; $r = Regexp::List->new; push @l, $_ for (split); print $r->list2re(@l)'
+# Verified up to date 20110501
+$VALID_TLDS_RE = qr/
+ (?=[abcdefghijklmnopqrstuvwyz])
+ (?:a(?:e(?:ro)?|r(?:pa)?|s(?:ia)?|[cdfgilmnoqtuwxz])|b(?:iz?|[abdefghjmnorstwyz])
+ |c(?:at?|o(?:m|op)?|[cdfghiklmnruvxyz])|d[ejkmoz]|e(?:[cegrst]|d?u)|f[ijkmor]
+ |g(?:[adefghilmnpqrstuwy]|ov)|h[kmnrtu]|i(?:n(?:fo|t)?|[delmoqrst])|j(?:o(?:bs)?|[emp])
+ |k[eghimnprwyz]|l[abcikrstuvy]|m(?:o(?:bi)?|u(?:seum)?|[acdeghkmnpqrstvwxyz]|i?l)
+ |n(?:a(?:me)?|et?|[cfgilopruz])|o(?:m|rg)|p(?:ro?|[aefghklnstwy])|r[eosuw]
+ |s[abcdeghiklmnrtuvyz]|t(?:r(?:avel)?|[cdfghjkmnoptvwz]|e?l)|u[agksyz]
+ |v[aceginu]|w[fs]|z[amw]|qa|ye
+ )/ix;
+
+# Two-Level TLDs
+#
# to resort this, pump the whole list through:
# perl -e '$/=undef; $_=<>; foreach(split) { ($a,$b) = split(/\./, $_, 2); $t{$b}->{$_}=1; } foreach (sort keys %t) { print " ",join(" ", sort keys %{$t{$_}}),"\n" }'
#