You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2005/03/22 07:03:40 UTC
svn commit: r158542 - spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
Author: jm
Date: Mon Mar 21 22:03:39 2005
New Revision: 158542
URL: http://svn.apache.org/viewcvs?view=rev&rev=158542
Log:
bug 4208: URI parser should catch 'cut and paste' raw URLs that appear without http:// prefix
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?view=diff&r1=158541&r2=158542
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Mon Mar 21 22:03:39 2005
@@ -1772,7 +1772,25 @@
my $uricCheat = $uricSet;
$uricCheat =~ tr/://d;
-my $schemelessRE = qr/(?<![.=])(?:www\.|ftp\.)/;
+# TLD alternation built from %VALID_TLDS in Util/RegistrarBoundaries.pm (as of 2005-03-18),
+# compacted by Regexp::Optimize. NOTE(review): no d* or f* branch below (.de, .fr) -- verify.
+my $tldsRE = qr/
+ (?=[a-wyz])
+ (?:a(?:e(?:ro)?|r(?:pa)?|[cdfgilmnoqstuwzx])
+ |b(?:iz?|[abdefghjmnorstvwyz]) |c(?:o(?:m|op)?|[acdfghiklmnrsu])
+ |g(?:[efghilmnpqrstuwy]|ov) |h[kmnrtu] |i(?:n(?:fo|t)?|[delmoqrst])
+ |j[emop]|k[eghimnprwyz]|l[abcikrstuvy]
+ |m(?:u(?:seum)?|[acdghkmnopqrstvwxyz]|i?l)|n(?:a(?:me)?|et?|[cfgilopruz])
+ |o(?:m|rg)|p(?:ro?|[aefghklmnstwy])|r[eouw]|s[abcdeghijklmnortvyzu]
+ |t[cdfghjklmnoprtvwz]|u[agkmsyz]|v[aceginu]|w[fs]|y[etu]|z[amw]|ed?u|qa
+ )/ix;
+
+my $schemelessRE = qr/(?<![.=])(?:
+ www\.
+ |ftp\.
+ |(?<!\@)[-_a-z0-9\.]{3,999}\.${tldsRE}(?![-_a-z0-9\.])
+ )/ix;
+
my $uriRe = qr/\b(?:$schemeRE:[$uricCheat]|$schemelessRE)[$uricSet#]*/o;
# Taken from Email::Find (thanks Tatso!)