You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2005/03/22 07:03:40 UTC

svn commit: r158542 - spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm

Author: jm
Date: Mon Mar 21 22:03:39 2005
New Revision: 158542

URL: http://svn.apache.org/viewcvs?view=rev&rev=158542
Log:
bug 4208: URI parser should catch 'cut and paste' raw URLs that appear without http:// prefix

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?view=diff&r1=158541&r2=158542
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Mon Mar 21 22:03:39 2005
@@ -1772,7 +1772,25 @@
 my $uricCheat = $uricSet;
 $uricCheat =~ tr/://d;
 
-my $schemelessRE = qr/(?<![.=])(?:www\.|ftp\.)/;
+# Matches any TLD from %VALID_TLDS in Util/RegistrarBoundaries.pm; written as a
+# Regexp::Optimize optimized regexp ;)  accurate as of 20050318
+my $tldsRE = qr/
+    (?=[a-wyz])    # fast reject: no alternative below starts with x
+    (?:a(?:e(?:ro)?|r(?:pa)?|[cdfgilmnoqstuwzx])
+      |b(?:iz?|[abdefghjmnorstvwyz]) |c(?:o(?:m|op)?|[acdfghiklmnrsuvxyz])
+      |d[ejkmoz] |e[cegrst] |f[ijkmor] |g(?:[abdefghilmnpqrstuwy]|ov) |h[kmnrtu]
+      |i(?:n(?:fo|t)?|[delmoqrst])|j[emop]|k[eghimnprwyz]|l[abcikrstuvy]
+      |m(?:u(?:seum)?|[acdghkmnopqrstvwxyz]|i?l)|n(?:a(?:me)?|et?|[cfgilopruz])
+      |o(?:m|rg)|p(?:ro?|[aefghklmnstwy])|r[eouw]|s[abcdeghijklmnortvyzu]
+      |t[cdfghjklmnoprtvwz]|u[agkmsyz]|v[aceginu]|w[fs]|y[etu]|z[amw]|ed?u|qa
+    )/ix;
+# NOTE(review): d, e, f branches use delegated ccTLDs; verify against %VALID_TLDS
+my $schemelessRE = qr/(?<![.=])(?:    # never start right after "." or "="
+        www\.                         # bare "www." host prefix
+        |ftp\.                        # bare "ftp." host prefix
+        |(?<!\@)[-_a-z0-9\.]{3,999}\.${tldsRE}(?![-_a-z0-9\.])  # host.tld, not a mail domain
+    )/ix;
+
 my $uriRe = qr/\b(?:$schemeRE:[$uricCheat]|$schemelessRE)[$uricSet#]*/o;
 
 # Taken from Email::Find (thanks Tatso!)