You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by qu...@apache.org on 2005/05/04 04:31:07 UTC

svn commit: r168050 - /spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm

Author: quinlan
Date: Tue May  3 19:31:07 2005
New Revision: 168050

URL: http://svn.apache.org/viewcvs?rev=168050&view=rev
Log:
improve URI matching heuristics: hostnames can be at most 255 characters
long; only exclude ones that match email addresses; "." can occur at end;
don't allow "_" since that's not allowed in hostnames.
Also exclude addresses that don't contain a valid TLD from mailto: matching.

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=168050&r1=168049&r2=168050&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Tue May  3 19:31:07 2005
@@ -1777,19 +1777,25 @@
 my $tldsRE = qr/
     (?=[a-wyz])
     (?:a(?:e(?:ro)?|r(?:pa)?|[cdfgilmnoqstuwzx])
-      |b(?:iz?|[abdefghjmnorstvwyz]) |c(?:o(?:m|op)?|[acdfghiklmnrsu])
-      |g(?:[efghilmnpqrstuwy]|ov) |h[kmnrtu] |i(?:n(?:fo|t)?|[delmoqrst])
+      |b(?:iz?|[abdefghjmnorstvwyz])|c(?:o(?:m|op)?|[acdfghiklmnrsu])
+      |g(?:[efghilmnpqrstuwy]|ov)|h[kmnrtu]|i(?:n(?:fo|t)?|[delmoqrst])
       |j[emop]|k[eghimnprwyz]|l[abcikrstuvy]
       |m(?:u(?:seum)?|[acdghkmnopqrstvwxyz]|i?l)|n(?:a(?:me)?|et?|[cfgilopruz])
       |o(?:m|rg)|p(?:ro?|[aefghklmnstwy])|r[eouw]|s[abcdeghijklmnortvyzu]
       |t[cdfghjklmnoprtvwz]|u[agkmsyz]|v[aceginu]|w[fs]|y[etu]|z[amw]|ed?u|qa
     )/ix;
 
-my $schemelessRE = qr/(?<![.=])(?:
-        www\.
-        |ftp\.
-        |(?<!\@)[-_a-z0-9\.]{3,999}\.${tldsRE}(?![-_a-z0-9\.])
-    )/ix;
+# from RFC 1035, but allowing domains starting with numbers:
+#   $label = q/[A-Za-z\d](?:[A-Za-z\d-]{0,61}[A-Za-z\d])?/;
+#   $domain = qq<$label(?:\.$label)*>;
+#   length($host) <= 255 && $host =~ /^($domain)$/
+# massively simplified from grammar, only matches known TLDs, a single
+# dot at end of TLD works, skip ones that will match as email addresses
+my $schemelessRE = qr/(?<!.\@)\b[a-z\d]
+                      [a-z\d.-]{0,251}
+                      \.${tldsRE}\.?\b
+                      (?![a-z\d.-])
+                      /ix;
 
 my $uriRe = qr/\b(?:$schemeRE:[$uricCheat]|$schemelessRE)[$uricSet#]*/o;
 
@@ -1881,7 +1887,7 @@
 }
 
 sub get_parsed_uri_list {
-  my($self) = @_;
+  my ($self) = @_;
 
   # use cached answer if available
   unless (defined $self->{parsed_uri_list}) {
@@ -1904,6 +1910,9 @@
       while (/($uriRe)/igo) {
         my $uri = $1;
 
+        # skip mismatches from URI regular expression
+        next if $uri =~ /^[a-z\d.-]*\.\./i;	# skip ".."
+
         $uri =~ s/^<(.*)>$/$1/;
         $uri =~ s/[\]\)>#]$//;
 
@@ -1923,13 +1932,17 @@
           }
         }
 
-        # warn("uri: got URI: $uri\n");
+        #warn("uri: got URI: $uri\n");
         push @uris, $uri;
       }
       while (/($Addr_spec_re)/go) {
         my $uri = $1;
 
-        $uri = "mailto:$uri";
+        # skip mismatches from email address regular expression
+        next unless $uri =~ /\.${tldsRE}\W*$/;	# skip non-TLDs
+
+        $uri =~ s/\s*\@\s*/@/;	# remove spaces around the '@'
+        $uri = "mailto:$uri";	# prepend mailto:
 
         #warn("uri: got URI: $uri\n");
         push @uris, $uri;