You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by qu...@apache.org on 2005/05/04 04:31:07 UTC
svn commit: r168050 - /spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
Author: quinlan
Date: Tue May 3 19:31:07 2005
New Revision: 168050
URL: http://svn.apache.org/viewcvs?rev=168050&view=rev
Log:
Improve URI matching heuristics: hostnames can be at most 255 characters
long; only exclude candidates that would match as email addresses; a single
trailing "." is allowed; don't allow "_", since it is not permitted in hostnames.
Also exclude addresses that don't contain a valid TLD from mailto: matching.
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=168050&r1=168049&r2=168050&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Tue May 3 19:31:07 2005
@@ -1777,19 +1777,25 @@
my $tldsRE = qr/
(?=[a-wyz])
(?:a(?:e(?:ro)?|r(?:pa)?|[cdfgilmnoqstuwzx])
- |b(?:iz?|[abdefghjmnorstvwyz]) |c(?:o(?:m|op)?|[acdfghiklmnrsu])
- |g(?:[efghilmnpqrstuwy]|ov) |h[kmnrtu] |i(?:n(?:fo|t)?|[delmoqrst])
+ |b(?:iz?|[abdefghjmnorstvwyz])|c(?:o(?:m|op)?|[acdfghiklmnrsu])
+ |g(?:[efghilmnpqrstuwy]|ov)|h[kmnrtu]|i(?:n(?:fo|t)?|[delmoqrst])
|j[emop]|k[eghimnprwyz]|l[abcikrstuvy]
|m(?:u(?:seum)?|[acdghkmnopqrstvwxyz]|i?l)|n(?:a(?:me)?|et?|[cfgilopruz])
|o(?:m|rg)|p(?:ro?|[aefghklmnstwy])|r[eouw]|s[abcdeghijklmnortvyzu]
|t[cdfghjklmnoprtvwz]|u[agkmsyz]|v[aceginu]|w[fs]|y[etu]|z[amw]|ed?u|qa
)/ix;
-my $schemelessRE = qr/(?<![.=])(?:
- www\.
- |ftp\.
- |(?<!\@)[-_a-z0-9\.]{3,999}\.${tldsRE}(?![-_a-z0-9\.])
- )/ix;
+# from RFC 1035, but allowing domains starting with numbers:
+# $label = q/[A-Za-z\d](?:[A-Za-z\d-]{0,61}[A-Za-z\d])?/;
+# $domain = qq<$label(?:\.$label)*>;
+# length($host) <= 255 && $host =~ /^($domain)$/
+# massively simplified from grammar, only matches known TLDs, a single
+# dot at end of TLD works, skip ones that will match as email addresses
+my $schemelessRE = qr/(?<!.\@)\b[a-z\d]
+ [a-z\d.-]{0,251}
+ \.${tldsRE}\.?\b
+ (?![a-z\d.-])
+ /ix;
my $uriRe = qr/\b(?:$schemeRE:[$uricCheat]|$schemelessRE)[$uricSet#]*/o;
@@ -1881,7 +1887,7 @@
}
sub get_parsed_uri_list {
- my($self) = @_;
+ my ($self) = @_;
# use cached answer if available
unless (defined $self->{parsed_uri_list}) {
@@ -1904,6 +1910,9 @@
while (/($uriRe)/igo) {
my $uri = $1;
+ # skip mismatches from URI regular expression
+ next if $uri =~ /^[a-z\d.-]*\.\./i; # skip ".."
+
$uri =~ s/^<(.*)>$/$1/;
$uri =~ s/[\]\)>#]$//;
@@ -1923,13 +1932,17 @@
}
}
- # warn("uri: got URI: $uri\n");
+ #warn("uri: got URI: $uri\n");
push @uris, $uri;
}
while (/($Addr_spec_re)/go) {
my $uri = $1;
- $uri = "mailto:$uri";
+ # skip mismatches from email address regular expression
+ next unless $uri =~ /\.${tldsRE}\W*$/; # skip non-TLDs
+
+ $uri =~ s/\s*\@\s*/@/; # remove spaces around the '@'
+ $uri = "mailto:$uri"; # prepend mailto:
#warn("uri: got URI: $uri\n");
push @uris, $uri;