You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by si...@apache.org on 2008/01/28 23:46:04 UTC
svn commit: r616097 - in /spamassassin/trunk:
lib/Mail/SpamAssassin/PerMsgStatus.pm t/uri.t t/uri_text.t
Author: sidney
Date: Mon Jan 28 14:46:03 2008
New Revision: 616097
URL: http://svn.apache.org/viewvc?rev=616097&view=rev
Log:
bug 5780: total rewrite of URI detection in plain text body and corresponding test cases
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
spamassassin/trunk/t/uri.t
spamassassin/trunk/t/uri_text.t
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=616097&r1=616096&r2=616097&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Mon Jan 28 14:46:03 2008
@@ -1580,7 +1580,7 @@
# strip out the (comments)
$result =~ s/\s*\(.*?\)//g;
# strip out the "quoted text"
- $result =~ s/(?<!<)"[^"]*"(?!@)//g;
+ $result =~ s/(?<!<)"[^"]*"(?!@)//g; #" emacs
# Foo Blah <jm...@xxx> or <jm...@xxx>
$result =~ s/^[^<]*?<(.*?)>.*$/$1/;
# multiple addresses on one line? remove all but first
@@ -1625,16 +1625,44 @@
###########################################################################
-# Taken from URI and URI::Find
-my $reserved = q(;/?:@&=+$,[]\#|);
-my $mark = q(-_.!~*'()); #'; emacs
-my $unreserved = "A-Za-z0-9\Q$mark\E\x00-\x08\x0b\x0c\x0e-\x1a\x1c-\x1f";
-my $uricSet = quotemeta($reserved) . $unreserved . "%";
-
-my $schemeRE = qr/(?:https?|ftp|mailto|javascript|file)/i;
+# uri parsing from plain text:
+# The goals are to find URIs in plain text spam that are intended to be clicked on or copy/pasted, but
+# ignore random strings that might look like URIs, for example in uuencoded files, and to ignore
+# URIs that spammers might seed in spam in ways not visible or clickable to add work to spam filters.
+# When we extract a domain and look it up in an RBL, an FP on deciding that the text is a URI is not much
+# of a problem, as the only cost is an extra RBL lookup. The same FP is worse if the URI is used in rule matching
+# because it could lead to a rule FP, as in bug 5780 with WEIRD_PORT matching random uuencoded strings.
+# The principles of the following code are 1) if ThunderBird or Outlook Express would linkify a string,
+# then we should attempt to parse it as a URI; 2) Where TBird and OE parse differently, choose to do what is most
+# likely to find a domain for the RBL tests; 3) If it begins with a scheme or www\d*\. or ftp\. assume that
+# it is a URI; 4) If it does not then require that the start of the string looks like a FQDN with a valid TLD;
+# 5) Reject strings that after parsing, URLDecoding, and redirection processing don't have a valid TLD
+#
+# We get the entire URI that would be linkified before dealing with it, in order to do the right thing
+# with URI-encodings and redirecting URIs.
+#
+# The delimiters for start of a URI in TBird are @(`{|[\"'<>,\s in OE they are ("<\s
+#
+# Tbird allows .,?';-! in a URI but ignores [.,?';-!]* at the end.
+# TBird's end delimiters are )`{}|[]"<>\s but ) is only an end delimiter if there is no ( in the URI
+# OE only uses space as a delimiter, but ignores [~!@#^&*()_+`-={}|[]:";'<>?,.]* at the end.
+#
+# Both TBird and OE decide that a URI is an email address when there is '@' character embedded in it.
+# TBird has some additional restrictions on email URIs: They cannot contain non-ASCII characters and their end
+# delimiters include ( and '
+#
+# bug 4522: ISO2022 format mail, most commonly Japanese SHIFT-JIS, inserts a three character escape sequence ESC ( .
+
+# a hybrid of tbird and oe's version of uri parsing
+my $tbirdstartdelim = '><"\'`,{[(|\s' . "\x1b"; # The \x1b as per bug 4522
+my $iso2022shift = "\x1b" . '\(.'; # bug 4522
+my $tbirdenddelim = '><"`}\]{[|\s' . "\x1b"; # The \x1b as per bug 4522
+my $oeignoreatend = '-~!@#^&*()_+=:;\'?,.';
+my $nonASCII = '\x80-\xff';
+my $tbirdenddelimemail = $tbirdenddelim . '(\'' . $nonASCII; # tbird ignores non-ASCII mail addresses for now, until RFC changes
+my $tbirdenddelimplusat = $tbirdenddelimemail . '@';
-my $uricCheat = $uricSet;
-$uricCheat =~ tr/://d;
+# regexps for finding plain text non-scheme hostnames with valid TLDs.
# the list from %VALID_TLDS in Util/RegistrarBoundaries.pm, as a
# Regexp::Optimize optimized regexp ;) accurate as of 20050318
@@ -1649,57 +1677,15 @@
|t[cdfghjklmnoprtvwz]|u[agkmsyz]|v[aceginu]|w[fs]|xxx|y[etu]|z[amw]|ed?u|qa
)/ix;
-# from RFC 1035, but allowing domains starting with numbers:
-# $label = q/[A-Za-z\d](?:[A-Za-z\d-]{0,61}[A-Za-z\d])?/;
-# $domain = qq<$label(?:\.$label)*>;
-# length($host) <= 255 && $host =~ /^($domain)$/
-# changes:
-# massively simplified from grammar, only matches known TLDs, a single
-# dot at end of TLD works
-# negative look-behinds:
-# (?<![a-z\d][.-]) = don't let there be more hostname behind, but
-# don't miss ".....www.bar.com" or "-----www.foo.com"
-# (?<!.\@) = this will be caught by the email address regular expression
-my $schemelessRE = qr/(?<![a-z\d][._-])(?<!.\@)\b[a-z\d]
- [a-z\d._-]{0,251}
- \.${tldsRE}\.?\b
- (?![a-z\d._-])
- /ix;
-
-my $uriRe = qr/\b(?:$schemeRE:[$uricCheat]|$schemelessRE)[$uricSet#]*/o;
-
-# Taken from Email::Find (thanks Tatso!)
-# This is the BNF from RFC 822
-my $esc = '\\\\';
-my $period = '\.';
-my $space = '\040';
-my $open_br = '\[';
-my $close_br = '\]';
-my $nonASCII = '\x80-\xff';
-my $ctrl = '\000-\037';
-my $cr_list = '\n\015';
-my $qtext = qq/[^$esc$nonASCII$cr_list\"]/; #"
-my $dtext = qq/[^$esc$nonASCII$cr_list$open_br$close_br]/;
-my $quoted_pair = qq<$esc>.qq<[^$nonASCII]>;
-my $atom_char = qq/[^($space)<>\@,;:\".$esc$open_br$close_br$ctrl$nonASCII]/;
-#"
-my $atom = qq{(?>$atom_char+)};
-my $quoted_str = qq<\"$qtext*(?:$quoted_pair$qtext*)*\">; #"
-my $word = qq<(?:$atom|$quoted_str)>;
-my $local_part = qq<$word(?:$period$word)*>;
-
-# This is a combination of the domain name BNF from RFC 1035 plus the
-# domain literal definition from RFC 822, but allowing domains starting
-# with numbers.
-my $label = q/[A-Za-z\d](?:[A-Za-z\d-]*[A-Za-z\d])?/;
-my $domain_ref = qq<$label(?:$period$label)*>;
-my $domain_lit = qq<$open_br(?:$dtext|$quoted_pair)*$close_br>;
-my $domain = qq<(?:$domain_ref|$domain_lit)>;
-
-# Finally, the address-spec regex (more or less)
-my $Addr_spec_re = qr<$local_part\s*\@\s*$domain>o;
-
-# TVD: This really belongs in metadata
+# knownscheme regexp looks for either a https?: or ftp: scheme, or www\d*\. or ftp\. prefix, i.e., likely to start a URL
+# schemeless regexp looks for a valid TLD at the end of what may be a FQDN, followed by optional ., optional :portnum, optional /rest_of_uri
+my $urischemeless = qr/[a-z\d][a-z\d._-]{0,251}\.${tldsRE}\.?(?::\d{1,5})?(?:\/[^$tbirdenddelim]{1,251})?/io;
+my $uriknownscheme = qr/(?:(?:(?:(?:https?)|(?:ftp)):(?:\/\/)?)|(?:(?:www\d{0,2}|ftp)\.))[^$tbirdenddelim]{1,251}/io;
+my $urimailscheme = qr/(?:mailto:)?[^$tbirdenddelimplusat]{1,251}@[^$tbirdenddelimemail]{1,251}/io;
+my $tbirdurire = qr/(?:\b|(?<=$iso2022shift)|(?<=[$tbirdstartdelim]))
+ (?:(?:($uriknownscheme)(?=[$tbirdenddelim])) |
+ (?:($urimailscheme)(?=[$tbirdenddelimemail])) |
+ (?:\b($urischemeless)(?=[$tbirdenddelim])))/xo;
=item $status->get_uri_list ()
@@ -1895,6 +1881,7 @@
# also, if we allow $textary to be passed in, we need to invalidate
# the cache first. fyi.
my $textary = $self->get_decoded_stripped_body_text_array();
+ my $redirector_patterns = $self->{conf}->{redirector_patterns};
my ($rulename, $pat, @uris);
local ($_);
@@ -1903,50 +1890,61 @@
for (@$textary) {
# NOTE: do not modify $_ in this loop
- while (/($uriRe)/igo) {
- my $uri = $1;
-
- # skip mismatches from URI regular expression
- next if $uri =~ /^[a-z\d.-]*\.\./i; # skip ".."
-
- $uri =~ s/^<(.*)>$/$1/;
- $uri =~ s/[\]\)>#]$//;
-
- if ($uri !~ /^${schemeRE}:/io) {
- # If it's a hostname that was just sitting out in the
- # open, without a protocol, and not inside of an HTML tag,
- # the we should add the proper protocol in front, rather
- # than using the base URI.
+ while (/$tbirdurire/igo) {
+ my $rawuri = $1||$2||$3;
+ $rawuri =~ s/(^[^(]*)\).*$/$1/; # as per ThunderBird, ) is an end delimiter if there is no ( preceding it
+ $rawuri =~ s/[$oeignoreatend]*$//; # remove trailing string of punctuations that TBird ignores
+ # skip if there is '..' in the hostname portion of the URI, something we can't catch in the general URI regexp
+ next if $rawuri =~ /^(?:(?:https?|ftp|mailto):(?:\/\/)?)?[a-z\d.-]*\.\./i;
+
+ # If it's a hostname that was just sitting out in the
+ # open, without a protocol, and not inside of an HTML tag,
+ # then we should add the proper protocol in front, rather
+ # than using the base URI.
+ my $uri = $rawuri;
+ my $rblonly;
+ if ($uri !~ /^(?:https?|ftp|mailto|javascript|file):/i) {
if ($uri =~ /^ftp\./i) {
- push (@uris, $uri);
$uri = "ftp://$uri";
}
- if ($uri =~ /\@/) {
- push (@uris, $uri);
+ elsif ($uri =~ /^www\d{0,2}\./i) {
+ $uri = "http://$uri";
+ }
+ elsif ($uri =~ /\@/) {
$uri = "mailto:$uri";
}
- else # if ($uri =~ /^www\d*\./i)
- {
+ else {
# some spammers are using unschemed URIs to escape filters
- push (@uris, $uri);
+ $rblonly = 1; # flag that this is a URI that MUAs don't linkify so only use for RBLs
$uri = "http://$uri";
}
}
- # warn("uri: got URI: $uri\n");
- push @uris, $uri;
- }
- while (/($Addr_spec_re)/igo) {
- my $uri = $1;
-
- # skip mismatches from email address regular expression
- next unless $uri =~ /\.${tldsRE}\W*$/io; # skip non-TLDs
+ if ($uri =~ /^mailto:/) {
+ # skip a mail link that does not have a valid TLD or other than one @ after decoding any URLEncoded characters
+ $uri = Mail::SpamAssassin::Util::url_encode($uri) if ($uri =~ /\%(?:2[1-9a-fA-F]|[3-6][0-9a-fA-f]|7[0-9a-eA-E])/);
+ next if ($uri !~ /^[^@]+@[^@]+$/);
+ my $domuri = Mail::SpamAssassin::Util::uri_to_domain($uri);
+ next unless $domuri;
+ push (@uris, $rawuri);
+ push (@uris, $uri) unless ($rawuri eq $uri);
+ }
- $uri =~ s/\s*\@\s*/@/; # remove spaces around the '@'
- $uri = "mailto:$uri"; # prepend mailto:
+ next unless ($uri =~/^(?:https?|ftp):/); # at this point only valid if one or the other of these
- #warn("uri: got URI: $uri\n");
- push @uris, $uri;
+ my @tmp = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, $uri);
+ my $goodurifound = 0;
+ foreach my $cleanuri (@tmp) {
+ my $domain = Mail::SpamAssassin::Util::uri_to_domain($cleanuri);
+ if ($domain) {
+ # bug 5780: Stop after domain to avoid FP, but do that after all deobfuscation of urlencoding and redirection
+ $cleanuri =~ s/^(https?:\/\/[^:\/]+).*$/$1/ if $rblonly;
+ push (@uris, $cleanuri);
+ $goodurifound = 1;
+ }
+ }
+ next unless $goodurifound;
+ push @uris, $rawuri unless $rblonly;
}
}
Modified: spamassassin/trunk/t/uri.t
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/uri.t?rev=616097&r1=616096&r2=616097&view=diff
==============================================================================
--- spamassassin/trunk/t/uri.t (original)
+++ spamassassin/trunk/t/uri.t Mon Jan 28 14:46:03 2008
@@ -46,7 +46,7 @@
ok ($urimap{'http://66.92.69.222/'});
ok ($urimap{'http://66.92.69.223/'});
ok ($urimap{'http://66.92.69.224/'});
-ok ($urimap{'spamassassin.org'});
+ok ($urimap{'http://spamassassin.org'});
ok (!$urimap{'CUMSLUTS.'});
ok (!$urimap{'CUMSLUTS..VIRGIN'});
@@ -58,9 +58,11 @@
# undef is valid in some situations, so deal with it...
if (!defined $expect) {
+ warn("try_domains: failed! expect: undefined got: '$result'\n") if (defined $result);
return !defined $result;
}
elsif (!defined $result) {
+ warn "try_domains: failed! expect: '$expect' got: undefined\n";
return 0;
}
Modified: spamassassin/trunk/t/uri_text.t
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/uri_text.t?rev=616097&r1=616096&r2=616097&view=diff
==============================================================================
--- spamassassin/trunk/t/uri_text.t (original)
+++ spamassassin/trunk/t/uri_text.t Mon Jan 28 14:46:03 2008
@@ -24,7 +24,7 @@
use vars qw(%patterns %anti_patterns);
# settings
-plan tests => 104;
+plan tests => 134;
# initialize SpamAssassin
my $sa = create_saobj({'dont_copy_prefs' => 1});
@@ -133,11 +133,15 @@
.www.kuiH5sai.com www.kuiH5sai.com
a=www.zaiNgoo7.com www.zaiNgoo7.com
-b@www.vohWais0.com mailto:b@www.vohWais0.com !http://www.vohWais0.com
+b@www.vohWais0.com mailto:b@www.vohWais0.com !http://www.vohWais0.com
c.www.moSaoga8.com www.moSaoga8.com
-foo @ cae8kaip.com mailto:foo@cae8kaip.com
xyz..geifoza0.com !geifoza0
+xyz.geifoza1.com/..xyz xyz.geifoza1.com !xyz.geifoza1.com/..xyz
+xyz.geifoza2.CoM xyz.geifoza2.CoM
+http://xyz..geifoza3.com !geifoza3
+http://xyz.geifoza4.com/..xyz xyz.geifoza4.com/..xyz
+http://xyz.geifoza5.CoM xyz.geifoza5.CoM
joe@koja3fui.koja3fui !koja3fui
@@ -219,44 +223,33 @@
http://www.luzoop5k.com http://www.luzoop5k.com
https://www.luzoop5k.com https://www.luzoop5k.com
ftp://www.luzoop5k.com ftp://www.luzoop5k.com
-mailto:www.luzoop5k.com mailto:www.luzoop5k.com
-file://www.luzoop5k.com file://www.luzoop5k.com
+mailto:www.luzoop5k.com !mailto:www.luzoop5k.com
+# no longer accept file: scheme
+file://www.luzoop5k.com !file://www.luzoop5k.com
# //<user>:<password>@<host>:<port>/<url-path>
http://user:pass@jiefeet4.com:80/x/y http://user:pass@jiefeet4.com:80/x/y
-liy8quei:80 !liy8quei
-veibi6cu:443 !veibi6cu
-puahi8si.com:80 puahi8si.com:80
-chop8tan.com:443 chop8tan.com:443
+www.liy8quei:80 !liy8quei
+www.veibi6cu:443 !veibi6cu
+puahi8si.com:80 !puahi8si.com:80
+chop8tan.com:443 !chop8tan.com:443
+www.puahi9si.com:80 puahi9si.com:80
+www.chop9tan.com:443 chop9tan.com:443
ftp://name@su5queib.ca//etc/motd ftp://name@su5queib.ca//etc/motd
-ftp://name@faikaj4t.dom/%2Fetc/motd ftp://name@faikaj4t.dom//etc/motd
+ftp://name@faikaj4t.dom/%2Fetc/motd !ftp://name@faikaj4t.dom//etc/motd
+ftp://name@faikaj4t.com/%2Fetc/motd ftp://name@faikaj4t.com//etc/motd
keyword:sportscar !sportscar
# questionable tests
-
mailto://cah3neun@thaihe4d.com mailto://cah3neun@thaihe4d.com
-mailto://jicu8vah@another@jicu8vah jicu8vah@another@jicu8vah
-baeb1fai@@example.com !baeb1fai@@example.com
-#mailto://yie6xuna !yie6xuna
-
-#http://425EE622 http://66.94.230.34
-#gopher://www.luzoop5k.com gopher://www.luzoop5k.com
-#nntp://www.luzoop5k.com nntp://www.luzoop5k.com
-#telnet://www.luzoop5k.com telnet://www.luzoop5k.com
-#wais://www.luzoop5k.com wais://www.luzoop5k.com
-#prospero://www.luzoop5k.com prospero://www.luzoop5k.com
-#nfs://www.luzoop5k.com nfs://www.luzoop5k.com
-#pop://www.luzoop5k.com pop://www.luzoop5k.com
-#tel://www.luzoop5k.com tel://www.luzoop5k.com
-#fax://www.luzoop5k.com fax://www.luzoop5k.com
-#modem://www.luzoop5k.com modem://www.luzoop5k.com
-#ldap://www.luzoop5k.com ldap://www.luzoop5k.com
-#im://www.luzoop5k.com im://www.luzoop5k.com
-#snmp://www.luzoop5k.com snmp://www.luzoop5k.com
+mailto://jicu8vah@another@jicu8vah !jicu8vah@another@jicu8vah
+baeb1fai@@example.com !baeb1fai@@example.com
+mailto://yie6xuna !yie6xuna
+mailto://yie6xuna@nottld !yie6xuna@nottld
<se...@verper.com> !^http://.*addr.com@verper.com
<se...@verper.com> mailto:sentto-4934-foo=addr.com@verper.com
@@ -265,3 +258,23 @@
{ESC}(Bhttp://foo23499.com/ ^http://foo23499.com/$
http://foo23500.com{ESC}(B/ ^http://foo23500.com(?:/?)$
+M0"-AE/9Y.KN:_0D2F:95^H*:I,8 !9Y\.KN
+>delimtest1.com http://delimtest1.com
+<delimtest2.com http://delimtest2.com
+"delimtest3.com http://delimtest3.com
+\delimtest4.com http://delimtest4.com
+'delimtest5.com http://delimtest5.com
+`delimtest6.com http://delimtest6.com
+,delimtest7.com http://delimtest7.com
+{delimtest8.com http://delimtest8.com
+[delimtest9.com http://delimtest9.com
+(delimtest10.com http://delimtest10.com
+|delimtest11.com http://delimtest11.com
+ delimtest12.com http://delimtest12.com
+ignorethishttp://delimtest13.org http://delimtest13.org
+donotignorethiswww.delimtest14.com donotignorethiswww.delimtest14.com
+<www.delimtest15.com/foo-~!@#^&*()_+=:;'?,.xyz-~!@#^&*()_+=:;'?,.> ^http://www.delimtest15.com/foo-~!@#\^&\*\(\)_\+=:;'\?,\.xyz$
+.....www.delimtest16.com.......... ^http://www.delimtest16.com$
+-----www.delimtest17.com---------- ^http://www.delimtest17.com$
+.....http://www.delimtest18.com.......... ^http://www.delimtest18.com$
+-----http://www.delimtest19.com---------- ^http://www.delimtest19.com$