You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2019/08/14 08:34:58 UTC
svn commit: r1865095 - in /spamassassin:
branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm
branches/3.4/lib/Mail/SpamAssassin/Util.pm
trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
trunk/lib/Mail/SpamAssassin/Util.pm
Author: hege
Date: Wed Aug 14 08:34:58 2019
New Revision: 1865095
URL: http://svn.apache.org/viewvc?rev=1865095&view=rev
Log:
More email uri parser tweaks
Modified:
spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm
spamassassin/branches/3.4/lib/Mail/SpamAssassin/Util.pm
spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
Modified: spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=1865095&r1=1865094&r2=1865095&view=diff
==============================================================================
--- spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm Wed Aug 14 08:34:58 2019
@@ -2152,7 +2152,7 @@ sub _tbirdurire {
# schemeless regexp looks for a valid TLD at the end of what may be a FQDN, followed by optional ., optional :portnum, optional /rest_of_uri
my $urischemeless = qr/([a-z\d][a-z\d._-]{0,251}\.${tldsRE})\.?(?::\d{1,5})?(?:\/[^$tbirdenddelim]{1,251})?/i;
my $uriknownscheme = qr/(?:(?:https?|ftp):\/\/|(?:www\d{0,2}|ftp)\.)[^$tbirdenddelim]{1,251}/i;
- my $urimailscheme = qr/(?:mailto:)?[^$tbirdenddelimplusat]{1,251}\@[^$tbirdenddelimemail]{1,251}/i;
+ my $urimailscheme = qr/(?:mailto:[^$tbirdenddelimemail]{1,2048}|[^$tbirdenddelimplusat]{1,251}\@[^$tbirdenddelimemail]{1,251})/i;
$self->{tbirdurire} = qr/(?:\b|(?<=$iso2022shift)|(?<=[$tbirdstartdelim]))
(?:(?:($uriknownscheme)(?=(?:[$tbirdenddelim]|\z))) |
@@ -2386,8 +2386,10 @@ sub _get_parsed_uri_list {
my ($rulename, $pat, @uris);
my $text;
my $tbirdurire = $self->_tbirdurire;
+ my %seen;
+ my $would_log_uri_all = would_log('dbg', 'uri-all') == 2; # cache
- for my $entry (@$textary) {
+ foreach my $entry (@$textary) {
# a workaround for [perl #69973] bug:
# Invalid and tainted utf-8 char crashes perl 5.10.1 in regexp evaluation
@@ -2405,6 +2407,11 @@ sub _get_parsed_uri_list {
$rawuri =~ s/(^[^(]*)\).*$/$1/; # as per ThunderBird, ) is an end delimiter if there is no ( preceeding it
$rawuri =~ s/[-~!@#^&*()_+=:;\'?,.]*$//; # remove trailing string of punctuations that TBird ignores
+ next if exists $seen{$rawuri};
+ $seen{$rawuri} = 1;
+
+ dbg("uri: found rawuri from text ($rawtype): $rawuri") if $would_log_uri_all;
+
# Quick ignore if schemeless host not valid
next if defined $schost && !is_fqdn_valid($schost);
@@ -2419,8 +2426,6 @@ sub _get_parsed_uri_list {
# skip if there is '..' in the hostname portion of the URI, something we can't catch in the general URI regexp
next if $rawuri =~ m{^(?:(?:https?|ftp|mailto):(?://)?)?(?:[^\@/?#]*\@)?[^/?#:]*\.\.}i;
- dbg("uri: found rawuri ($rawtype): $rawuri");
-
# If it's a hostname that was just sitting out in the
# open, without a protocol, and not inside of an HTML tag,
# the we should add the proper protocol in front, rather
@@ -2442,10 +2447,11 @@ sub _get_parsed_uri_list {
# And this is linkified: foo@bar%2Ecom?foo.com&bar (woot??)
# And this is linkified with Outlook: foo@bar%2Ecom&foo (woot??)
# Don't test when ? or & exists, canonicalizing will handle later.
- $uri =~ s/^(?:skype|e?-?mail)?:+//i; # strip common misparses
if ($uri !~ tr/?&// && $uri =~ /\@(.*)/) {
next unless $self->{main}->{registryboundaries}->is_domain_valid($1);
}
+ next if index($uri, ' ') != -1; # ignore garbled
+ $uri =~ s/^(?:skype|e?-?mail)?:+//i; # strip common misparses
$uri = "mailto:$uri";
}
else {
@@ -2455,10 +2461,13 @@ sub _get_parsed_uri_list {
}
}
- if ($uri =~ /^mailto:/i) {
- # skip a mail link that does not have a valid TLD or @ after decoding any URLEncoded characters
- $uri = Mail::SpamAssassin::Util::url_encode($uri) if ($uri =~ /\%(?:2[1-9a-fA-F]|[3-6][0-9a-fA-F]|7[0-9a-eA-E])/);
- next unless $uri =~ /\@/;
+ if ($uri =~ /^mailto:/i) { # Schemed mailto: handled different from schemeless
+ # MUAs linkify and urldecode mailto:foo%40bar%2Ecom
+ $uri = Mail::SpamAssassin::Util::url_encode($uri) if $uri =~ /\%[0-9a-f]{2}/i;
+ # Skip unless @ found after decoding, then check tld is valid
+ next unless $uri =~ /\@([^?&>]*)/;
+ next unless $self->{main}->{registryboundaries}->is_domain_valid($1);
+ # SA 3.4 legacy code continues
my $domuri = $self->{main}->{registryboundaries}->uri_to_domain($uri);
next unless $domuri;
push (@uris, $rawuri);
Modified: spamassassin/branches/3.4/lib/Mail/SpamAssassin/Util.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.4/lib/Mail/SpamAssassin/Util.pm?rev=1865095&r1=1865094&r2=1865095&view=diff
==============================================================================
--- spamassassin/branches/3.4/lib/Mail/SpamAssassin/Util.pm (original)
+++ spamassassin/branches/3.4/lib/Mail/SpamAssassin/Util.pm Wed Aug 14 08:34:58 2019
@@ -1334,12 +1334,16 @@ sub uri_list_canonicalize {
if ($uri =~ /^mailto:/i || $uri =~ /^[^:]*\@/) {
# Strip ?subject= parameters and obfuscations
# Outlook linkifies foo@bar%2Ecom&x.com to foo@bar.com !!
- if ($nuri =~ /^(.*?)\?/) {
+ if ($nuri =~ /^([^\@]+\@[^?]+)\?/) {
push @nuris, $1;
}
- if ($nuri =~ /^(.*?)\&/) {
+ if ($nuri =~ /^([^\@]+\@[^&]+)\&/) {
push @nuris, $1
}
+ # mailto:"Foo%20Bar"%20<fo...@example.com>
+ if ($nuri =~ /^[^?&]*<([^\@>]+\@[^>]+)>/) {
+ push @nuris, "mailto:$1";
+ }
# End email processing
next;
}
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=1865095&r1=1865094&r2=1865095&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Wed Aug 14 08:34:58 2019
@@ -2284,7 +2284,7 @@ sub _tbirdurire {
# schemeless regexp looks for a valid TLD at the end of what may be a FQDN, followed by optional ., optional :portnum, optional /rest_of_uri
my $urischemeless = qr/([a-z\d][a-z\d._-]{0,251}\.${tldsRE})\.?(?::\d{1,5})?(?:\/[^$tbirdenddelim]{1,251})?/i;
my $uriknownscheme = qr/(?:(?:https?|ftp):\/\/|(?:www\d{0,2}|ftp)\.)[^$tbirdenddelim]{1,251}/i;
- my $urimailscheme = qr/(?:mailto:)?[^$tbirdenddelimplusat]{1,251}\@[^$tbirdenddelimemail]{1,251}/i;
+ my $urimailscheme = qr/(?:mailto:[^$tbirdenddelimemail]{1,2048}|[^$tbirdenddelimplusat]{1,251}\@[^$tbirdenddelimemail]{1,251})/i;
$self->{tbirdurire} = qr/(?:\b|(?<=$iso2022shift)|(?<=[$tbirdstartdelim]))
(?:(?:($uriknownscheme)(?=(?:[$tbirdenddelim]|\z))) |
@@ -2413,6 +2413,7 @@ sub _process_text_uri_list {
my $textary = $self->get_decoded_stripped_body_text_array();
my $tbirdurire = $self->_tbirdurire;
my %seen;
+ my $would_log_uri_all = would_log('dbg', 'uri-all') == 2; # cache
foreach my $text (@$textary) {
# a workaround for [perl #69973] bug:
@@ -2434,6 +2435,8 @@ sub _process_text_uri_list {
next if exists $seen{$rawuri};
$seen{$rawuri} = 1;
+ dbg("uri: found rawuri from text ($rawtype): $rawuri") if $would_log_uri_all;
+
# Quick ignore if schemeless host not valid
next if defined $schost && !is_fqdn_valid($schost, 1);
@@ -2445,7 +2448,6 @@ sub _process_text_uri_list {
# Ignore empty uris
next if $rawuri =~ /^\w+:\/{0,2}$/i;
- dbg("uri: found rawuri from text ($rawtype): $rawuri");
my $types = {parsed => 1};
# If it's a hostname that was just sitting out in the
@@ -2468,10 +2470,13 @@ sub _process_text_uri_list {
# And this is linkified: foo@bar%2Ecom?foo.com&bar (woot??)
# And this is linkified with Outlook: foo@bar%2Ecom&foo (woot??)
# Don't test when ? or & exists, canonicalizing will handle later.
- $uri =~ s/^(?:skype|e?-?mail)?:+//i; # strip common misparses
if ($uri !~ tr/?&// && $uri =~ /\@(.*)/) {
next unless $self->{main}->{registryboundaries}->is_domain_valid($1);
}
+ next if index($uri, ' ') != -1; # ignore garbled
+ $uri =~ s/^(?:skype|e?-?mail)?:+//i; # strip common misparses
+ # Urldecode now
+ $uri = Mail::SpamAssassin::Util::url_encode($uri) if $uri =~ /\%[0-9a-f]{2}/i;
$uri = "mailto:$uri";
}
else {
@@ -2484,12 +2489,16 @@ sub _process_text_uri_list {
# Mark any of those schemeless
$types->{schemeless} = 1;
}
-
- if ($uri =~ /^mailto:/i) {
- # skip a mail link that does not have a valid TLD or @ after decoding any URLEncoded characters
- $uri = Mail::SpamAssassin::Util::url_encode($uri) if ($uri =~ /\%(?:2[1-9a-fA-F]|[3-6][0-9a-fA-F]|7[0-9a-eA-E])/);
+ elsif ($uri =~ /^mailto:/i) { # Schemed mailto: handled different from schemeless
+ # MUAs linkify and urldecode mailto:foo%40bar%2Ecom
+ $uri = Mail::SpamAssassin::Util::url_encode($uri) if $uri =~ /\%[0-9a-f]{2}/i;
+ # Skip unless @ found after decoding, then check tld is valid
+ next unless $uri =~ /\@([^?&>]*)/;
+ next unless $self->{main}->{registryboundaries}->is_domain_valid($1);
}
+ dbg("uri: parsed uri from text ($rawtype): $uri") if $would_log_uri_all;
+
$self->add_uri_detail_list($uri, $types, 'parsed', 1);
}
}
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm?rev=1865095&r1=1865094&r2=1865095&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm Wed Aug 14 08:34:58 2019
@@ -1496,12 +1496,16 @@ sub uri_list_canonicalize {
if ($uri =~ /^mailto:/i || $uri =~ /^[^:]*\@/) {
# Strip ?subject= parameters and obfuscations
# Outlook linkifies foo@bar%2Ecom&x.com to foo@bar.com !!
- if ($nuri =~ /^([^@]+\@[^?]+)\?/) {
+ if ($nuri =~ /^([^\@]+\@[^?]+)\?/) {
push @nuris, $1;
}
- if ($nuri =~ /^([^@]+\@[^&]+)\&/) {
+ if ($nuri =~ /^([^\@]+\@[^&]+)\&/) {
push @nuris, $1
}
+ # mailto:"Foo%20Bar"%20<fo...@example.com>
+ if ($nuri =~ /^[^?&]*<([^\@>]+\@[^>]+)>/) {
+ push @nuris, "mailto:$1";
+ }
# End email processing
next;
}