You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2019/08/14 08:34:58 UTC

svn commit: r1865095 - in /spamassassin: branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm branches/3.4/lib/Mail/SpamAssassin/Util.pm trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm trunk/lib/Mail/SpamAssassin/Util.pm

Author: hege
Date: Wed Aug 14 08:34:58 2019
New Revision: 1865095

URL: http://svn.apache.org/viewvc?rev=1865095&view=rev
Log:
More email uri parser tweaks

Modified:
    spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm
    spamassassin/branches/3.4/lib/Mail/SpamAssassin/Util.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm

Modified: spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=1865095&r1=1865094&r2=1865095&view=diff
==============================================================================
--- spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm Wed Aug 14 08:34:58 2019
@@ -2152,7 +2152,7 @@ sub _tbirdurire {
   # schemeless regexp looks for a valid TLD at the end of what may be a FQDN, followed by optional ., optional :portnum, optional /rest_of_uri
   my $urischemeless = qr/([a-z\d][a-z\d._-]{0,251}\.${tldsRE})\.?(?::\d{1,5})?(?:\/[^$tbirdenddelim]{1,251})?/i;
   my $uriknownscheme = qr/(?:(?:https?|ftp):\/\/|(?:www\d{0,2}|ftp)\.)[^$tbirdenddelim]{1,251}/i;
-  my $urimailscheme = qr/(?:mailto:)?[^$tbirdenddelimplusat]{1,251}\@[^$tbirdenddelimemail]{1,251}/i;
+  my $urimailscheme = qr/(?:mailto:[^$tbirdenddelimemail]{1,2048}|[^$tbirdenddelimplusat]{1,251}\@[^$tbirdenddelimemail]{1,251})/i;
 
   $self->{tbirdurire} = qr/(?:\b|(?<=$iso2022shift)|(?<=[$tbirdstartdelim]))
                         (?:(?:($uriknownscheme)(?=(?:[$tbirdenddelim]|\z))) |
@@ -2386,8 +2386,10 @@ sub _get_parsed_uri_list {
     my ($rulename, $pat, @uris);
     my $text;
     my $tbirdurire = $self->_tbirdurire;
+    my %seen;
+    my $would_log_uri_all = would_log('dbg', 'uri-all') == 2; # cache
 
-    for my $entry (@$textary) {
+    foreach my $entry (@$textary) {
 
       # a workaround for [perl #69973] bug:
       # Invalid and tainted utf-8 char crashes perl 5.10.1 in regexp evaluation
@@ -2405,6 +2407,11 @@ sub _get_parsed_uri_list {
         $rawuri =~ s/(^[^(]*)\).*$/$1/;  # as per ThunderBird, ) is an end delimiter if there is no ( preceeding it
         $rawuri =~ s/[-~!@#^&*()_+=:;\'?,.]*$//; # remove trailing string of punctuations that TBird ignores
 
+        next if exists $seen{$rawuri};
+        $seen{$rawuri} = 1;
+
+        dbg("uri: found rawuri from text ($rawtype): $rawuri") if $would_log_uri_all;
+
         # Quick ignore if schemeless host not valid
         next if defined $schost && !is_fqdn_valid($schost);
 
@@ -2419,8 +2426,6 @@ sub _get_parsed_uri_list {
         # skip if there is '..' in the hostname portion of the URI, something we can't catch in the general URI regexp
         next if $rawuri =~ m{^(?:(?:https?|ftp|mailto):(?://)?)?(?:[^\@/?#]*\@)?[^/?#:]*\.\.}i;
 
-        dbg("uri: found rawuri ($rawtype): $rawuri");
-
         # If it's a hostname that was just sitting out in the
         # open, without a protocol, and not inside of an HTML tag,
         # the we should add the proper protocol in front, rather
@@ -2442,10 +2447,11 @@ sub _get_parsed_uri_list {
             # And this is linkified: foo@bar%2Ecom?foo.com&bar  (woot??)
             # And this is linkified with Outlook: foo@bar%2Ecom&foo  (woot??)
             # Don't test when ? or & exists, canonicalizing will handle later.
-            $uri =~ s/^(?:skype|e?-?mail)?:+//i; # strip common misparses
             if ($uri !~ tr/?&// && $uri =~ /\@(.*)/) {
               next unless $self->{main}->{registryboundaries}->is_domain_valid($1);
             }
+            next if index($uri, '&nbsp;') != -1; # ignore garbled
+            $uri =~ s/^(?:skype|e?-?mail)?:+//i; # strip common misparses
             $uri = "mailto:$uri";
           }
           else {
@@ -2455,10 +2461,13 @@ sub _get_parsed_uri_list {
           }
         }
 
-        if ($uri =~ /^mailto:/i) {
-          # skip a mail link that does not have a valid TLD or @ after decoding any URLEncoded characters
-          $uri = Mail::SpamAssassin::Util::url_encode($uri) if ($uri =~ /\%(?:2[1-9a-fA-F]|[3-6][0-9a-fA-F]|7[0-9a-eA-E])/);
-          next unless $uri =~ /\@/;
+        if ($uri =~ /^mailto:/i) { # Schemed mailto: handled different from schemeless
+          # MUAs linkify and urldecode mailto:foo%40bar%2Fcom
+          $uri = Mail::SpamAssassin::Util::url_encode($uri) if $uri =~ /\%[0-9a-f]{2}/i;
+          # Skip unless @ found after decoding, then check tld is valid
+          next unless $uri =~ /\@([^?&>]*)/;
+          next unless $self->{main}->{registryboundaries}->is_domain_valid($1);
+          # SA 3.4 legacy code continues
           my $domuri = $self->{main}->{registryboundaries}->uri_to_domain($uri);
           next unless $domuri;
           push (@uris, $rawuri);

Modified: spamassassin/branches/3.4/lib/Mail/SpamAssassin/Util.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.4/lib/Mail/SpamAssassin/Util.pm?rev=1865095&r1=1865094&r2=1865095&view=diff
==============================================================================
--- spamassassin/branches/3.4/lib/Mail/SpamAssassin/Util.pm (original)
+++ spamassassin/branches/3.4/lib/Mail/SpamAssassin/Util.pm Wed Aug 14 08:34:58 2019
@@ -1334,12 +1334,16 @@ sub uri_list_canonicalize {
     if ($uri =~ /^mailto:/i || $uri =~ /^[^:]*\@/) {
       # Strip ?subject= parameters and obfuscations
       # Outlook linkifies foo@bar%2Ecom&x.com to foo@bar.com !!
-      if ($nuri =~ /^(.*?)\?/) {
+      if ($nuri =~ /^([^\@]+\@[^?]+)\?/) {
         push @nuris, $1;
       }
-      if ($nuri =~ /^(.*?)\&/) {
+      if ($nuri =~ /^([^\@]+\@[^&]+)\&/) {
         push @nuris, $1
       }
+      # mailto:"Foo%20Bar"%20<foo.bar@example.com>
+      if ($nuri =~ /^[^?&]*<([^\@>]+\@[^>]+)>/) {
+        push @nuris, "mailto:$1";
+      }
       # End email processing
       next;
     }

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=1865095&r1=1865094&r2=1865095&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Wed Aug 14 08:34:58 2019
@@ -2284,7 +2284,7 @@ sub _tbirdurire {
   # schemeless regexp looks for a valid TLD at the end of what may be a FQDN, followed by optional ., optional :portnum, optional /rest_of_uri
   my $urischemeless = qr/([a-z\d][a-z\d._-]{0,251}\.${tldsRE})\.?(?::\d{1,5})?(?:\/[^$tbirdenddelim]{1,251})?/i;
   my $uriknownscheme = qr/(?:(?:https?|ftp):\/\/|(?:www\d{0,2}|ftp)\.)[^$tbirdenddelim]{1,251}/i;
-  my $urimailscheme = qr/(?:mailto:)?[^$tbirdenddelimplusat]{1,251}\@[^$tbirdenddelimemail]{1,251}/i;
+  my $urimailscheme = qr/(?:mailto:[^$tbirdenddelimemail]{1,2048}|[^$tbirdenddelimplusat]{1,251}\@[^$tbirdenddelimemail]{1,251})/i;
 
   $self->{tbirdurire} = qr/(?:\b|(?<=$iso2022shift)|(?<=[$tbirdstartdelim]))
                         (?:(?:($uriknownscheme)(?=(?:[$tbirdenddelim]|\z))) |
@@ -2413,6 +2413,7 @@ sub _process_text_uri_list {
   my $textary = $self->get_decoded_stripped_body_text_array();
   my $tbirdurire = $self->_tbirdurire;
   my %seen;
+  my $would_log_uri_all = would_log('dbg', 'uri-all') == 2; # cache
 
   foreach my $text (@$textary) {
     # a workaround for [perl #69973] bug:
@@ -2434,6 +2435,8 @@ sub _process_text_uri_list {
       next if exists $seen{$rawuri};
       $seen{$rawuri} = 1;
 
+      dbg("uri: found rawuri from text ($rawtype): $rawuri") if $would_log_uri_all;
+
       # Quick ignore if schemeless host not valid
       next if defined $schost && !is_fqdn_valid($schost, 1);
 
@@ -2445,7 +2448,6 @@ sub _process_text_uri_list {
       # Ignore empty uris
       next if $rawuri =~ /^\w+:\/{0,2}$/i;
 
-      dbg("uri: found rawuri from text ($rawtype): $rawuri");
       my $types = {parsed => 1};
 
       # If it's a hostname that was just sitting out in the
@@ -2468,10 +2470,13 @@ sub _process_text_uri_list {
           # And this is linkified: foo@bar%2Ecom?foo.com&bar  (woot??)
           # And this is linkified with Outlook: foo@bar%2Ecom&foo  (woot??)
           # Don't test when ? or & exists, canonicalizing will handle later.
-          $uri =~ s/^(?:skype|e?-?mail)?:+//i; # strip common misparses
           if ($uri !~ tr/?&// && $uri =~ /\@(.*)/) {
             next unless $self->{main}->{registryboundaries}->is_domain_valid($1);
           }
+          next if index($uri, '&nbsp;') != -1; # ignore garbled
+          $uri =~ s/^(?:skype|e?-?mail)?:+//i; # strip common misparses
+          # Urldecode now
+          $uri = Mail::SpamAssassin::Util::url_encode($uri) if $uri =~ /\%[0-9a-f]{2}/i;
           $uri = "mailto:$uri";
         }
         else {
@@ -2484,12 +2489,16 @@ sub _process_text_uri_list {
         # Mark any of those schemeless
         $types->{schemeless} = 1;
       }
-
-      if ($uri =~ /^mailto:/i) {
-        # skip a mail link that does not have a valid TLD or @ after decoding any URLEncoded characters
-        $uri = Mail::SpamAssassin::Util::url_encode($uri) if ($uri =~ /\%(?:2[1-9a-fA-F]|[3-6][0-9a-fA-F]|7[0-9a-eA-E])/);
+      elsif ($uri =~ /^mailto:/i) { # Schemed mailto: handled different from schemeless
+        # MUAs linkify and urldecode mailto:foo%40bar%2Fcom
+        $uri = Mail::SpamAssassin::Util::url_encode($uri) if $uri =~ /\%[0-9a-f]{2}/i;
+        # Skip unless @ found after decoding, then check tld is valid
+        next unless $uri =~ /\@([^?&>]*)/;
+        next unless $self->{main}->{registryboundaries}->is_domain_valid($1);
       }
 
+      dbg("uri: parsed uri from text ($rawtype): $uri") if $would_log_uri_all;
+
       $self->add_uri_detail_list($uri, $types, 'parsed', 1);
     }
   }

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm?rev=1865095&r1=1865094&r2=1865095&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm Wed Aug 14 08:34:58 2019
@@ -1496,12 +1496,16 @@ sub uri_list_canonicalize {
     if ($uri =~ /^mailto:/i || $uri =~ /^[^:]*\@/) {
       # Strip ?subject= parameters and obfuscations
       # Outlook linkifies foo@bar%2Ecom&x.com to foo@bar.com !!
-      if ($nuri =~ /^([^@]+\@[^?]+)\?/) {
+      if ($nuri =~ /^([^\@]+\@[^?]+)\?/) {
         push @nuris, $1;
       }
-      if ($nuri =~ /^([^@]+\@[^&]+)\&/) {
+      if ($nuri =~ /^([^\@]+\@[^&]+)\&/) {
         push @nuris, $1
       }
+      # mailto:"Foo%20Bar"%20<foo.bar@example.com>
+      if ($nuri =~ /^[^?&]*<([^\@>]+\@[^>]+)>/) {
+        push @nuris, "mailto:$1";
+      }
       # End email processing
       next;
     }