You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2019/08/21 09:19:39 UTC

svn commit: r1865612 - in /spamassassin: branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm

Author: hege
Date: Wed Aug 21 09:19:39 2019
New Revision: 1865612

URL: http://svn.apache.org/viewvc?rev=1865612&view=rev
Log:
Improve schemeless uri parser start boundary

Modified:
    spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm

Modified: spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=1865612&r1=1865611&r2=1865612&view=diff
==============================================================================
--- spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/branches/3.4/lib/Mail/SpamAssassin/PerMsgStatus.pm Wed Aug 21 09:19:39 2019
@@ -2140,6 +2140,9 @@ sub _tbirdurire {
   my $tbirdenddelim = '><"`}\]{[|\s' . "\x1b\xa0";  # The \x1b as per bug 4522 # \xa0 (nbsp) added 7/2019
   my $nonASCII    = '\x80-\xff';
 
+  # schemeless uri start delimiter, combo of most punctuations and delims above
+  my $scstartdelim = qr/[\!\"\#\$\&\'\(\)\*\+\,\/\:\;\<\=\>\?\@\[\\\]\^\`\{\|\}\~\s\x1b\xa0]/;
+
   # bug 7100: we allow a comma to delimit the end of an email address because it will never appear in a domain name, and
   # it's a common thing to find in text
   my $tbirdenddelimemail = $tbirdenddelim . ',(\'' . $nonASCII;  # tbird ignores non-ASCII mail addresses for now, until RFC changes
@@ -2157,7 +2160,7 @@ sub _tbirdurire {
   $self->{tbirdurire} = qr/(?:\b|(?<=$iso2022shift)|(?<=[$tbirdstartdelim]))
                         (?:(?:($uriknownscheme)(?=(?:[$tbirdenddelim]|\z))) |
                         (?:($urimailscheme)(?=(?:[$tbirdenddelimemail]|\z))) |
-                        (?:(?<![a-z\d._-])($urischemeless)(?=(?:[$tbirdenddelim]|\z))))/ix;
+                        (?:(?:^|(?<=$scstartdelim))($urischemeless)(?=(?:[$tbirdenddelim]|\z))))/ix;
 
   return $self->{tbirdurire};
 }

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=1865612&r1=1865611&r2=1865612&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Wed Aug 21 09:19:39 2019
@@ -2272,6 +2272,9 @@ sub _tbirdurire {
   my $tbirdenddelim = '><"`}\]{[|\s' . "\x1b\xa0";  # The \x1b as per bug 4522 # \xa0 (nbsp) added 7/2019
   my $nonASCII    = '\x80-\xff';
 
+  # schemeless uri start delimiter, combo of most punctuations and delims above
+  my $scstartdelim = qr/[\!\"\#\$\&\'\(\)\*\+\,\/\:\;\<\=\>\?\@\[\\\]\^\`\{\|\}\~\s\x1b\xa0]/;
+
   # bug 7100: we allow a comma to delimit the end of an email address because it will never appear in a domain name, and
   # it's a common thing to find in text
   my $tbirdenddelimemail = $tbirdenddelim . ',(\'' . $nonASCII;  # tbird ignores non-ASCII mail addresses for now, until RFC changes
@@ -2289,7 +2292,7 @@ sub _tbirdurire {
   $self->{tbirdurire} = qr/(?:\b|(?<=$iso2022shift)|(?<=[$tbirdstartdelim]))
                         (?:(?:($uriknownscheme)(?=(?:[$tbirdenddelim]|\z))) |
                         (?:($urimailscheme)(?=(?:[$tbirdenddelimemail]|\z))) |
-                        (?:(?<![a-z\d._-])($urischemeless)(?=(?:[$tbirdenddelim]|\z))))/ix;
+                        (?:(?:^|(?<=$scstartdelim))($urischemeless)(?=(?:[$tbirdenddelim]|\z))))/ix;
 
   return $self->{tbirdurire};
 }