You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by mm...@apache.org on 2015/03/11 01:03:25 UTC

svn commit: r1665744 - /spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm

Author: mmartinec
Date: Wed Mar 11 00:03:24 2015
New Revision: 1665744

URL: http://svn.apache.org/r1665744
Log:
Bug 6751: Certain Character Sets can use alternate characters for period that then bypass DNSBL Checks

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm?rev=1665744&r1=1665743&r2=1665744&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm Wed Mar 11 00:03:24 2015
@@ -1268,15 +1268,15 @@ sub uri_list_canonicalize {
     # bug 4390: certain MUAs treat back slashes as front slashes.
     # since backslashes are supposed to be encoded in a URI, swap non-encoded
     # ones with front slashes.
-    $nuri =~ tr@\\@/@;
+    $nuri =~ tr{\\}{/};
 
     # http:www.foo.biz -> http://www.foo.biz
-    $nuri =~ s#^(https?:)/{0,2}#$1//#i;
+    $nuri =~ s{^(https?:)/{0,2}}{$1//}i;
 
     # *always* make a dup with all %-encoding decoded, since
     # important parts of the URL may be encoded (such as the
     # scheme). (bug 4213)
-    if ($nuri =~ /\%[0-9a-fA-F]{2}/) {
+    if ($nuri =~ /%[0-9a-fA-F]{2}/) {
       $nuri = Mail::SpamAssassin::Util::url_encode($nuri);
     }
 
@@ -1284,15 +1284,15 @@ sub uri_list_canonicalize {
     # unschemed URIs: assume default of "http://" as most MUAs do
     if ($nuri !~ /^[-_a-z0-9]+:/i) {
       if ($nuri =~ /^ftp\./) {
-	$nuri =~ s@^@ftp://@g;
+	$nuri =~ s{^}{ftp://}g;
       }
       else {
-	$nuri =~ s@^@http://@g;
+	$nuri =~ s{^}{http://}g;
       }
     }
 
     # http://www.foo.biz?id=3 -> http://www.foo.biz/?id=3
-    $nuri =~ s@^(https?://[^/?]+)\?@$1/?@i;
+    $nuri =~ s{^(https?://[^/?]+)\?}{$1/?}i;
 
     # deal with encoding of chars, this is just the set of printable
     # chars minus ' ' (that is, dec 33-126, hex 21-7e)
@@ -1311,6 +1311,21 @@ sub uri_list_canonicalize {
       # not required
       $rest ||= '';
 
+      # Bug 6751:
+      # RFC 3490 (IDNA): Whenever dots are used as label separators, the
+      #   following characters MUST be recognized as dots: U+002E (full stop),
+      #   U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
+      #   U+FF61 (halfwidth ideographic full stop).
+      # RFC 5895: [...] the IDEOGRAPHIC FULL STOP character (U+3002)
+      #   can be mapped to the FULL STOP before label separation occurs.
+      #   [...] Only the IDEOGRAPHIC FULL STOP character (U+3002) is added in
+      #   this mapping because the authors have not fully investigated [...]
+      # Also map SMALL FULL STOP (U+FE52), seen in the wild; matched as UTF-8 octets.
+      if ($host =~ s{(?: \xE3\x80\x82 | \xEF\xBC\x8E | \xEF\xBD\xA1 |
+                         \xEF\xB9\x92 )}{.}xgs) {
+        push(@nuris, join ('', $proto, $host, $rest));
+      }
+
       # bug 4146: deal with non-US ASCII 7-bit chars in the host portion
       # of the URI according to RFC 1738 that's invalid, and the tested
       # browsers (Firefox, IE) remove them before usage...