You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by mm...@apache.org on 2015/08/05 17:47:35 UTC
svn commit: r1694252 - in /spamassassin/trunk/lib/Mail/SpamAssassin: AsyncLoop.pm Dns.pm DnsResolver.pm Plugin/AskDNS.pm Plugin/HeaderEval.pm Plugin/URIDNSBL.pm Util.pm

Author: mmartinec
Date: Wed Aug  5 15:47:34 2015
New Revision: 1694252

URL: http://svn.apache.org/r1694252
Log:
Bug 7215: Towards supporting IDNA (Internationalizing Domain Names in Applications) - introduce MS::Util::idn_to_ascii()

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/AsyncLoop.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Dns.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/DnsResolver.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/AskDNS.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/AsyncLoop.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/AsyncLoop.pm?rev=1694252&r1=1694251&r2=1694252&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/AsyncLoop.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/AsyncLoop.pm Wed Aug  5 15:47:34 2015
@@ -257,6 +257,16 @@ filled-in with a query ID.
 
 sub bgsend_and_start_lookup {
   my($self, $domain, $type, $class, $ent, $cb, %options) = @_;
+
+  # At this point the $domain should already be encoded to UTF-8 and
+  # IDN converted to ASCII-compatible encoding (ACE).  Make sure this is
+  # really the case in order to be able to catch any leftover omissions.
+  if (utf8::is_utf8($domain)) {
+    warn "bgsend_and_start_lookup: domain name in Unicode, expected octets: $domain\n";
+  } elsif ($domain =~ tr/\x00-\x7F//c) {  # is not all-ASCII
+    info("bgsend_and_start_lookup: non-ASCII domain name: %s", $domain);
+  }
+
   $ent = {}  if !$ent;
   $domain =~ s/\.+\z//s;  # strip trailing dots, these sometimes still sneak in
   $ent->{id} = undef;

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Dns.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Dns.pm?rev=1694252&r1=1694251&r2=1694252&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Dns.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Dns.pm Wed Aug  5 15:47:34 2015
@@ -29,7 +29,7 @@ use Mail::SpamAssassin::Conf;
 use Mail::SpamAssassin::PerMsgStatus;
 use Mail::SpamAssassin::AsyncLoop;
 use Mail::SpamAssassin::Constants qw(:ip);
-use Mail::SpamAssassin::Util qw(untaint_var am_running_on_windows);
+use Mail::SpamAssassin::Util qw(untaint_var am_running_on_windows idn_to_ascii);
 
 use File::Spec;
 use IO::Socket;
@@ -101,6 +101,7 @@ BEGIN {
 sub do_rbl_lookup {
   my ($self, $rule, $set, $type, $host, $subtest) = @_;
 
+  $host = idn_to_ascii($host);
   $host =~ s/\.\z//s;  # strip a redundant trailing dot
   my $key = "dns:$type:$host";
   my $existing_ent = $self->{async}->get_lookup($key);
@@ -145,6 +146,7 @@ sub register_rbl_subtest {
 sub do_dns_lookup {
   my ($self, $rule, $type, $host) = @_;
 
+  $host = idn_to_ascii($host);
   $host =~ s/\.\z//s;  # strip a redundant trailing dot
   my $key = "dns:$type:$host";
 

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/DnsResolver.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/DnsResolver.pm?rev=1694252&r1=1694251&r2=1694252&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/DnsResolver.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/DnsResolver.pm Wed Aug  5 15:47:34 2015
@@ -45,7 +45,7 @@ require 5.008001;  # needs utf8::is_utf8
 use Mail::SpamAssassin;
 use Mail::SpamAssassin::Logger;
 use Mail::SpamAssassin::Constants qw(:ip);
-use Mail::SpamAssassin::Util qw(untaint_var decode_dns_question_entry);
+use Mail::SpamAssassin::Util qw(untaint_var decode_dns_question_entry idn_to_ascii);
 
 use Socket;
 use Errno qw(EADDRINUSE EACCES);
@@ -878,8 +878,9 @@ sub send {
   # using some arbitrary encoding (they are normally just 7-bit ascii
   # characters anyway, just need to get rid of the utf8 flag).  Bug 6959
   # Most if not all af these come from a SPF plugin.
+  #   (was a call to utf8::encode($name), now we prefer a proper idn_to_ascii)
   #
-  utf8::encode($name);
+  $name = idn_to_ascii($name);
 
   my $retrans = $self->{retrans};
   my $retries = $self->{retry};

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/AskDNS.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/AskDNS.pm?rev=1694252&r1=1694251&r2=1694252&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/AskDNS.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/AskDNS.pm Wed Aug  5 15:47:34 2015
@@ -189,7 +189,7 @@ use warnings;
 use re 'taint';
 
 use Mail::SpamAssassin::Plugin;
-use Mail::SpamAssassin::Util qw(decode_dns_question_entry);
+use Mail::SpamAssassin::Util qw(decode_dns_question_entry idn_to_ascii);
 use Mail::SpamAssassin::Logger;
 
 use vars qw(@ISA %rcode_value $txtdata_can_provide_a_list);
@@ -465,6 +465,7 @@ OUTER:
       $query_domain =~ s{_([A-Z][A-Z0-9]*)_}
                         { defined $current_tag_val{$1} ? $current_tag_val{$1}
                                                        : '' }ge;
+      $query_domain = idn_to_ascii($query_domain);
 
       # the $dnskey identifies this query in AsyncLoop's pending_lookups
       my $dnskey = join(':', 'askdns', $query_type, $query_domain);

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm?rev=1694252&r1=1694251&r2=1694252&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm Wed Aug  5 15:47:34 2015
@@ -25,6 +25,8 @@ use Errno qw(EBADF);
 
 use Mail::SpamAssassin::Plugin;
 use Mail::SpamAssassin::Locales;
+use Mail::SpamAssassin::Util qw(get_my_locales parse_rfc822_date
+                                idn_to_ascii is_valid_utf_8);
 use Mail::SpamAssassin::Logger;
 use Mail::SpamAssassin::Constants qw(:sa :ip);
 
@@ -276,6 +278,17 @@ sub check_illegal_chars {
     $str =~ s/^(?:Subject|From):.*$//gmi;
   }
 
+  if ($str =~ tr/\x00-\x7F//c && is_valid_utf_8($str)) {
+    # is non-ASCII and is valid UTF-8
+    if ($str =~ tr/\x00-\x08\x0B\x0C\x0E-\x1F//) {
+      dbg("eval: %s is valid UTF-8 but contains controls: %s", $header, $str);
+    } else {
+      # todo: only with a SMTPUTF8 mail
+      dbg("eval: %s is valid UTF-8: %s", $header, $str);
+      return 0;
+    }
+  }
+
   # count illegal substrings (RFC 2045)
   # (non-ASCII + C0 controls except TAB, NL, CR)
   my $illegal = $str =~ tr/\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff//;
@@ -1037,6 +1050,7 @@ sub check_ratware_envelope_from {
 
   if ($to =~ /^([^@]+)@(.+)$/) {
     my($user,$dom) = ($1,$2);
+    $dom = idn_to_ascii($dom);
     $dom = $self->{main}->{registryboundaries}->trim_domain($dom);
     return unless
         ($self->{main}->{registryboundaries}->is_domain_valid($dom));

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm?rev=1694252&r1=1694251&r2=1694252&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm Wed Aug  5 15:47:34 2015
@@ -294,7 +294,7 @@ package Mail::SpamAssassin::Plugin::URID
 
 use Mail::SpamAssassin::Plugin;
 use Mail::SpamAssassin::Constants qw(:ip);
-use Mail::SpamAssassin::Util;
+use Mail::SpamAssassin::Util qw(idn_to_ascii);
 use Mail::SpamAssassin::Logger;
 use strict;
 use warnings;
@@ -901,6 +901,7 @@ sub query_hosts_or_domains {
 sub lookup_domain_ns {
   my ($self, $pms, $obj, $dom, $rulename) = @_;
 
+  $dom = idn_to_ascii($dom);
   my $key = "NS:" . $dom;
   my $ent = {
     key => $key, zone => $dom, obj => $obj, type => "URI-NS",
@@ -986,6 +987,7 @@ sub complete_ns_lookup {
 sub lookup_a_record {
   my ($self, $pms, $obj, $hname, $rulename) = @_;
 
+  $hname = idn_to_ascii($hname);
   my $key = "A:" . $hname;
   my $ent = {
     key => $key, zone => $hname, obj => $obj, type => "URI-A",
@@ -1054,6 +1056,7 @@ sub lookup_dnsbl_for_ip {
 sub lookup_single_dnsbl {
   my ($self, $pms, $obj, $rulename, $lookupstr, $dnsbl, $qtype) = @_;
 
+  $dnsbl = idn_to_ascii($dnsbl);
   my $key = "DNSBL:" . $lookupstr . ':' . $dnsbl;
   my $ent = {
     key => $key, zone => $dnsbl, obj => $obj, type => 'URI-DNSBL',

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm?rev=1694252&r1=1694251&r2=1694252&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm Wed Aug  5 15:47:34 2015
@@ -62,7 +62,9 @@ BEGIN {
   @EXPORT_OK = qw(&local_tz &base64_decode &untaint_var &untaint_file_path
                   &exit_status_str &proc_status_ok &am_running_on_windows
                   &reverse_ip_address &decode_dns_question_entry
-                  &secure_tmpfile &secure_tmpdir &uri_list_canonicalize);
+                  &secure_tmpfile &secure_tmpdir &uri_list_canonicalize
+                  &get_my_locales &parse_rfc822_date &idn_to_ascii
+                  &is_valid_utf_8);
 }
 
 use Mail::SpamAssassin;
@@ -74,6 +76,7 @@ use File::Basename;
 use Time::Local;
 use Sys::Hostname (); # don't import hostname() into this namespace!
 use NetAddr::IP 4.000;
+use Scalar::Util qw(tainted);
 use Fcntl;
 use Errno qw(ENOENT EACCES EEXIST);
 use POSIX qw(:sys_wait_h WIFEXITED WIFSIGNALED WIFSTOPPED WEXITSTATUS
@@ -96,6 +99,43 @@ BEGIN {
 
 ###########################################################################
 
+our $ALT_FULLSTOP_UTF8_RE;
+BEGIN {
+  # Bug 6751:
+  # RFC 3490 (IDNA): Whenever dots are used as label separators, the
+  #   following characters MUST be recognized as dots: U+002E (full stop),
+  #   U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
+  #   U+FF61 (halfwidth ideographic full stop).
+  # RFC 5895: [...] the IDEOGRAPHIC FULL STOP character (U+3002)
+  #   can be mapped to the FULL STOP before label separation occurs.
+  #   [...] Only the IDEOGRAPHIC FULL STOP character (U+3002) is added in
+  #   this mapping because the authors have not fully investigated [...]
+  # Adding also 'SMALL FULL STOP' (U+FE52) as seen in the wild,
+  # and a 'ONE DOT LEADER' (U+2024).
+  #
+  my $dot_chars = "\x{2024}\x{3002}\x{FF0E}\x{FF61}\x{FE52}";  # \x{002E}
+  my $dot_bytes = join('|', split(//,$dot_chars));  utf8::encode($dot_bytes);
+  $ALT_FULLSTOP_UTF8_RE = qr/$dot_bytes/so;
+}
+
+###########################################################################
+
+our $enc_utf8;
+BEGIN {
+  eval { require Encode }
+    and do { $enc_utf8 = Encode::find_encoding('UTF-8') }
+};
+
+our $have_libidn;
+BEGIN {
+  eval { require Net::LibIDN } and do { $have_libidn = 1 };
+}
+
+$have_libidn or warn "INFO: module Net::LibIDN not available,\n".
+  "  internationalized domain names with U-labels will not be recognized!\n";
+
+###########################################################################
+
 # find an executable in the current $PATH (or whatever for that platform)
 {
   # Show the PATH we're going to explore only once.
@@ -338,6 +378,93 @@ sub taint_var {
 
 ###########################################################################
 
+# returns true if the provided string of octets represents a syntactically
+# valid UTF-8 string, otherwise a false is returned
+#
+sub is_valid_utf_8($) {
+# my $octets = $_[0];
+  return undef if !defined $_[0];
+  #
+  # RFC 6532: UTF8-non-ascii = UTF8-2 / UTF8-3 / UTF8-4
+  # RFC 3629 section 4: Syntax of UTF-8 Byte Sequences
+  #   UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
+  #   UTF8-1      = %x00-7F
+  #   UTF8-2      = %xC2-DF UTF8-tail
+  #   UTF8-3      = %xE0 %xA0-BF UTF8-tail /
+  #                 %xE1-EC 2( UTF8-tail ) /
+  #                 %xED %x80-9F UTF8-tail /
+  #                   # U+D800..U+DFFF are utf16 surrogates, not legal utf8
+  #                 %xEE-EF 2( UTF8-tail )
+  #   UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) /
+  #                 %xF1-F3 3( UTF8-tail ) /
+  #                 %xF4 %x80-8F 2( UTF8-tail )
+  #   UTF8-tail   = %x80-BF
+  #
+  # loose variant:
+  #   [\x00-\x7F] | [\xC0-\xDF][\x80-\xBF] |
+  #   [\xE0-\xEF][\x80-\xBF]{2} | [\xF0-\xF4][\x80-\xBF]{3}
+  #
+  $_[0] =~ /^ (?: [\x00-\x7F] |
+                  [\xC2-\xDF] [\x80-\xBF] |
+                  \xE0 [\xA0-\xBF] [\x80-\xBF] |
+                  [\xE1-\xEC] [\x80-\xBF]{2} |
+                  \xED [\x80-\x9F] [\x80-\xBF] |
+                  [\xEE-\xEF] [\x80-\xBF]{2} |
+                  \xF0 [\x90-\xBF] [\x80-\xBF]{2} |
+                  [\xF1-\xF3] [\x80-\xBF]{3} |
+                  \xF4 [\x80-\x8F] [\x80-\xBF]{2} )* \z/xs ? 1 : 0;
+}
+
+# Given an international domain name with U-labels (UTF-8 or Unicode chars)
+# converts it to ASCII-compatible encoding (ACE).  If the argument is in
+# ASCII (or is an invalid IDN), returns it lowercased but otherwise unchanged.
+# The result is always in octets (utf8 flag off) even if the argument was in
+# Unicode characters.
+#
+sub idn_to_ascii($) {
+  no bytes;  # make sure there is no 'use bytes' in effect
+  return undef  if !defined $_[0];
+  my $s = "$_[0]";  # stringify
+  # propagate taintedness of the argument, but not its utf8 flag
+  my $t = tainted($s);  # taintedness of the argument
+  if ($t) {  # untaint $s, avoids taint-related bugs in LibIDN or in old perl
+    no re 'taint';  local $1;  $s =~ /^(.*)\z/s;
+  }
+  # encode chars to UTF-8, leave octets unchanged (not necessarily valid UTF-8)
+  utf8::encode($s)  if utf8::is_utf8($s);
+  if ($s !~ tr/\x00-\x7F//c) {  # is all-ASCII (including IP address literal)
+    $s = lc $s;
+  } elsif (!is_valid_utf_8($s)) {
+    my($package, $filename, $line) = caller;
+    info("util: idn_to_ascii: not valid UTF-8: /%s/, called from %s line %d",
+         $s, $package, $line);
+    $s = lc $s;  # garbage-in / garbage-out
+  } else {
+    my $chars;
+    # RFC 3490 (IDNA): Whenever dots are used as label separators, the
+    # following characters MUST be recognized as dots: U+002E (full stop),
+    # U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
+    # U+FF61 (halfwidth ideographic full stop).
+    if ($s =~ s/$ALT_FULLSTOP_UTF8_RE/./gso) {
+      info("util: idn_to_ascii: alternative dots normalized: /%s/ -> /%s/",
+           $_[0], $s);
+    }
+    if ($have_libidn) {
+      # to ASCII-compatible encoding (ACE), lowercased
+      my $sa = Net::LibIDN::idn_to_ascii($s, 'UTF-8');
+      if (!defined $sa) {
+        info("util: idn_to_ascii: conversion to ACE failed: /%s/", $s);
+      } else {
+        info("util: idn_to_ascii: converted to ACE: /%s/ -> /%s/", $s, $sa);
+        $s = $sa;
+      }
+    }
+  }
+  $t ? taint_var($s) : $s;  # propagate taintedness of the argument
+}
+
+###########################################################################
+
 # map process termination status number to an informative string, and
 # append optional mesage (dual-valued errno or a string or a number),
 # returning the resulting string
@@ -1314,20 +1441,10 @@ sub uri_list_canonicalize {
       # not required
       $rest ||= '';
 
-      # Bug 6751:
-      # RFC 3490 (IDNA): Whenever dots are used as label separators, the
-      #   following characters MUST be recognized as dots: U+002E (full stop),
-      #   U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
-      #   U+FF61 (halfwidth ideographic full stop).
-      # RFC 5895: [...] the IDEOGRAPHIC FULL STOP character (U+3002)
-      #   can be mapped to the FULL STOP before label separation occurs.
-      #   [...] Only the IDEOGRAPHIC FULL STOP character (U+3002) is added in
-      #   this mapping because the authors have not fully investigated [...]
-      # Adding also 'SMALL FULL STOP' (U+FE52) as seen in the wild.
-      # Parhaps also the 'ONE DOT LEADER' (U+2024).
-      if ($host =~ s{(?: \xE3\x80\x82 | \xEF\xBC\x8E | \xEF\xBD\xA1 |
-                         \xEF\xB9\x92 | \xE2\x80\xA4 )}{.}xgs) {
-        push(@nuris, join ('', $proto, $host, $rest));
+      my $nhost = idn_to_ascii($host);
+      if (defined $nhost && $nhost ne lc $host) {
+        push(@nuris, join('', $proto, $nhost, $rest));
+        $host = $nhost;
       }
 
       # bug 4146: deal with non-US ASCII 7-bit chars in the host portion