You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by mm...@apache.org on 2015/08/05 17:47:35 UTC
svn commit: r1694252 - in /spamassassin/trunk/lib/Mail/SpamAssassin:
AsyncLoop.pm Dns.pm DnsResolver.pm Plugin/AskDNS.pm Plugin/HeaderEval.pm
Plugin/URIDNSBL.pm Util.pm
Author: mmartinec
Date: Wed Aug 5 15:47:34 2015
New Revision: 1694252
URL: http://svn.apache.org/r1694252
Log:
Bug 7215: Towards supporting IDNA (Internationalizing Domain Names in Applications) - introduce MS::Util::idn_to_ascii()
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/AsyncLoop.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Dns.pm
spamassassin/trunk/lib/Mail/SpamAssassin/DnsResolver.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/AskDNS.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/AsyncLoop.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/AsyncLoop.pm?rev=1694252&r1=1694251&r2=1694252&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/AsyncLoop.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/AsyncLoop.pm Wed Aug 5 15:47:34 2015
@@ -257,6 +257,16 @@ filled-in with a query ID.
sub bgsend_and_start_lookup {
my($self, $domain, $type, $class, $ent, $cb, %options) = @_;
+
+ # At this point the $domain should already be encoded to UTF-8 and
+ # IDN converted to ASCII-compatible encoding (ACE). Make sure this is
+ # really the case in order to be able to catch any leftover omissions.
+ if (utf8::is_utf8($domain)) {
+ warn "bgsend_and_start_lookup: domain name in Unicode, expected octets: $domain\n";
+ } elsif ($domain =~ tr/\x00-\x7F//c) { # is not all-ASCII
+ info("bgsend_and_start_lookup: non-ASCII domain name: %s", $domain);
+ }
+
$ent = {} if !$ent;
$domain =~ s/\.+\z//s; # strip trailing dots, these sometimes still sneak in
$ent->{id} = undef;
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Dns.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Dns.pm?rev=1694252&r1=1694251&r2=1694252&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Dns.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Dns.pm Wed Aug 5 15:47:34 2015
@@ -29,7 +29,7 @@ use Mail::SpamAssassin::Conf;
use Mail::SpamAssassin::PerMsgStatus;
use Mail::SpamAssassin::AsyncLoop;
use Mail::SpamAssassin::Constants qw(:ip);
-use Mail::SpamAssassin::Util qw(untaint_var am_running_on_windows);
+use Mail::SpamAssassin::Util qw(untaint_var am_running_on_windows idn_to_ascii);
use File::Spec;
use IO::Socket;
@@ -101,6 +101,7 @@ BEGIN {
sub do_rbl_lookup {
my ($self, $rule, $set, $type, $host, $subtest) = @_;
+ $host = idn_to_ascii($host);
$host =~ s/\.\z//s; # strip a redundant trailing dot
my $key = "dns:$type:$host";
my $existing_ent = $self->{async}->get_lookup($key);
@@ -145,6 +146,7 @@ sub register_rbl_subtest {
sub do_dns_lookup {
my ($self, $rule, $type, $host) = @_;
+ $host = idn_to_ascii($host);
$host =~ s/\.\z//s; # strip a redundant trailing dot
my $key = "dns:$type:$host";
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/DnsResolver.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/DnsResolver.pm?rev=1694252&r1=1694251&r2=1694252&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/DnsResolver.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/DnsResolver.pm Wed Aug 5 15:47:34 2015
@@ -45,7 +45,7 @@ require 5.008001; # needs utf8::is_utf8
use Mail::SpamAssassin;
use Mail::SpamAssassin::Logger;
use Mail::SpamAssassin::Constants qw(:ip);
-use Mail::SpamAssassin::Util qw(untaint_var decode_dns_question_entry);
+use Mail::SpamAssassin::Util qw(untaint_var decode_dns_question_entry idn_to_ascii);
use Socket;
use Errno qw(EADDRINUSE EACCES);
@@ -878,8 +878,9 @@ sub send {
# using some arbitrary encoding (they are normally just 7-bit ascii
# characters anyway, just need to get rid of the utf8 flag). Bug 6959
# Most if not all af these come from a SPF plugin.
+ # (was a call to utf8::encode($name), now we prefer a proper idn_to_ascii)
#
- utf8::encode($name);
+ $name = idn_to_ascii($name);
my $retrans = $self->{retrans};
my $retries = $self->{retry};
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/AskDNS.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/AskDNS.pm?rev=1694252&r1=1694251&r2=1694252&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/AskDNS.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/AskDNS.pm Wed Aug 5 15:47:34 2015
@@ -189,7 +189,7 @@ use warnings;
use re 'taint';
use Mail::SpamAssassin::Plugin;
-use Mail::SpamAssassin::Util qw(decode_dns_question_entry);
+use Mail::SpamAssassin::Util qw(decode_dns_question_entry idn_to_ascii);
use Mail::SpamAssassin::Logger;
use vars qw(@ISA %rcode_value $txtdata_can_provide_a_list);
@@ -465,6 +465,7 @@ OUTER:
$query_domain =~ s{_([A-Z][A-Z0-9]*)_}
{ defined $current_tag_val{$1} ? $current_tag_val{$1}
: '' }ge;
+ $query_domain = idn_to_ascii($query_domain);
# the $dnskey identifies this query in AsyncLoop's pending_lookups
my $dnskey = join(':', 'askdns', $query_type, $query_domain);
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm?rev=1694252&r1=1694251&r2=1694252&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm Wed Aug 5 15:47:34 2015
@@ -25,6 +25,8 @@ use Errno qw(EBADF);
use Mail::SpamAssassin::Plugin;
use Mail::SpamAssassin::Locales;
+use Mail::SpamAssassin::Util qw(get_my_locales parse_rfc822_date
+ idn_to_ascii is_valid_utf_8);
use Mail::SpamAssassin::Logger;
use Mail::SpamAssassin::Constants qw(:sa :ip);
@@ -276,6 +278,17 @@ sub check_illegal_chars {
$str =~ s/^(?:Subject|From):.*$//gmi;
}
+ if ($str =~ tr/\x00-\x7F//c && is_valid_utf_8($str)) {
+ # is non-ASCII and is valid UTF-8
+ if ($str =~ tr/\x00-\x08\x0B\x0C\x0E-\x1F//) {
+ dbg("eval: %s is valid UTF-8 but contains controls: %s", $header, $str);
+ } else {
+ # todo: only with a SMTPUTF8 mail
+ dbg("eval: %s is valid UTF-8: %s", $header, $str);
+ return 0;
+ }
+ }
+
# count illegal substrings (RFC 2045)
# (non-ASCII + C0 controls except TAB, NL, CR)
my $illegal = $str =~ tr/\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff//;
@@ -1037,6 +1050,7 @@ sub check_ratware_envelope_from {
if ($to =~ /^([^@]+)@(.+)$/) {
my($user,$dom) = ($1,$2);
+ $dom = idn_to_ascii($dom);
$dom = $self->{main}->{registryboundaries}->trim_domain($dom);
return unless
($self->{main}->{registryboundaries}->is_domain_valid($dom));
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm?rev=1694252&r1=1694251&r2=1694252&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIDNSBL.pm Wed Aug 5 15:47:34 2015
@@ -294,7 +294,7 @@ package Mail::SpamAssassin::Plugin::URID
use Mail::SpamAssassin::Plugin;
use Mail::SpamAssassin::Constants qw(:ip);
-use Mail::SpamAssassin::Util;
+use Mail::SpamAssassin::Util qw(idn_to_ascii);
use Mail::SpamAssassin::Logger;
use strict;
use warnings;
@@ -901,6 +901,7 @@ sub query_hosts_or_domains {
sub lookup_domain_ns {
my ($self, $pms, $obj, $dom, $rulename) = @_;
+ $dom = idn_to_ascii($dom);
my $key = "NS:" . $dom;
my $ent = {
key => $key, zone => $dom, obj => $obj, type => "URI-NS",
@@ -986,6 +987,7 @@ sub complete_ns_lookup {
sub lookup_a_record {
my ($self, $pms, $obj, $hname, $rulename) = @_;
+ $hname = idn_to_ascii($hname);
my $key = "A:" . $hname;
my $ent = {
key => $key, zone => $hname, obj => $obj, type => "URI-A",
@@ -1054,6 +1056,7 @@ sub lookup_dnsbl_for_ip {
sub lookup_single_dnsbl {
my ($self, $pms, $obj, $rulename, $lookupstr, $dnsbl, $qtype) = @_;
+ $dnsbl = idn_to_ascii($dnsbl);
my $key = "DNSBL:" . $lookupstr . ':' . $dnsbl;
my $ent = {
key => $key, zone => $dnsbl, obj => $obj, type => 'URI-DNSBL',
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm?rev=1694252&r1=1694251&r2=1694252&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm Wed Aug 5 15:47:34 2015
@@ -62,7 +62,9 @@ BEGIN {
@EXPORT_OK = qw(&local_tz &base64_decode &untaint_var &untaint_file_path
&exit_status_str &proc_status_ok &am_running_on_windows
&reverse_ip_address &decode_dns_question_entry
- &secure_tmpfile &secure_tmpdir &uri_list_canonicalize);
+ &secure_tmpfile &secure_tmpdir &uri_list_canonicalize
+ &get_my_locales &parse_rfc822_date &idn_to_ascii
+ &is_valid_utf_8);
}
use Mail::SpamAssassin;
@@ -74,6 +76,7 @@ use File::Basename;
use Time::Local;
use Sys::Hostname (); # don't import hostname() into this namespace!
use NetAddr::IP 4.000;
+use Scalar::Util qw(tainted);
use Fcntl;
use Errno qw(ENOENT EACCES EEXIST);
use POSIX qw(:sys_wait_h WIFEXITED WIFSIGNALED WIFSTOPPED WEXITSTATUS
@@ -96,6 +99,43 @@ BEGIN {
###########################################################################
+our $ALT_FULLSTOP_UTF8_RE;
+BEGIN {
+ # Bug 6751:
+ # RFC 3490 (IDNA): Whenever dots are used as label separators, the
+ # following characters MUST be recognized as dots: U+002E (full stop),
+ # U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
+ # U+FF61 (halfwidth ideographic full stop).
+ # RFC 5895: [...] the IDEOGRAPHIC FULL STOP character (U+3002)
+ # can be mapped to the FULL STOP before label separation occurs.
+ # [...] Only the IDEOGRAPHIC FULL STOP character (U+3002) is added in
+ # this mapping because the authors have not fully investigated [...]
+ # Adding also 'SMALL FULL STOP' (U+FE52) as seen in the wild,
+ # and a 'ONE DOT LEADER' (U+2024).
+ # The pattern is an alternation of their UTF-8 byte sequences, i.e. it matches octets.
+ my $dot_chars = "\x{2024}\x{3002}\x{FF0E}\x{FF61}\x{FE52}"; # plain U+002E is not listed here
+ my $dot_bytes = join('|', split(//,$dot_chars)); utf8::encode($dot_bytes); # chars -> UTF-8 octets
+ $ALT_FULLSTOP_UTF8_RE = qr/$dot_bytes/so;
+}
+
+###########################################################################
+
+our $enc_utf8; # result of Encode::find_encoding('UTF-8'); stays undef if Encode cannot be loaded
+BEGIN {
+ eval { require Encode }
+ and do { $enc_utf8 = Encode::find_encoding('UTF-8') }
+};
+
+our $have_libidn; # set to 1 only when Net::LibIDN could be loaded
+BEGIN {
+ eval { require Net::LibIDN } and do { $have_libidn = 1 }; # optional module, a load failure is not fatal
+}
+
+$have_libidn or warn "INFO: module Net::LibIDN not available,\n".
+ " internationalized domain names with U-labels will not be recognized!\n"; # file-scope statement, not inside any sub
+
+###########################################################################
+
# find an executable in the current $PATH (or whatever for that platform)
{
# Show the PATH we're going to explore only once.
@@ -338,6 +378,93 @@ sub taint_var {
###########################################################################
+# Returns 1 if the provided string of octets is a syntactically valid UTF-8
+# byte sequence per the RFC 3629 grammar quoted below (an all-ASCII or an
+# empty string qualifies too), 0 if it is not, undef if the argument is undef.
+sub is_valid_utf_8($) {
+# my $octets = $_[0];
+ return undef if !defined $_[0];
+ #
+ # RFC 6532: UTF8-non-ascii = UTF8-2 / UTF8-3 / UTF8-4
+ # RFC 3629 section 4: Syntax of UTF-8 Byte Sequences
+ # UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
+ # UTF8-1 = %x00-7F
+ # UTF8-2 = %xC2-DF UTF8-tail
+ # UTF8-3 = %xE0 %xA0-BF UTF8-tail /
+ # %xE1-EC 2( UTF8-tail ) /
+ # %xED %x80-9F UTF8-tail /
+ # # U+D800..U+DFFF are utf16 surrogates, not legal utf8
+ # %xEE-EF 2( UTF8-tail )
+ # UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) /
+ # %xF1-F3 3( UTF8-tail ) /
+ # %xF4 %x80-8F 2( UTF8-tail )
+ # UTF8-tail = %x80-BF
+ #
+ # loose variant:
+ # [\x00-\x7F] | [\xC0-\xDF][\x80-\xBF] |
+ # [\xE0-\xEF][\x80-\xBF]{2} | [\xF0-\xF4][\x80-\xBF]{3}
+ #
+ $_[0] =~ /^ (?: [\x00-\x7F] |
+ [\xC2-\xDF] [\x80-\xBF] |
+ \xE0 [\xA0-\xBF] [\x80-\xBF] |
+ [\xE1-\xEC] [\x80-\xBF]{2} |
+ \xED [\x80-\x9F] [\x80-\xBF] |
+ [\xEE-\xEF] [\x80-\xBF]{2} |
+ \xF0 [\x90-\xBF] [\x80-\xBF]{2} |
+ [\xF1-\xF3] [\x80-\xBF]{3} |
+ \xF4 [\x80-\x8F] [\x80-\xBF]{2} )* \z/xs ? 1 : 0;
+}
+
+# Given an international domain name with U-labels (UTF-8 octets or Unicode
+# characters), converts it to ASCII-compatible encoding (ACE) when the
+# Net::LibIDN module is available. An all-ASCII argument, or one that is
+# not valid UTF-8, is returned lowercased but otherwise unchanged. When
+# Net::LibIDN is missing or the conversion fails, the name is returned as
+# UTF-8 octets with only the alternative full stops mapped to '.' (it is
+# not lowercased in that case). Returns undef for an undefined argument.
+# The result is always in octets (utf8 flag off) even if the argument was
+# in Unicode characters; taintedness of the argument is propagated.
+#
+sub idn_to_ascii($) {
+  no bytes;  # make sure there is no 'use bytes' in effect
+  return undef  if !defined $_[0];
+  my $s = "$_[0]";  # stringify
+  # propagate taintedness of the argument, but not its utf8 flag
+  my $t = tainted($s);  # taintedness of the argument
+  if ($t) {  # untaint $s, avoids taint-related bugs in LibIDN or in old perl
+    # the capture must be assigned back: a bare match would leave $s tainted
+    no re 'taint';  local $1;  $s = $1  if $s =~ /^(.*)\z/s;
+  }
+  # encode chars to UTF-8, leave octets unchanged (not necessarily valid UTF-8)
+  utf8::encode($s)  if utf8::is_utf8($s);
+  if ($s !~ tr/\x00-\x7F//c) {  # is all-ASCII (including IP address literal)
+    $s = lc $s;
+  } elsif (!is_valid_utf_8($s)) {
+    my($package, $filename, $line) = caller;
+    info("util: idn_to_ascii: not valid UTF-8: /%s/, called from %s line %d",
+         $s, $package, $line);
+    $s = lc $s;  # garbage-in / garbage-out
+  } else {
+    # RFC 3490 (IDNA): Whenever dots are used as label separators, the
+    # following characters MUST be recognized as dots: U+002E (full stop),
+    # U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
+    # U+FF61 (halfwidth ideographic full stop).
+    if ($s =~ s/$ALT_FULLSTOP_UTF8_RE/./gso) {
+      info("util: idn_to_ascii: alternative dots normalized: /%s/ -> /%s/",
+           $_[0], $s);
+    }
+    if ($have_libidn) {
+      # to ASCII-compatible encoding (ACE), lowercased
+      my $sa = Net::LibIDN::idn_to_ascii($s, 'UTF-8');
+      if (!defined $sa) {
+        # keep the UTF-8 form, the caller gets the name back unconverted
+        info("util: idn_to_ascii: conversion to ACE failed: /%s/", $s);
+      } else {
+        info("util: idn_to_ascii: converted to ACE: /%s/ -> /%s/", $s, $sa);
+        $s = $sa;
+      }
+    }
+  }
+  $t ? taint_var($s) : $s;  # propagate taintedness of the argument
+}
+
+###########################################################################
+
# map process termination status number to an informative string, and
# append optional mesage (dual-valued errno or a string or a number),
# returning the resulting string
@@ -1314,20 +1441,10 @@ sub uri_list_canonicalize {
# not required
$rest ||= '';
- # Bug 6751:
- # RFC 3490 (IDNA): Whenever dots are used as label separators, the
- # following characters MUST be recognized as dots: U+002E (full stop),
- # U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
- # U+FF61 (halfwidth ideographic full stop).
- # RFC 5895: [...] the IDEOGRAPHIC FULL STOP character (U+3002)
- # can be mapped to the FULL STOP before label separation occurs.
- # [...] Only the IDEOGRAPHIC FULL STOP character (U+3002) is added in
- # this mapping because the authors have not fully investigated [...]
- # Adding also 'SMALL FULL STOP' (U+FE52) as seen in the wild.
- # Parhaps also the 'ONE DOT LEADER' (U+2024).
- if ($host =~ s{(?: \xE3\x80\x82 | \xEF\xBC\x8E | \xEF\xBD\xA1 |
- \xEF\xB9\x92 | \xE2\x80\xA4 )}{.}xgs) {
- push(@nuris, join ('', $proto, $host, $rest));
+ my $nhost = idn_to_ascii($host);
+ if (defined $nhost && $nhost ne lc $host) {
+ push(@nuris, join('', $proto, $nhost, $rest));
+ $host = $nhost;
}
# bug 4146: deal with non-US ASCII 7-bit chars in the host portion