You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by mm...@apache.org on 2015/09/15 18:33:50 UTC

svn commit: r1703247 - in /spamassassin/trunk: lib/Mail/SpamAssassin/Conf.pm lib/Mail/SpamAssassin/Plugin/HeaderEval.pm lib/Mail/SpamAssassin/RegistryBoundaries.pm lib/Mail/SpamAssassin/Util.pm rules/20_aux_tlds.cf

Author: mmartinec
Date: Tue Sep 15 16:33:50 2015
New Revision: 1703247

URL: http://svn.apache.org/r1703247
Log:
Bug 7215: Towards supporting IDNA - handle IDN domain boundaries

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/RegistryBoundaries.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
    spamassassin/trunk/rules/20_aux_tlds.cf

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm?rev=1703247&r1=1703246&r2=1703247&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm Tue Sep 15 16:33:50 2015
@@ -82,13 +82,12 @@ use warnings;
 # use bytes;
 use re 'taint';
 
-use Mail::SpamAssassin::Util;
 use Mail::SpamAssassin::NetSet;
 use Mail::SpamAssassin::Constants qw(:sa :ip);
 use Mail::SpamAssassin::Conf::Parser;
 use Mail::SpamAssassin::Logger;
 use Mail::SpamAssassin::Util::TieOneStringHash;
-use Mail::SpamAssassin::Util qw(untaint_var);
+use Mail::SpamAssassin::Util qw(untaint_var idn_to_ascii);
 use File::Spec;
 
 use vars qw{
@@ -3477,8 +3476,11 @@ subdomain of the specified zone.
 
 =item util_rb_tld tld1 tld2 ...
 
-This option maintains list of valid TLDs in the RegistryBoundaries code. 
-TLDs include things like com, net, org, etc.
+This option maintains a list of valid TLDs in the RegistryBoundaries code. 
+Top-level domains (TLDs) include things like com, net, org, xn--p1ai, рф, ...
+International domain names may be specified in ASCII-compatible encoding (ACE),
+e.g. xn--p1ai, xn--qxam, or with Unicode labels encoded as UTF-8 octets,
+e.g. рф, ελ.
 
 =cut
 
@@ -3541,7 +3543,7 @@ TLDs include things like com, net, org,
     xn--wgbh1c xn--wgbl6a xn--xhq521b xn--xkc2al3hye2a xn--xkc2dl3a5ee0h
     xn--yfro4i67o xn--ygbi2ammx xn--zfr164b xxx xyz yachts yandex ye yokohama
     youtube yt za zm zone zw
-    /) { $self->{valid_tlds}{lc $_} = 1; }
+    /) { $self->{valid_tlds}{idn_to_ascii($_)} = 1 }
 
   push (@cmds, {
     setting => 'util_rb_tld',
@@ -3555,7 +3557,7 @@ TLDs include things like com, net, org,
 	return $INVALID_VALUE;
       }
       foreach (split(/\s+/, $value)) {
-        $self->{valid_tlds}{lc $_} = 1;
+        $self->{valid_tlds}{idn_to_ascii($_)} = 1;
       }
       dbg("config: added tld list - $value");
     }
@@ -3564,7 +3566,9 @@ TLDs include things like com, net, org,
 =item util_rb_2tld 2tld-1.tld 2tld-2.tld ...
 
 This option maintains list of valid 2nd-level TLDs in the RegistryBoundaries
-code.  2TLDs include things like co.uk, fed.us, etc.
+code.  2TLDs include things like co.uk, fed.us, etc.  International domain
+names may be specified in ASCII-compatible encoding (ACE), or with Unicode
+labels encoded as UTF-8 octets.
 
 =cut
 
@@ -3735,7 +3739,7 @@ code.  2TLDs include things like co.uk,
     net.ye org.ye ac.za alt.za bourse.za city.za co.za edu.za gov.za law.za
     mil.za net.za ngo.za nom.za org.za school.za tm.za web.za ac.zm co.zm
     com.zm edu.zm gov.zm org.zm sch.zm ac.zw co.zw gov.zw org.zw
-    /) { $self->{two_level_domains}{lc $_} = 1; }
+    /) { $self->{two_level_domains}{idn_to_ascii($_)} = 1 }
 
   push (@cmds, {
     setting => 'util_rb_2tld',
@@ -3749,7 +3753,7 @@ code.  2TLDs include things like co.uk,
 	return $INVALID_VALUE;
       }
       foreach (split(/\s+/, $value)) {
-        $self->{two_level_domains}{lc $_} = 1;
+        $self->{two_level_domains}{idn_to_ascii($_)} = 1;
       }
     }
   });
@@ -3757,7 +3761,9 @@ code.  2TLDs include things like co.uk,
 =item util_rb_3tld 3tld1.some.tld 3tld2.other.tld ...
 
 This option maintains list of valid 3rd-level TLDs in the RegistryBoundaries
-code.  3TLDs include things like demon.co.uk, plc.co.im, etc.
+code.  3TLDs include things like demon.co.uk, plc.co.im, etc.  International
+domain names may be specified in ASCII-compatible encoding (ACE), or with
+Unicode labels encoded as UTF-8 octets.
 
 =cut
 
@@ -3766,7 +3772,7 @@ code.  3TLDs include things like demon.c
   # sa-update 20_aux_tlds.cf.
   foreach (qw/
     demon.co.uk esc.edu.ar lkd.co.im plc.co.im
-    /) { $self->{three_level_domains}{lc $_} = 1; }
+    /) { $self->{three_level_domains}{idn_to_ascii($_)} = 1 }
 
   push (@cmds, {
     setting => 'util_rb_3tld',
@@ -3780,7 +3786,7 @@ code.  3TLDs include things like demon.c
 	return $INVALID_VALUE;
       }
       foreach (split(/\s+/, $value)) {
-        $self->{three_level_domains}{lc $_} = 1;
+        $self->{three_level_domains}{idn_to_ascii($_)} = 1;
       }
     }
   });

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm?rev=1703247&r1=1703246&r2=1703247&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HeaderEval.pm Tue Sep 15 16:33:50 2015
@@ -1048,9 +1048,8 @@ sub check_ratware_envelope_from {
   return 0 if $from eq '' || $to eq '';
   return 0 if $from =~ /^SRS\d=/;
 
-  if ($to =~ /^([^@]+)@(.+)$/) {
+  if ($to =~ /^([^@]+)\@(.+)$/) {
     my($user,$dom) = ($1,$2);
-    $dom = idn_to_ascii($dom);
     $dom = $self->{main}->{registryboundaries}->trim_domain($dom);
     return unless
         ($self->{main}->{registryboundaries}->is_domain_valid($dom));

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/RegistryBoundaries.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/RegistryBoundaries.pm?rev=1703247&r1=1703246&r2=1703247&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/RegistryBoundaries.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/RegistryBoundaries.pm Tue Sep 15 16:33:50 2015
@@ -33,6 +33,9 @@ use re 'taint';
 our @ISA = qw();
 use vars qw(%US_STATES);
 
+use Mail::SpamAssassin::Logger;
+use Mail::SpamAssassin::Util qw(idn_to_ascii);
+
 # called from SpamAssassin->init() to create $self->{util_rb}
 sub new {
   my $class = shift;
@@ -46,7 +49,8 @@ sub new {
   bless ($self, $class);
 
   # Initialize valid_tlds_re for schemeless uri parsing, FreeMail etc
-  if ($self->{conf}->{valid_tlds}) {
+  if ($self->{conf}->{valid_tlds} && %{$self->{conf}->{valid_tlds}}) {
+    # International domain names are already in ASCII-compatible encoding (ACE)
     my $tlds = join('|', keys %{$self->{conf}->{valid_tlds}});
     # Perl 5.10+ trie optimizes lists, no need for fancy regex optimizing
     $self->{valid_tlds_re} = qr/(?:$tlds)/i;
@@ -87,9 +91,9 @@ Examples:
 =cut
 
 sub split_domain {
-  my $self = shift;
-  my $domain = lc shift;
+  my ($self, $domain) = @_;
 
+  $domain = idn_to_ascii($domain);
   my $hostname = '';
 
   if (defined $domain && $domain ne '') {
@@ -126,12 +130,14 @@ sub split_domain {
         }
         else {
           my $temp = join(".", @domparts);
+          # International domain names in ASCII-compatible encoding (ACE)
           last if ($self->{conf}->{three_level_domains}{$temp});
         }
       }
       elsif (@domparts == 2) {
         # co.uk, etc.
         my $temp = join(".", @domparts);
+        # International domain names in ASCII-compatible encoding (ACE)
         last if ($self->{conf}->{two_level_domains}{$temp});
       }
       push(@hostname, shift @domparts);
@@ -185,12 +191,13 @@ uses a valid TLD or ccTLD.
 =cut
 
 sub is_domain_valid {
-  my $self = shift;
-  my $dom = lc shift;
+  my ($self, $dom) = @_;
 
   # domains don't have whitespace
   return 0 if ($dom =~ /\s/);
 
+  $dom = idn_to_ascii($dom);
+
   # ensure it ends in a known-valid TLD, and has at least 1 dot
   return 0 unless ($dom =~ /\.([^.]+)$/);
   return 0 unless ($self->{conf}->{valid_tlds}{$1});

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm?rev=1703247&r1=1703246&r2=1703247&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm Tue Sep 15 16:33:50 2015
@@ -440,7 +440,7 @@ sub idn_to_ascii($) {
     info("util: idn_to_ascii: not valid UTF-8: /%s/, called from %s line %d",
          $s, $package, $line);
     $s = lc $s;  # garbage-in / garbage-out
-  } else {
+  } else {  # is valid UTF-8 but not all-ASCII
     my $chars;
     # RFC 3490 (IDNA): Whenever dots are used as label separators, the
     # following characters MUST be recognized as dots: U+002E (full stop),
@@ -450,7 +450,9 @@ sub idn_to_ascii($) {
       info("util: idn_to_ascii: alternative dots normalized: /%s/ -> /%s/",
            $_[0], $s);
     }
-    if ($have_libidn) {
+    if (!$have_libidn) {
+      $s = lc $s;
+    } else {
       # to ASCII-compatible encoding (ACE), lowercased
       my $sa = Net::LibIDN::idn_to_ascii($s, 'UTF-8');
       if (!defined $sa) {

Modified: spamassassin/trunk/rules/20_aux_tlds.cf
URL: http://svn.apache.org/viewvc/spamassassin/trunk/rules/20_aux_tlds.cf?rev=1703247&r1=1703246&r2=1703247&view=diff
==============================================================================
--- spamassassin/trunk/rules/20_aux_tlds.cf (original)
+++ spamassassin/trunk/rules/20_aux_tlds.cf Tue Sep 15 16:33:50 2015
@@ -52,6 +52,8 @@ endif
 #
 # For an up to date list of IDN TLDs that can be pasted into this block, run this command:
 #  wget http://data.iana.org/TLD/tlds-alpha-by-domain.txt -q -O - | grep -i '^xn--' | tr '\n' ' ' | fold -w 80 -s | perl -pe 'chomp; s/.*/util_rb_tld \L$_\n/'
+# Since version 4.0, util_rb_tld also accepts Unicode IDN labels (encoded as UTF-8 octets), e.g.:
+#  wget http://data.iana.org/TLD/tlds-alpha-by-domain.txt -q -O - | grep -i '^xn--' | idn -u | tr '\n' ' ' | fold -w 80 -s | perl -pe 'chomp; s/.*/util_rb_tld \L$_\n/'
 
 if (can(Mail::SpamAssassin::Conf::feature_registryboundaries))
 util_rb_tld xn--1qqw23a xn--30rr7y xn--3bst00m xn--3ds443g xn--3e0b707e xn--45brj9c