You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2019/07/31 11:46:53 UTC
svn commit: r1864045 - in /spamassassin/trunk: lib/Mail/SpamAssassin/Util.pm
t/uri.t
Author: hege
Date: Wed Jul 31 11:46:53 2019
New Revision: 1864045
URL: http://svn.apache.org/viewvc?rev=1864045&view=rev
Log:
idn_to_ascii fixes, handle iso-8859-1 diacritic domains, add some tests
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
spamassassin/trunk/t/uri.t
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm?rev=1864045&r1=1864044&r2=1864045&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm Wed Jul 31 11:46:53 2019
@@ -422,43 +422,59 @@ sub idn_to_ascii {
no bytes; # make sure there is no 'use bytes' in effect
return undef if !defined $_[0]; ## no critic (ProhibitExplicitReturnUndef)
my $s = "$_[0]"; # stringify
- # propagate taintedness of the argument, but not its utf8 flag
- my $t = tainted($s); # taintedness of the argument
+
+ # encode chars to UTF-8, leave octets unchanged (not necessarily valid UTF-8)
+ utf8::encode($s) if utf8::is_utf8($s); # i.e. remove utf-8 flag if set
+
+ # Rapid return for most common case, all-ASCII (including IP address literal),
+ # no conversion needed. Also if we don't have LibIDN, nothing more we can do.
+ if ($s !~ tr/a-zA-Z0-9_.:[]-//c || !$have_libidn) {
+ return lc $s; # retains taintedness
+ }
+
+ # propagate taintedness of the argument
+ my $t = tainted($s);
if ($t) { # untaint $s, avoids taint-related bugs in LibIDN or in old perl
- no re 'taint'; local $1; $s =~ /^(.*)\z/s;
+ $s = untaint_var($s);
}
- # encode chars to UTF-8, leave octets unchanged (not necessarily valid UTF-8)
- utf8::encode($s) if utf8::is_utf8($s);
- if ($s !~ tr/\x00-\x7F//c) { # is all-ASCII (including IP address literal)
- $s = lc $s;
- } elsif (!is_valid_utf_8($s)) {
- my($package, $filename, $line) = caller;
- info("util: idn_to_ascii: not valid UTF-8: /%s/, called from %s line %d",
- $s, $package, $line);
- $s = lc $s; # garbage-in / garbage-out
- } else { # is valid UTF-8 but not all-ASCII
- my $chars;
+
+ my $charset;
+
+ # Check for valid UTF-8
+ if (is_valid_utf_8($s)) {
# RFC 3490 (IDNA): Whenever dots are used as label separators, the
# following characters MUST be recognized as dots: U+002E (full stop),
# U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
# U+FF61 (halfwidth ideographic full stop).
if ($s =~ s/$ALT_FULLSTOP_UTF8_RE/./gso) {
- info("util: idn_to_ascii: alternative dots normalized: /%s/ -> /%s/",
+ dbg("util: idn_to_ascii: alternative dots normalized: /%s/ -> /%s/",
$_[0], $s);
}
- if (!$have_libidn) {
- $s = lc $s;
+ $charset = 'UTF-8';
+ }
+ # Check for valid extended ISO-8859-1 including diacritics
+ elsif ($s !~ tr/a-zA-Z0-9\xc0-\xd6\xd8-\xde\xe0-\xf6\xf8-\xfe_.-//c) {
+ $charset = 'ISO-8859-1';
+ }
+
+ if ($charset) {
+ # to ASCII-compatible encoding (ACE), lowercased
+ my $sa = Net::LibIDN::idn_to_ascii($s, $charset);
+ if (!defined $sa) {
+ info("util: idn_to_ascii: conversion to ACE failed: /%s/ (charset %s)",
+ $s, $charset);
} else {
- # to ASCII-compatible encoding (ACE), lowercased
- my $sa = Net::LibIDN::idn_to_ascii($s, 'UTF-8');
- if (!defined $sa) {
- info("util: idn_to_ascii: conversion to ACE failed: /%s/", $s);
- } else {
- info("util: idn_to_ascii: converted to ACE: /%s/ -> /%s/", $s, $sa);
- $s = $sa;
- }
+ dbg("util: idn_to_ascii: converted to ACE: /%s/ -> /%s/ (charset %s)",
+ $s, $sa, $charset);
+ $s = $sa;
}
+ } else {
+ my($package, $filename, $line) = caller;
+ info("util: idn_to_ascii: valid charset not detected: /%s/, called from %s line %d",
+ $s, $package, $line);
+ $s = lc $s; # garbage-in / garbage-out
}
+
$t ? taint_var($s) : $s; # propagate taintedness of the argument
}
@@ -1482,7 +1498,10 @@ sub uri_list_canonicalize {
# bug 4146: deal with non-US ASCII 7-bit chars in the host portion
# of the URI according to RFC 1738 that's invalid, and the tested
# browsers (Firefox, IE) remove them before usage...
- if ($host =~ tr/\000-\040\200-\377//d) {
+ #if ($host =~ tr/\000-\040\200-\377//d) {
+ # Fixed 7/2019 to not strip extended chars, since they can be used in
+ # IDN domains. Stripping control chars should be enough?
+ if ($host =~ tr/\x00-\x20//d) {
push(@nuris, join ('', $proto, $host, $rest));
}
Modified: spamassassin/trunk/t/uri.t
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/uri.t?rev=1864045&r1=1864044&r2=1864045&view=diff
==============================================================================
--- spamassassin/trunk/t/uri.t (original)
+++ spamassassin/trunk/t/uri.t Wed Jul 31 11:46:53 2019
@@ -15,11 +15,21 @@ if (-e 'test_dir') { # runnin
$prefix = '..';
}
+my $have_libidn;
+BEGIN {
+ eval { require Net::LibIDN } and do { $have_libidn = 1 };
+}
+
use strict;
-use Test::More tests => 98;
+use Test::More;
use lib '.'; use lib 't';
use SATest; sa_t_init("uri");
+my $tests = 98;
+$tests += 5 if $have_libidn;
+
+plan tests => $tests;
+
use Mail::SpamAssassin;
use Mail::SpamAssassin::HTML;
use Mail::SpamAssassin::Util;
@@ -109,6 +119,14 @@ ok(try_domains('http://ebg&vosxfov.com.m
ok(try_domains('http://blah.blah.com:/', 'blah.com'));
ok(try_domains('http://example.com.%20.host.example.info/', 'example.info'));
+if ($have_libidn) {
+ ok(try_domains('Cinéma.ca', 'xn--cinma-dsa.ca'));
+ ok(try_domains('marcaespaña.es', 'xn--marcaespaa-19a.es'));
+ ok(try_domains('äkäslompolo.fi', 'xn--kslompolo-u2ab.fi'));
+ ok(try_domains('foo.xn--fiqs8s', 'foo.xn--fiqs8s'));
+ ok(try_domains("foo\x2e\xe9\xa6\x99\xe6\xb8\xaf", 'foo.xn--j6w193g'));
+}
+
##############################################
sub array_cmp {