You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2005/02/21 08:02:31 UTC

svn commit: r154646 - in spamassassin/trunk: lib/Mail/SpamAssassin/Util.pm rules/70_testing.cf t/uri.t

Author: felicity
Date: Sun Feb 20 23:02:28 2005
New Revision: 154646

URL: http://svn.apache.org/viewcvs?view=rev&rev=154646
Log:
bug 4146: URIs with invalid chars in the host section would be missed by the URIBL checks.  Rework the URI canonification code, and add a test and rules for the issue.

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
    spamassassin/trunk/rules/70_testing.cf
    spamassassin/trunk/t/uri.t

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm?view=diff&r1=154645&r2=154646
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm Sun Feb 20 23:02:28 2005
@@ -910,17 +910,12 @@
     $nuri =~ s#^(https?:)/{0,2}#$1//#i;
 
     # http://www.foo.biz?id=3 -> http://www.foo.biz/?id=3
-    $nuri =~ s@^(https?://[^/?]+)\?@$1/?@;
+    $nuri =~ s@^(https?://[^/?]+)\?@$1/?@i;
 
     # deal with encoding of chars, this is just the set of printable
     # chars minus ' ' (that is, dec 33-126, hex 21-7e)
     $nuri =~ s/\&\#0*(3[3-9]|[4-9]\d|1[01]\d|12[0-6]);/sprintf "%c",$1/ge;
-    $nuri =~ s/\&\#x0*(2[1-9]|[3-6][a-f0-9]|7[0-9a-e]);/sprintf "%c",hex($1)/gei;
-
-    # deal with the %## encoding if necessary
-    if ($nuri =~ /\%[0-9a-fA-F]{2}/) {
-      $nuri = Mail::SpamAssassin::Util::url_encode($nuri);
-    }
+    $nuri =~ s/\&\#x0*(2[1-9]|[3-6][a-fA-F0-9]|7[0-9a-eA-E]);/sprintf "%c",hex($1)/ge;
 
     # put the new URI on the new list if it's different
     if ($nuri ne $uri) {
@@ -929,11 +924,35 @@
 
     # deal with weird hostname parts, remove user/pass, etc.
     if ($nuri =~ m{^(https?://)([^/]+)(\/.*)?$}i) {
-      my ($proto, $host, $rest) = ($1,$2,$3);
+      my($proto, $host, $rest) = ($1,$2,$3);
 
       # not required
       $rest ||= '';
 
+      # bug 4146: strip control, space and 8-bit chars from the host part
+      # of the URI.  RFC 1738 says they're invalid there, and the tested
+      # browsers (Firefox, IE) remove them before usage...
+      if ($host =~ tr/\000-\040\200-\377//d) {
+        push(@nuris, join ('', $proto, $host, $rest));
+      }
+
+      # deal with the %## encoding if necessary
+      # only worry about decoding stuff as an obfuscation technique?
+      # encoding isn't allowed in anything but $rest, so just deal with it
+      # there.
+      if ($rest =~ /\%[0-9a-fA-F]{2}/) {
+        $rest = Mail::SpamAssassin::Util::url_encode($rest);
+        push(@nuris, join ('', $proto, $host, $rest));
+      }
+
+      # deal with http redirectors.  strip off one level of redirector
+      # and add back to the array.  the foreach loop will go over those
+      # and deal appropriately.
+      # bug 3308: redirectors like yahoo only need one '/' ... <grrr>
+      if ($rest =~ m{(https?:/{0,2}.+)$}i) {
+        push(@uris, $1);
+      }
+
       ########################
       ## TVD: known issue, if host has multiple combinations of the following,
       ## all permutations will be put onto @nuris.  shouldn't be an issue.
@@ -952,6 +971,7 @@
       if ($host =~ s/[^0-9A-Za-z]+$//) {
         push(@nuris, join ('', $proto, $host, $rest));
       }
+
       ########################
 
       # deal with 'http://213.172.0x1f.13/', decode encoded octets
@@ -974,14 +994,7 @@
       elsif ($host =~ /^[0-9]+$/) {
         push(@nuris, join ('', $proto, decode_ulong_to_ip($host), $rest));
       }
-    }
 
-    # deal with http redirectors.  strip off one level of redirector
-    # and add back to the array.  the foreach loop will go over those
-    # and deal appropriately.
-    # bug 3308: redirectors like yahoo only need one '/' ... <grrr>
-    if ($nuri =~ m{^https?://.+?(https?:/{0,2}.+)$}i) {
-      push(@uris, $1);
     }
   }
 

Modified: spamassassin/trunk/rules/70_testing.cf
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/rules/70_testing.cf?view=diff&r1=154645&r2=154646
==============================================================================
--- spamassassin/trunk/rules/70_testing.cf (original)
+++ spamassassin/trunk/rules/70_testing.cf Sun Feb 20 23:02:28 2005
@@ -381,3 +381,6 @@
 
 header __RATWARE_NAME_ID	eval:check_ratware_name_id()
 meta T_RATWARE_NAME_ID	__RATWARE_0_TZ_DATE && __RATWARE_NAME_ID
+
+uri T_HTTP_BAD_HOST_CHAR	m@^https?://[^/]*[\000-\040\200-\377]@i
+uri T_HTTP_BAD_HOST_CTRL	m@^https?://[^/]*[\000-\037]@i

Modified: spamassassin/trunk/t/uri.t
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/t/uri.t?view=diff&r1=154645&r2=154646
==============================================================================
--- spamassassin/trunk/t/uri.t (original)
+++ spamassassin/trunk/t/uri.t Sun Feb 20 23:02:28 2005
@@ -125,6 +125,7 @@
    ]));
 
 ok(try_canon(['http://images.google.ca/imgres?imgurl=gmib.free.fr/viagra.jpg&imgrefurl=http://www.google.com/url?q=http://www.google.com/url?q=%68%74%74%70%3A%2F%2F%77%77%77%2E%65%78%70%61%67%65%2E%63%6F%6D%2F%6D%61%6E%67%65%72%33%32'],
+
    [
    'http://images.google.ca/imgres?imgurl=gmib.free.fr/viagra.jpg&imgrefurl=http://www.google.com/url?q=http://www.google.com/url?q=%68%74%74%70%3A%2F%2F%77%77%77%2E%65%78%70%61%67%65%2E%63%6F%6D%2F%6D%61%6E%67%65%72%33%32',
    'http://images.google.ca/imgres?imgurl=gmib.free.fr/viagra.jpg&imgrefurl=http://www.google.com/url?q=http://www.google.com/url?q=http://www.expage.com/manger32',
@@ -132,6 +133,10 @@
    'http://www.google.com/url?q=http://www.expage.com/manger32',
    'http://www.google.com/url?q=http://www.google.com/url?q=http://www.expage.com/manger32',
    ]));
+
+ok(try_canon(["http://www.kl\nuge.n\net/"],
+  ['http://www.kluge.net/']
+  ));
 
 ##############################################