You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2005/05/27 19:25:01 UTC
svn commit: r178801 - in /spamassassin/branches/3.0: lib/Mail/SpamAssassin/Util.pm t/uri.t

Author: felicity
Date: Fri May 27 10:25:01 2005
New Revision: 178801

URL: http://svn.apache.org/viewcvs?rev=178801&view=rev
Log:
bug 4213: backport uri canonification/anti-obfu code from 3.1

Modified:
    spamassassin/branches/3.0/lib/Mail/SpamAssassin/Util.pm
    spamassassin/branches/3.0/t/uri.t

Modified: spamassassin/branches/3.0/lib/Mail/SpamAssassin/Util.pm
URL: http://svn.apache.org/viewcvs/spamassassin/branches/3.0/lib/Mail/SpamAssassin/Util.pm?rev=178801&r1=178800&r2=178801&view=diff
==============================================================================
--- spamassassin/branches/3.0/lib/Mail/SpamAssassin/Util.pm (original)
+++ spamassassin/branches/3.0/lib/Mail/SpamAssassin/Util.pm Fri May 27 10:25:01 2005
@@ -801,7 +801,7 @@
   $uri =~ s,#.*$,,gs;			# drop fragment
   $uri =~ s#^[a-z]+:/{0,2}##gsi;	# drop the protocol
   $uri =~ s,^[^/]*\@,,gs;		# username/passwd
-  $uri =~ s,[/\?\&].*$,,gs;		# path/cgi params
+  $uri =~ s,[/\?].*$,,gs;		# path/cgi params
   $uri =~ s,:\d*$,,gs;			# port, bug 4191: sometimes the # is missing
 
   return if $uri =~ /\%/;         # skip undecoded URIs.
@@ -827,6 +827,7 @@
   # make sure we catch bad encoding tricks
   my @nuris = ();
   for my $uri (@uris) {
+    # we're interested in http:// and so on, skip mailto:
     next if $uri =~ /^mailto:/i;
 
     # sometimes we catch URLs on multiple lines
@@ -836,65 +837,110 @@
     $uri =~ s/^\s+//;
     $uri =~ s/\s+$//;
 
-    # Make a copy so we don't trash the original
+    # CRs just confuse things down below, so trash them now
+    $uri =~ s/\r//g;
+
+    # Make a copy so we don't trash the original in the array
     my $nuri = $uri;
 
     # http:www.foo.biz -> http://www.foo.biz
     $nuri =~ s#^(https?:)/{0,2}#$1//#i;
 
+    # *always* make a dup with all %-encoding decoded, since
+    # important parts of the URL may be encoded (such as the
+    # scheme). (bug 4213)
+    if ($nuri =~ /\%[0-9a-fA-F]{2}/) {
+      $nuri = Mail::SpamAssassin::Util::url_encode($nuri);
+    }
+
+    # www.foo.biz -> http://www.foo.biz
+    # unschemed URIs: assume default of "http://" as most MUAs do
+    if ($nuri !~ /^[-_a-z0-9]+:/i) {
+      if ($nuri =~ /^ftp\./) {
+	$nuri =~ s@^@ftp://@g;
+      }
+      else {
+	$nuri =~ s@^@http://@g;
+      }
+    }
+
     # http://www.foo.biz?id=3 -> http://www.foo.biz/?id=3
-    $nuri =~ s/^(https?:\/\/[^\/\?]+)\?/$1\/?/;
+    $nuri =~ s@^(https?://[^/?]+)\?@$1/?@i;
 
     # deal with encoding of chars, this is just the set of printable
     # chars minus ' ' (that is, dec 33-126, hex 21-7e)
     $nuri =~ s/\&\#0*(3[3-9]|[4-9]\d|1[01]\d|12[0-6]);/sprintf "%c",$1/ge;
-    $nuri =~ s/\&\#x0*(2[1-9]|[3-6][a-f0-9]|7[0-9a-e]);/sprintf "%c",hex($1)/gei;
+    $nuri =~ s/\&\#x0*(2[1-9]|[3-6][a-fA-F0-9]|7[0-9a-eA-E]);/sprintf "%c",hex($1)/ge;
 
-    # deal with wierd hostname parts
-    if ($nuri =~ /^(https?:\/\/)([^\/]+)(\.?\/.*)$/i) {
-      my ($proto, $host, $rest) = ($1,$2,$3);
+    # put the new URI on the new list if it's different
+    if ($nuri ne $uri) {
+      push(@nuris, $nuri);
+    }
+
+    # deal with weird hostname parts, remove user/pass, etc.
+    if ($nuri =~ m{^(https?://)([^/]+)(\/.*)?$}i) {
+      my($proto, $host, $rest) = ($1,$2,$3);
+
+      # not required
+      $rest ||= '';
+
+      # bug 4146: strip control, space and 8-bit chars from the host portion
+      # of the URI; RFC 1738 says they are invalid there, and the tested
+      # browsers (Firefox, IE) remove them before usage...
+      if ($host =~ tr/\000-\040\200-\377//d) {
+        push(@nuris, join ('', $proto, $host, $rest));
+      }
+
+      # deal with http redirectors.  strip off one level of redirector
+      # and add back to the array.  the foreach loop will go over those
+      # and deal appropriately.
+      # bug 3308: redirectors like yahoo only need one '/' ... <grrr>
+      if ($rest =~ m{(https?:/{0,2}.+)$}i) {
+        push(@uris, $1);
+      }
+
+      ########################
+      ## TVD: known issue, if host has multiple combinations of the following,
+      ## all permutations will be put onto @nuris.  shouldn't be an issue.
+
+      # Get rid of cruft that could cause confusion for rules...
 
       # remove "www.fakehostname.com@" username part
-      $host =~ s/^[^\@]+\@//gs;
+      if ($host =~ s/^[^\@]+\@//gs) {
+        push(@nuris, join ('', $proto, $host, $rest));
+      }
+
+      # bug 3186: If in a sentence, we might pick up odd characters ...
+      # ie: "visit http://example.biz." or "visit http://example.biz!!!"
+      # the host portion should end in some form of alpha-numeric, strip off
+      # the rest.
+      if ($host =~ s/[^0-9A-Za-z]+$//) {
+        push(@nuris, join ('', $proto, $host, $rest));
+      }
 
-      # deal with 'http://213.172.0x1f.13/'; decode encoded octets
-      if ($host =~ /^([0-9a-fx]*\.)([0-9a-fx]*\.)([0-9a-fx]*\.)([0-9a-fx]*)$/ix)
-      {
+      ########################
+
+      # deal with 'http://213.172.0x1f.13/', decode encoded octets
+      if ($host =~ /^([0-9a-fx]*\.)([0-9a-fx]*\.)([0-9a-fx]*\.)([0-9a-fx]*)$/ix) {
         my (@chunk) = ($1,$2,$3,$4);
         for my $octet (0 .. 3) {
           $chunk[$octet] =~ s/^0x([0-9a-f][0-9a-f])/sprintf "%d",hex($1)/gei;
         }
-        my $parsed = join ('', $proto, @chunk, $rest);
-        if ($parsed ne $nuri) { push(@nuris, $parsed); }
+        push(@nuris, join ('', $proto, @chunk, $rest));
       }
 
       # "http://0x7f000001/"
-      if ($host =~ /^0x[0-9a-f]+$/i) {
-        $host =~ s/^0x([0-9a-f]+)/sprintf "%d",hex($1)/gei;
-        $host = decode_ulong_to_ip ($host);
-        my $parsed = join ('', $proto, $host, $rest);
-        push(@nuris, $parsed);
+      elsif ($host =~ /^0x[0-9a-f]+$/i) {
+        # only take last 4 octets
+        $host =~ s/^0x[0-9a-f]*?([0-9a-f]{1,8})$/sprintf "%d",hex($1)/gei;
+        push(@nuris, join ('', $proto, decode_ulong_to_ip($host), $rest));
       }
 
       # "http://1113343453/"
-      if ($host =~ /^[0-9]+$/) {
-        $host = decode_ulong_to_ip ($host);
-        my $parsed = join ('', $proto, $host, $rest);
-        push(@nuris, $parsed);
+      elsif ($host =~ /^[0-9]+$/) {
+        push(@nuris, join ('', $proto, decode_ulong_to_ip($host), $rest));
       }
-    }
-
-    ($nuri) = Mail::SpamAssassin::Util::url_encode($nuri);
-    if ($nuri ne $uri) {
-      push(@nuris, $nuri);
-    }
 
-    # deal with http redirectors.  strip off one level of redirector
-    # and add back to the array.  the foreach loop will go over those
-    # and deal appropriately.
-    # bug 3308: redirectors like yahoo only need one '/' ... <grrr>
-    if ($nuri =~ m{^https?://.+?(https?:/{0,2}.+)$}i) {
-      push(@uris, $1);
     }
   }
 

Modified: spamassassin/branches/3.0/t/uri.t
URL: http://svn.apache.org/viewcvs/spamassassin/branches/3.0/t/uri.t?rev=178801&r1=178800&r2=178801&view=diff
==============================================================================
--- spamassassin/branches/3.0/t/uri.t (original)
+++ spamassassin/branches/3.0/t/uri.t Fri May 27 10:25:01 2005
@@ -21,7 +21,7 @@
 use Mail::SpamAssassin::HTML;
 use Mail::SpamAssassin::Util;
 
-plan tests => 63;
+plan tests => 64;
 
 ##############################################
 
@@ -81,6 +81,7 @@
 ok(try_domains('http:www.spamassassin.org/lists.html', 'spamassassin.org'));
 ok(try_domains('http://kung.pao.com.cn', 'pao.com.cn'));
 ok(try_domains('http://blah.blah.com:/', 'blah.com'));
+ok(try_domains('http://ebg&vosxfov.com.munged-rxspecials.net/b/Tr3f0amG','munged-rxspecials.net'));
 
 ##############################################