You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2004/11/18 03:44:13 UTC
svn commit: rev 76212 - in spamassassin/trunk: lib/Mail/SpamAssassin rules

Author: felicity
Date: Wed Nov 17 18:44:11 2004
New Revision: 76212

Modified:
   spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
   spamassassin/trunk/rules/20_uri_tests.cf
   spamassassin/trunk/rules/70_testing.cf
Log:
Bug 3973: get_uri_list changes. Add the undecoded (but still possibly obfuscated) URI to the list, and also add the URI with any user/pass removed. This makes URI rules a little easier to write, since the RE no longer needs to deal with a potential user/pass.

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm	(original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm	Wed Nov 17 18:44:11 2004
@@ -835,6 +835,7 @@
   # make sure we catch bad encoding tricks
   my @nuris = ();
   for my $uri (@uris) {
+    # we're interested in http:// and so on, skip mailto:
     next if $uri =~ /^mailto:/i;
 
     # sometimes we catch URLs on multiple lines
@@ -844,57 +845,59 @@
     $uri =~ s/^\s+//;
     $uri =~ s/\s+$//;
 
-    # Make a copy so we don't trash the original
+    # Make a copy so we don't trash the original in the array
     my $nuri = $uri;
 
     # http:www.foo.biz -> http://www.foo.biz
     $nuri =~ s#^(https?:)/{0,2}#$1//#i;
 
     # http://www.foo.biz?id=3 -> http://www.foo.biz/?id=3
-    $nuri =~ s/^(https?:\/\/[^\/\?]+)\?/$1\/?/;
+    $nuri =~ s@^(https?://[^/?]+)\?@$1/?@;
 
     # deal with encoding of chars, this is just the set of printable
     # chars minus ' ' (that is, dec 33-126, hex 21-7e)
     $nuri =~ s/\&\#0*(3[3-9]|[4-9]\d|1[01]\d|12[0-6]);/sprintf "%c",$1/ge;
     $nuri =~ s/\&\#x0*(2[1-9]|[3-6][a-f0-9]|7[0-9a-e]);/sprintf "%c",hex($1)/gei;
 
-    # deal with wierd hostname parts
-    if ($nuri =~ /^(https?:\/\/)([^\/]+)(\.?\/.*)$/i) {
+    # deal with the %## encoding
+    $nuri = Mail::SpamAssassin::Util::url_encode($nuri);
+
+    # put the new URI on the new list if it's different
+    if ($nuri ne $uri) {
+      push(@nuris, $nuri);
+    }
+
+    # deal with wierd hostname parts, remove user/pass, etc.
+    if ($nuri =~ m{^(https?://)([^/]+)(\.?/.*)?$}i) {
       my ($proto, $host, $rest) = ($1,$2,$3);
 
+      # not required
+      $rest ||= '';
+
       # remove "www.fakehostname.com@" username part
-      $host =~ s/^[^\@]+\@//gs;
+      if ($host =~ s/^[^\@]+\@//gs) {
+        push(@nuris, join ('', $proto, $host, $rest));
+      }
 
-      # deal with 'http://213.172.0x1f.13/'; decode encoded octets
-      if ($host =~ /^([0-9a-fx]*\.)([0-9a-fx]*\.)([0-9a-fx]*\.)([0-9a-fx]*)$/ix)
-      {
+      # deal with 'http://213.172.0x1f.13/', decode encoded octets
+      if ($host =~ /^([0-9a-fx]*\.)([0-9a-fx]*\.)([0-9a-fx]*\.)([0-9a-fx]*)$/ix) {
         my (@chunk) = ($1,$2,$3,$4);
         for my $octet (0 .. 3) {
           $chunk[$octet] =~ s/^0x([0-9a-f][0-9a-f])/sprintf "%d",hex($1)/gei;
         }
-        my $parsed = join ('', $proto, @chunk, $rest);
-        if ($parsed ne $nuri) { push(@nuris, $parsed); }
+        push(@nuris, join ('', $proto, @chunk, $rest));
       }
 
       # "http://0x7f000001/"
-      if ($host =~ /^0x[0-9a-f]+$/i) {
+      elsif ($host =~ /^0x[0-9a-f]+$/i) {
         $host =~ s/^0x([0-9a-f]+)/sprintf "%d",hex($1)/gei;
-        $host = decode_ulong_to_ip ($host);
-        my $parsed = join ('', $proto, $host, $rest);
-        push(@nuris, $parsed);
+        push(@nuris, join ('', $proto, decode_ulong_to_ip($host), $rest));
       }
 
       # "http://1113343453/"
-      if ($host =~ /^[0-9]+$/) {
-        $host = decode_ulong_to_ip ($host);
-        my $parsed = join ('', $proto, $host, $rest);
-        push(@nuris, $parsed);
+      elsif ($host =~ /^[0-9]+$/) {
+        push(@nuris, join ('', $proto, decode_ulong_to_ip($host), $rest));
       }
-    }
-
-    $nuri = Mail::SpamAssassin::Util::url_encode($nuri);
-    if ($nuri ne $uri) {
-      push(@nuris, $nuri);
     }
 
     # deal with http redirectors.  strip off one level of redirector

Modified: spamassassin/trunk/rules/20_uri_tests.cf
==============================================================================
--- spamassassin/trunk/rules/20_uri_tests.cf	(original)
+++ spamassassin/trunk/rules/20_uri_tests.cf	Wed Nov 17 18:44:11 2004
@@ -27,7 +27,7 @@
 uri NUMERIC_HTTP_ADDR		/^https?\:\/\/\d{7,}/is
 describe NUMERIC_HTTP_ADDR	Uses a numeric IP address in URL
 
-uri NORMAL_HTTP_TO_IP		/^https?\:\/\/(?:\S*\@)?\d+\.\d+\.\d+\.\d+/i
+uri NORMAL_HTTP_TO_IP		m{^https?://\d+\.\d+\.\d+\.\d+}i
 describe NORMAL_HTTP_TO_IP	Uses a dotted-decimal IP address in URL
  	
 # Theo sez:
@@ -47,7 +47,7 @@
 describe HTTP_EXCESSIVE_ESCAPES	Completely unnecessary %-escapes inside a URL
 
 # bug 1801
-uri IP_LINK_PLUS	/^https?\:\/\/(?:\S*\@)?\d+\.\d+\.\d+\.\d+.{0,20}(?:cgi|click|ads|id\=)/i
+uri IP_LINK_PLUS	m{^https?://\d+\.\d+\.\d+\.\d+.{0,20}(?:cgi|click|ads|id=)}i
 describe IP_LINK_PLUS	Dotted-decimal IP address followed by CGI
 
 uri REMOVE_PAGE			/^https?:\/\/[^\/]+\/.*?remove/

Modified: spamassassin/trunk/rules/70_testing.cf
==============================================================================
--- spamassassin/trunk/rules/70_testing.cf	(original)
+++ spamassassin/trunk/rules/70_testing.cf	Wed Nov 17 18:44:11 2004
@@ -466,7 +466,7 @@
 describe T_SPOOF_COM2COM 	a.com.b.com
 
 # CDNs (Akamai (edgesuite), Speedera, and NYUD, so far) do this, so skip them.
-uri      T_SPOOF_OURI		m{^https?:/{0,2}(?:[^@/]+@)*?(?:[a-z0-9_-]+?\.){2,}(?:com|net|org|biz|info|edu|www)(?!\.(?:\w+\.)?(?:edgesuite|nyud|speedera)\.net)(?:\.[a-z0-9_%-]+?){2,}(?:(?::|%3a)\d+)?}i
+uri      T_SPOOF_OURI		m{^https?://(?:[a-z0-9_-]+?\.){2,}(?:com|net|org|biz|info|edu|www)(?!\.(?:\w+\.)?(?:edgesuite|nyud|speedera)\.net)(?:\.[a-z0-9_%-]+?){2,}(?:(?::|%3a)\d+)?}i
 describe T_SPOOF_OURI	 	URL has items in odd places
 
 ##########################################################################