You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2004/05/07 06:36:43 UTC

svn commit: rev 10552 - in incubator/spamassassin/trunk: lib/Mail/SpamAssassin t t/data/spam

Author: jm
Date: Thu May  6 21:36:42 2004
New Revision: 10552

Modified:
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
   incubator/spamassassin/trunk/t/data/spam/009
   incubator/spamassassin/trunk/t/uri.t
Log:
spam spotted in wild evading URIBL, so deal with several URI obfuscations: http://0x425c45de/, http://66.92.0x45.221/, http://1113343455/, http://slashdot.org@1113343456/ in get_uri_list

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm	Thu May  6 21:36:42 2004
@@ -792,6 +792,40 @@
     $nuri =~ s/\&\#0*(3[3-9]|[4-9]\d|1[01]\d|12[0-6]);/sprintf "%c",$1/ge;
     $nuri =~ s/\&\#x0*(2[1-9]|[3-6][a-f0-9]|7[0-9a-e]);/sprintf "%c",hex($1)/gei;
 
+    # deal with weird hostname parts
+    if ($nuri =~ /^(https?:\/\/)([^\/]+)(\.?\/.*)$/i) {
+      my ($proto, $host, $rest) = ($1,$2,$3);
+
+      # remove "www.fakehostname.com@" username part
+      $host =~ s/^[^\@]+\@//gs;
+
+      # deal with 'http://213.172.0x1f.13/'; decode encoded octets
+      if ($host =~ /^([0-9a-fx]*\.)([0-9a-fx]*\.)([0-9a-fx]*\.)([0-9a-fx]*)$/ix)
+      {
+        my (@chunk) = ($1,$2,$3,$4);
+        for my $octet (0 .. 3) {
+          $chunk[$octet] =~ s/^0x([0-9a-f][0-9a-f])/sprintf "%d",hex($1)/gei;
+        }
+        my $parsed = join ('', $proto, @chunk, $rest);
+        if ($parsed ne $nuri) { push(@nuris, $parsed); }
+      }
+
+      # "http://0x7f000001/"
+      if ($host =~ /^0x[0-9a-f]+$/i) {
+        $host =~ s/^0x([0-9a-f]+)/sprintf "%d",hex($1)/gei;
+        $host = decode_ulong_to_ip ($host);
+        my $parsed = join ('', $proto, $host, $rest);
+        push(@nuris, $parsed);
+      }
+
+      # "http://1113343453/"
+      if ($host =~ /^[0-9]+$/) {
+        $host = decode_ulong_to_ip ($host);
+        my $parsed = join ('', $proto, $host, $rest);
+        push(@nuris, $parsed);
+      }
+    }
+
     ($nuri) = Mail::SpamAssassin::Util::url_encode($nuri);
     if ($nuri ne $uri) {
       push(@nuris, $nuri);
@@ -810,6 +844,16 @@
   my %uris = map { $_ => 1 } @uris, @nuris;
 
   return keys %uris;
+}
+
+sub decode_ulong_to_ip {
+  my ($ulong) = @_;
+  my @octets = ();
+  unshift (@octets, $ulong & 0xff); $ulong >>= 8;
+  unshift (@octets, $ulong & 0xff); $ulong >>= 8;
+  unshift (@octets, $ulong & 0xff); $ulong >>= 8;
+  unshift (@octets, $ulong & 0xff);
+  return join (".", @octets);
 }
 
 ###########################################################################

Modified: incubator/spamassassin/trunk/t/data/spam/009
==============================================================================
Binary files. No diff available.

Modified: incubator/spamassassin/trunk/t/uri.t
==============================================================================
--- incubator/spamassassin/trunk/t/uri.t	(original)
+++ incubator/spamassassin/trunk/t/uri.t	Thu May  6 21:36:42 2004
@@ -21,7 +21,7 @@
 use Mail::SpamAssassin::HTML;
 use Mail::SpamAssassin::Util;
 
-plan tests => 57;
+plan tests => 62;
 
 ##############################################
 
@@ -41,7 +41,14 @@
 my $msg = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
 
 my @uris = $msg->get_uri_list();
-ok((@uris == 1) && ($uris[0] eq 'http://62.16.101.59/livesex.htm'));
+print "got URIs: ".join (", ", @uris)."\n";
+ok (@uris >= 5);
+my %urimap = map { $_ => 1 } @uris;
+ok ($urimap{'http://62.16.101.59/livesex.htm'});
+ok ($urimap{'http://66.92.69.221/'});
+ok ($urimap{'http://66.92.69.222/'});
+ok ($urimap{'http://66.92.69.223/'});
+ok ($urimap{'http://66.92.69.224/'});
 
 ##############################################
 
@@ -54,7 +61,12 @@
     return !defined $result;
   }
 
-  return $expect eq $result;
+  if ($expect eq $result) {
+    return 1;
+  } else {
+    warn "try_domains: failed! expect: '$expect' got: '$result'\n";
+    return 0;
+  }
 }
 
 ok(try_domains('javascript:{some crap}', undef));

Re: svn commit: rev 10552 - in incubator/spamassassin/trunk: lib/Mail/SpamAssassin t t/data/spam

Posted by Theo Van Dinter <fe...@kluge.net>.
On Fri, May 07, 2004 at 11:28:50AM -0700, Justin Mason wrote:
> I'm assuming that, like an encoded URL, it should dupe to both an
> @ version and an undecoded version.

Hrm.  Ok.   We should probably do full decoding first, then add that to
the queue, then remove the username part and add that to the queue.

My original idea was that we'd have just the raw and "cooked" URIs
returned, and leave everything else up to calling functions.  But I can
see the argument about the username bit.

> sure, hack away ;)  looks likely to be better alright.

I'll do a few timings and see what falls out. :)   It's not like this
function is going to be called a lot, but ... ;)

-- 
Randomly Generated Tagline:
"Fruit, what's that?
  It's a thing you eat when you feel like you need to have fruit."
                         - Dave McClelland and Chris Smith

Re: svn commit: rev 10552 - in incubator/spamassassin/trunk: lib/Mail/SpamAssassin t t/data/spam

Posted by Theo Van Dinter <fe...@kluge.net>.
On Fri, May 07, 2004 at 04:36:43AM -0000, jm@apache.org wrote:
> +      # remove "www.fakehostname.com@" username part
> +      $host =~ s/^[^\@]+\@//gs;

IMO, get_uri_list shouldn't remove that; that should be up to whoever's calling.

> +      # "http://0x7f000001/"

does http://00000x7f000001/ work?  Just curious.

> +sub decode_ulong_to_ip {
> +  my ($ulong) = @_;
> +  my @octets = ();
> +  unshift (@octets, $ulong & 0xff); $ulong >>= 8;
> +  unshift (@octets, $ulong & 0xff); $ulong >>= 8;
> +  unshift (@octets, $ulong & 0xff); $ulong >>= 8;
> +  unshift (@octets, $ulong & 0xff);
> +  return join (".", @octets);

I don't know which is more efficient, but I always used (converted for
this function):

@octets = unpack("CCCC",pack("H*", sprintf "%08lx", $ulong));

-- 
Randomly Generated Tagline:
When I die, I would like to go peacefully, in my sleep, like my
 Grandfather did. Not screaming and yelling like the passenger in his car.