You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2004/11/18 03:44:13 UTC
svn commit: rev 76212 - in spamassassin/trunk: lib/Mail/SpamAssassin rules
Author: felicity
Date: Wed Nov 17 18:44:11 2004
New Revision: 76212
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
spamassassin/trunk/rules/20_uri_tests.cf
spamassassin/trunk/rules/70_testing.cf
Log:
bug 3973: get_uri_list changes: add the undecoded (but still possibly obfuscated) URI, and add the URI with the user/pass removed. This makes uri rules a little easier to write, since they no longer need to deal with a potential user/pass in the RE.
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm Wed Nov 17 18:44:11 2004
@@ -835,6 +835,7 @@
# make sure we catch bad encoding tricks
my @nuris = ();
for my $uri (@uris) {
+ # we're interested in http:// and so on, skip mailto:
next if $uri =~ /^mailto:/i;
# sometimes we catch URLs on multiple lines
@@ -844,57 +845,59 @@
$uri =~ s/^\s+//;
$uri =~ s/\s+$//;
- # Make a copy so we don't trash the original
+ # Make a copy so we don't trash the original in the array
my $nuri = $uri;
# http:www.foo.biz -> http://www.foo.biz
$nuri =~ s#^(https?:)/{0,2}#$1//#i;
# http://www.foo.biz?id=3 -> http://www.foo.biz/?id=3
- $nuri =~ s/^(https?:\/\/[^\/\?]+)\?/$1\/?/;
+ $nuri =~ s@^(https?://[^/?]+)\?@$1/?@;
# deal with encoding of chars, this is just the set of printable
# chars minus ' ' (that is, dec 33-126, hex 21-7e)
$nuri =~ s/\&\#0*(3[3-9]|[4-9]\d|1[01]\d|12[0-6]);/sprintf "%c",$1/ge;
$nuri =~ s/\&\#x0*(2[1-9]|[3-6][a-f0-9]|7[0-9a-e]);/sprintf "%c",hex($1)/gei;
- # deal with wierd hostname parts
- if ($nuri =~ /^(https?:\/\/)([^\/]+)(\.?\/.*)$/i) {
+ # deal with the %## encoding
+ $nuri = Mail::SpamAssassin::Util::url_encode($nuri);
+
+ # put the new URI on the new list if it's different
+ if ($nuri ne $uri) {
+ push(@nuris, $nuri);
+ }
+
+ # deal with weird hostname parts, remove user/pass, etc.
+ if ($nuri =~ m{^(https?://)([^/]+)(\.?/.*)?$}i) {
my ($proto, $host, $rest) = ($1,$2,$3);
+ # not required
+ $rest ||= '';
+
# remove "www.fakehostname.com@" username part
- $host =~ s/^[^\@]+\@//gs;
+ if ($host =~ s/^[^\@]+\@//gs) {
+ push(@nuris, join ('', $proto, $host, $rest));
+ }
- # deal with 'http://213.172.0x1f.13/'; decode encoded octets
- if ($host =~ /^([0-9a-fx]*\.)([0-9a-fx]*\.)([0-9a-fx]*\.)([0-9a-fx]*)$/ix)
- {
+ # deal with 'http://213.172.0x1f.13/', decode encoded octets
+ if ($host =~ /^([0-9a-fx]*\.)([0-9a-fx]*\.)([0-9a-fx]*\.)([0-9a-fx]*)$/ix) {
my (@chunk) = ($1,$2,$3,$4);
for my $octet (0 .. 3) {
$chunk[$octet] =~ s/^0x([0-9a-f][0-9a-f])/sprintf "%d",hex($1)/gei;
}
- my $parsed = join ('', $proto, @chunk, $rest);
- if ($parsed ne $nuri) { push(@nuris, $parsed); }
+ push(@nuris, join ('', $proto, @chunk, $rest));
}
# "http://0x7f000001/"
- if ($host =~ /^0x[0-9a-f]+$/i) {
+ elsif ($host =~ /^0x[0-9a-f]+$/i) {
$host =~ s/^0x([0-9a-f]+)/sprintf "%d",hex($1)/gei;
- $host = decode_ulong_to_ip ($host);
- my $parsed = join ('', $proto, $host, $rest);
- push(@nuris, $parsed);
+ push(@nuris, join ('', $proto, decode_ulong_to_ip($host), $rest));
}
# "http://1113343453/"
- if ($host =~ /^[0-9]+$/) {
- $host = decode_ulong_to_ip ($host);
- my $parsed = join ('', $proto, $host, $rest);
- push(@nuris, $parsed);
+ elsif ($host =~ /^[0-9]+$/) {
+ push(@nuris, join ('', $proto, decode_ulong_to_ip($host), $rest));
}
- }
-
- $nuri = Mail::SpamAssassin::Util::url_encode($nuri);
- if ($nuri ne $uri) {
- push(@nuris, $nuri);
}
# deal with http redirectors. strip off one level of redirector
Modified: spamassassin/trunk/rules/20_uri_tests.cf
==============================================================================
--- spamassassin/trunk/rules/20_uri_tests.cf (original)
+++ spamassassin/trunk/rules/20_uri_tests.cf Wed Nov 17 18:44:11 2004
@@ -27,7 +27,7 @@
uri NUMERIC_HTTP_ADDR /^https?\:\/\/\d{7,}/is
describe NUMERIC_HTTP_ADDR Uses a numeric IP address in URL
-uri NORMAL_HTTP_TO_IP /^https?\:\/\/(?:\S*\@)?\d+\.\d+\.\d+\.\d+/i
+uri NORMAL_HTTP_TO_IP m{^https?://\d+\.\d+\.\d+\.\d+}i
describe NORMAL_HTTP_TO_IP Uses a dotted-decimal IP address in URL
# Theo sez:
@@ -47,7 +47,7 @@
describe HTTP_EXCESSIVE_ESCAPES Completely unnecessary %-escapes inside a URL
# bug 1801
-uri IP_LINK_PLUS /^https?\:\/\/(?:\S*\@)?\d+\.\d+\.\d+\.\d+.{0,20}(?:cgi|click|ads|id\=)/i
+uri IP_LINK_PLUS m{^https?://\d+\.\d+\.\d+\.\d+.{0,20}(?:cgi|click|ads|id=)}i
describe IP_LINK_PLUS Dotted-decimal IP address followed by CGI
uri REMOVE_PAGE /^https?:\/\/[^\/]+\/.*?remove/
Modified: spamassassin/trunk/rules/70_testing.cf
==============================================================================
--- spamassassin/trunk/rules/70_testing.cf (original)
+++ spamassassin/trunk/rules/70_testing.cf Wed Nov 17 18:44:11 2004
@@ -466,7 +466,7 @@
describe T_SPOOF_COM2COM a.com.b.com
# CDNs (Akamai (edgesuite), Speedera, and NYUD, so far) do this, so skip them.
-uri T_SPOOF_OURI m{^https?:/{0,2}(?:[^@/]+@)*?(?:[a-z0-9_-]+?\.){2,}(?:com|net|org|biz|info|edu|www)(?!\.(?:\w+\.)?(?:edgesuite|nyud|speedera)\.net)(?:\.[a-z0-9_%-]+?){2,}(?:(?::|%3a)\d+)?}i
+uri T_SPOOF_OURI m{^https?://(?:[a-z0-9_-]+?\.){2,}(?:com|net|org|biz|info|edu|www)(?!\.(?:\w+\.)?(?:edgesuite|nyud|speedera)\.net)(?:\.[a-z0-9_%-]+?){2,}(?:(?::|%3a)\d+)?}i
describe T_SPOOF_OURI URL has items in odd places
##########################################################################