You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2007/02/13 18:04:49 UTC

svn commit: r507101 - in /spamassassin/branches/3.1: lib/Mail/SpamAssassin/Constants.pm lib/Mail/SpamAssassin/EvalTests.pm lib/Mail/SpamAssassin/HTML.pm lib/Mail/SpamAssassin/PerMsgStatus.pm rules/20_body_tests.cf

Author: felicity
Date: Tue Feb 13 09:04:48 2007
New Revision: 507101

URL: http://svn.apache.org/viewvc?view=rev&rev=507101
Log:
bug 5318: set a maximum internal length for URIs

Modified:
    spamassassin/branches/3.1/lib/Mail/SpamAssassin/Constants.pm
    spamassassin/branches/3.1/lib/Mail/SpamAssassin/EvalTests.pm
    spamassassin/branches/3.1/lib/Mail/SpamAssassin/HTML.pm
    spamassassin/branches/3.1/lib/Mail/SpamAssassin/PerMsgStatus.pm
    spamassassin/branches/3.1/rules/20_body_tests.cf

Modified: spamassassin/branches/3.1/lib/Mail/SpamAssassin/Constants.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.1/lib/Mail/SpamAssassin/Constants.pm?view=diff&rev=507101&r1=507100&r2=507101
==============================================================================
--- spamassassin/branches/3.1/lib/Mail/SpamAssassin/Constants.pm (original)
+++ spamassassin/branches/3.1/lib/Mail/SpamAssassin/Constants.pm Tue Feb 13 09:04:48 2007
@@ -38,6 +38,7 @@
 	META_TEST_MIN_PRIORITY HARVEST_DNSBL_PRIORITY MBX_SEPARATOR
 	MAX_BODY_LINE_LENGTH MAX_HEADER_KEY_LENGTH MAX_HEADER_VALUE_LENGTH
 	MAX_HEADER_LENGTH ARITH_EXPRESSION_LEXER AI_TIME_UNKNOWN
+	MAX_URI_LENGTH
 );
 
 %EXPORT_TAGS = (
@@ -282,6 +283,9 @@
 use constant MAX_HEADER_VALUE_LENGTH => 8192;
 # maximum byte length of entire header
 use constant MAX_HEADER_LENGTH => 65536;
+
+# maximum byte length of any given URI
+use constant MAX_URI_LENGTH => 1024;
 
 # used for meta rules and "if" conditionals in Conf::Parser
 use constant ARITH_EXPRESSION_LEXER => qr/(?:

Modified: spamassassin/branches/3.1/lib/Mail/SpamAssassin/EvalTests.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.1/lib/Mail/SpamAssassin/EvalTests.pm?view=diff&rev=507101&r1=507100&r2=507101
==============================================================================
--- spamassassin/branches/3.1/lib/Mail/SpamAssassin/EvalTests.pm (original)
+++ spamassassin/branches/3.1/lib/Mail/SpamAssassin/EvalTests.pm Tue Feb 13 09:04:48 2007
@@ -3167,4 +3167,9 @@
   return 0;
 }
 
+sub check_uri_truncated {
+  my $self = shift;
+  return $self->{'uri_truncated'};
+}
+ 
 1;

Modified: spamassassin/branches/3.1/lib/Mail/SpamAssassin/HTML.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.1/lib/Mail/SpamAssassin/HTML.pm?view=diff&rev=507101&r1=507100&r2=507101
==============================================================================
--- spamassassin/branches/3.1/lib/Mail/SpamAssassin/HTML.pm (original)
+++ spamassassin/branches/3.1/lib/Mail/SpamAssassin/HTML.pm Tue Feb 13 09:04:48 2007
@@ -26,6 +26,7 @@
 
 use HTML::Parser 3.24 ();
 use Mail::SpamAssassin::Logger;
+use Mail::SpamAssassin::Constants qw(:sa);
 
 use vars qw($re_loose $re_strict $re_other @ISA @EXPORT @EXPORT_OK);
 
@@ -143,6 +144,7 @@
   $self->put_results(anchor => $self->{anchor});
 
   $self->put_results(uri_detail => $self->{uri});
+  $self->put_results(uri_truncated => $self->{uri_truncated});
 
   # final results scalars
   $self->put_results(image_area => $self->{image_area});
@@ -359,9 +361,7 @@
 sub push_uri {
   my ($self, $type, $uri) = @_;
 
-  # URIs don't have leading/trailing whitespace ...
-  $uri =~ s/^\s+//;
-  $uri =~ s/\s+$//;
+  $uri = $self->canon_uri($uri);
 
   my $target = target_uri($self->{base_href} || "", $uri);
 
@@ -371,6 +371,22 @@
   }
 }
 
+sub canon_uri {
+  my ($self, $uri) = @_;
+
+  # URIs don't have leading/trailing whitespace ...
+  $uri =~ s/^\s+//;
+  $uri =~ s/\s+$//;
+
+  # Make sure all the URIs are nice and short
+  if (length $uri > MAX_URI_LENGTH) {
+    $self->{'uri_truncated'} = 1;
+    $uri = substr $uri, 0, MAX_URI_LENGTH;
+  }
+
+  return $uri;
+}
+
 sub html_uri {
   my ($self, $tag, $attr) = @_;
 
@@ -397,6 +413,8 @@
   }
   elsif ($tag eq "base") {
     if (my $uri = $attr->{href}) {
+      $uri = $self->canon_uri($uri);
+
       # use <BASE HREF="URI"> to turn relative links into absolute links
 
       # even if it is a base URI, handle like a normal URI as well
@@ -690,7 +708,7 @@
 
   # special text delimiters - <a> and <title>
   if ($tag eq "a") {
-    $self->{anchor_last} = (exists $attr->{href} ? $attr->{href} : "");
+    $self->{anchor_last} = (exists $attr->{href} ? $self->canon_uri($attr->{href}) : "");
     push(@{$self->{uri}->{$self->{anchor_last}}->{anchor_text}}, '');
     push(@{$self->{anchor}}, '');
   }

Modified: spamassassin/branches/3.1/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.1/lib/Mail/SpamAssassin/PerMsgStatus.pm?view=diff&rev=507101&r1=507100&r2=507101
==============================================================================
--- spamassassin/branches/3.1/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/branches/3.1/lib/Mail/SpamAssassin/PerMsgStatus.pm Tue Feb 13 09:04:48 2007
@@ -2009,6 +2009,7 @@
   # get URIs from HTML parsing
   # use the metadata version since $self->{html} may not be setup
   my $detail = $self->{msg}->{metadata}->{html}->{uri_detail} || { };
+  $self->{'uri_truncated'} = 1 if $self->{msg}->{metadata}->{html}->{uri_truncated};
 
   # don't keep dereferencing ...
   my $redirector_patterns = $self->{conf}->{redirector_patterns};
@@ -2143,6 +2144,14 @@
 
         #warn("uri: got URI: $uri\n");
         push @uris, $uri;
+      }
+    }
+
+    # Make sure all the URIs are nice and short
+    foreach my $uri ( @uris ) {
+      if (length $uri > MAX_URI_LENGTH) {
+        $self->{'uri_truncated'} = 1;
+        $uri = substr $uri, 0, MAX_URI_LENGTH;
       }
     }
 

Modified: spamassassin/branches/3.1/rules/20_body_tests.cf
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.1/rules/20_body_tests.cf?view=diff&rev=507101&r1=507100&r2=507101
==============================================================================
--- spamassassin/branches/3.1/rules/20_body_tests.cf (original)
+++ spamassassin/branches/3.1/rules/20_body_tests.cf Tue Feb 13 09:04:48 2007
@@ -148,3 +148,6 @@
 describe INTERRUPTUS          Message looks to contain HTML-interrupted text
 
 body MULTIPART_ALT_NON_TEXT     eval:check_ma_non_text()
+
+body URI_TRUNCATED	eval:check_uri_truncated()
+describe URI_TRUNCATED	Message contained a URI which was truncated