You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2004/08/04 06:13:21 UTC

svn commit: rev 35665 - spamassassin/branches/b2_6_0/lib/Mail/SpamAssassin

Author: felicity
Date: Tue Aug  3 21:13:21 2004
New Revision: 35665

Modified:
   spamassassin/branches/b2_6_0/lib/Mail/SpamAssassin/Bayes.pm
   spamassassin/branches/b2_6_0/lib/Mail/SpamAssassin/PerMsgStatus.pm
Log:
Patch to truncate overly long headers (per-key, per-value, and total header size limits). Fixes performance problems on messages with very large headers.

Modified: spamassassin/branches/b2_6_0/lib/Mail/SpamAssassin/Bayes.pm
==============================================================================
--- spamassassin/branches/b2_6_0/lib/Mail/SpamAssassin/Bayes.pm	(original)
+++ spamassassin/branches/b2_6_0/lib/Mail/SpamAssassin/Bayes.pm	Tue Aug  3 21:13:21 2004
@@ -137,6 +137,15 @@
 use constant HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
 use constant BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
 
+# maximum byte length of a header key
+use constant MAX_HEADER_KEY_LENGTH => 256;
+
+# maximum byte length of a header value including continued lines
+use constant MAX_HEADER_VALUE_LENGTH => 8192;
+
+# maximum combined byte length of all headers (the entire header block)
+use constant MAX_HEADER_LENGTH => 65536;
+
 # We store header-mined tokens in the db with a "HHeaderName:val" format.
 # some headers may contain lots of gibberish tokens, so allow a little basic
 # compression by mapping the header name at least here.  these are the headers
@@ -432,7 +441,28 @@
 sub tokenize_headers {
   my ($self, $msg) = @_;
 
-  my $hdrs = $msg->get_all_headers();
+  my @hdrs = ();
+  my $length = 0;
+
+  my $hdr;
+  foreach $hdr ($msg->get_all_headers()) {
+    last if ($length + length($hdr) > MAX_HEADER_LENGTH);
+
+    my($key, $value) = split(/:/, $hdr, 2);
+
+    # limit the length of the pairs we store
+    if (length($key) > MAX_HEADER_KEY_LENGTH) {
+      $key = substr($key, 0, MAX_HEADER_KEY_LENGTH);
+    }
+    if (length($value) > MAX_HEADER_VALUE_LENGTH) {
+      $value = substr($value, 0, MAX_HEADER_VALUE_LENGTH);
+    }
+    push(@hdrs, "$key:$value");
+    $length += length "$key:$value";
+  }
+
+  my $hdrs = join('', @hdrs);
+  undef @hdrs;
 
   # jm: do not learn additional metadata (X-Languages, X-Relays-Untrusted)
   # until we can generate that while running sa-learn. TODO

Modified: spamassassin/branches/b2_6_0/lib/Mail/SpamAssassin/PerMsgStatus.pm
==============================================================================
--- spamassassin/branches/b2_6_0/lib/Mail/SpamAssassin/PerMsgStatus.pm	(original)
+++ spamassassin/branches/b2_6_0/lib/Mail/SpamAssassin/PerMsgStatus.pm	Tue Aug  3 21:13:21 2004
@@ -48,6 +48,16 @@
 
 use constant MAX_BODY_LINE_LENGTH =>	2048;
 
+# maximum byte length of a header key
+use constant MAX_HEADER_KEY_LENGTH => 256;
+
+# maximum byte length of a header value including continued lines
+use constant MAX_HEADER_VALUE_LENGTH => 8192;
+
+# maximum combined byte length of all headers (the entire header block)
+use constant MAX_HEADER_LENGTH => 65536;
+
+
 use vars qw{
   @ISA $base64alphabet
 };
@@ -1314,7 +1324,26 @@
     my $getraw = ($hdrname eq 'ALL' || $hdrname =~ s/:raw$//);
 
     if ($hdrname eq 'ALL') {
-      $_ = $self->{msg}->get_all_headers();
+      my @hdrs = ();
+      my $length = 0;
+
+      my $hdr;
+      foreach $hdr ($self->{msg}->get_all_headers()) {
+	last if ($length + length($hdr) > MAX_HEADER_LENGTH);
+
+	my($key, $value) = split(/:/, $hdr, 2);
+        # limit the length of the pairs we store
+        if (length($key) > MAX_HEADER_KEY_LENGTH) {
+          $key = substr($key, 0, MAX_HEADER_KEY_LENGTH);
+        }
+        if (length($value) > MAX_HEADER_VALUE_LENGTH) {
+          $value = substr($value, 0, MAX_HEADER_VALUE_LENGTH);
+        }
+	push(@hdrs, "$key:$value");
+	$length += length "$key:$value";
+      }
+
+      $_ = join('', @hdrs);
     }
     # ToCc: the combined recipients list
     elsif ($hdrname eq 'ToCc') {
@@ -1324,7 +1353,14 @@
 	$_ .= ", " if /\S/;
       }
       $_ .= join ("\n", $self->{msg}->get_header ('Cc'));
-      undef $_ if $_ eq '';
+      if ($_ eq '') {
+        undef $_;
+      }
+      else {
+        if (length($_) > MAX_HEADER_VALUE_LENGTH) {
+          $_ = substr($_, 0, MAX_HEADER_VALUE_LENGTH);
+        }
+      }
     }
     # MESSAGEID: handle lists which move the real message-id to another
     # header for resending.
@@ -1334,12 +1370,18 @@
 		$self->{msg}->get_header ('Resent-Message-Id'),
 		$self->{msg}->get_header ('X-Original-Message-ID'), # bug 2122
 		$self->{msg}->get_header ('Message-Id'));
+      if (length($_) > MAX_HEADER_VALUE_LENGTH) {
+        $_ = substr($_, 0, MAX_HEADER_VALUE_LENGTH);
+      }
     }
     # a conventional header
     else {
       my @hdrs = $self->{msg}->get_header ($hdrname);
       if ($#hdrs >= 0) {
 	$_ = join ("\n", @hdrs);
+        if (length($_) > MAX_HEADER_VALUE_LENGTH) {
+          $_ = substr($_, 0, MAX_HEADER_VALUE_LENGTH);
+        }
       }
       else {
 	$_ = undef;