You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by si...@apache.org on 2004/10/17 20:23:41 UTC

svn commit: rev 54970 - in spamassassin/branches/3.0/lib/Mail/SpamAssassin: . Message

Author: sidney
Date: Sun Oct 17 11:23:40 2004
New Revision: 54970

Modified:
   spamassassin/branches/3.0/lib/Mail/SpamAssassin/Message/Metadata.pm
   spamassassin/branches/3.0/lib/Mail/SpamAssassin/TextCat.pm
Log:
bug 3776: restrict the size of the message body fed to TextCat to 10 kbytes, which is enough for reliable classification and prevents excessive time and memory consumption

Modified: spamassassin/branches/3.0/lib/Mail/SpamAssassin/Message/Metadata.pm
==============================================================================
--- spamassassin/branches/3.0/lib/Mail/SpamAssassin/Message/Metadata.pm	(original)
+++ spamassassin/branches/3.0/lib/Mail/SpamAssassin/Message/Metadata.pm	Sun Oct 17 11:23:40 2004
@@ -110,18 +110,30 @@
   $body = join ("\n", @{$body});
   $body =~ s/^Subject://i;
 
+  my $len = length($body);
+
+  # truncate after 10k; that should be plenty to classify it
+  if ($len > 10000) {
+    substr ($body, 10000) = '';
+    $len = 10000;
+  }
+
   # note body text length, since the check_languages() eval rule also
   # uses it
-  $self->{languages_body_len} = length($body);
+  $self->{languages_body_len} = $len;
 
   # need about 256 bytes for reasonably accurate match (experimentally derived)
-  if ($self->{languages_body_len} < 256) {
+  if ($len < 256) {
     dbg("Message too short for language analysis");
     $self->{textcat_matches} = [];
     return;
   }
 
-  my @matches = Mail::SpamAssassin::TextCat::classify($self, $body, $main->{languages_filename});
+  my @matches = Mail::SpamAssassin::TextCat::classify($self,
+                                \$body, $main->{languages_filename});
+
+  undef $body;          # free that memory
+
   $self->{textcat_matches} = \@matches;
   my $matches_str = join(' ', @matches);
 

Modified: spamassassin/branches/3.0/lib/Mail/SpamAssassin/TextCat.pm
==============================================================================
--- spamassassin/branches/3.0/lib/Mail/SpamAssassin/TextCat.pm	(original)
+++ spamassassin/branches/3.0/lib/Mail/SpamAssassin/TextCat.pm	Sun Oct 17 11:23:40 2004
@@ -24,7 +24,6 @@
 );
 
 my @nm;
-my $non_word_characters='0-9\s';
 
 # settings
 $opt_a = 10;
@@ -49,12 +48,12 @@
 #         values are 1.05 or 1.1.
 
 sub classify {
-  my ($self, $input, $languages_filename) = @_;
+  my ($self, $inputptr, $languages_filename) = @_;
   my %results;
   my $maxp = $opt_t;
 
   # create ngrams for input
-  my @unknown = create_lm($input);
+  my @unknown = create_lm($inputptr);
 
   # load language models once
   if (! @nm) {
@@ -123,20 +122,20 @@
   my %ngram;
   my @sorted;
 
-  ($_) = @_;
-
-  for (split("[$non_word_characters]+")) {
-    $_ = "\000" . $_ . "\000";
-    my $len = length($_);
+  # my $non_word_characters = qr/[0-9\s]/;
+  for my $word (split(/[0-9\s]+/, ${$_[0]}))
+  {
+    $word = "\000" . $word . "\000";
+    my $len = length($word);
     my $flen = $len;
     my $i;
     for ($i = 0; $i < $flen; $i++) {
       $len--;
-      $ngram{substr($_, $i, 1)}++;
-      ($len < 1) ? next : $ngram{substr($_, $i, 2)}++;
-      ($len < 2) ? next : $ngram{substr($_, $i, 3)}++;
-      ($len < 3) ? next : $ngram{substr($_, $i, 4)}++;
-      if ($len > 3) { $ngram{substr($_, $i, 5)}++ };
+      $ngram{substr($word, $i, 1)}++;
+      ($len < 1) ? next : $ngram{substr($word, $i, 2)}++;
+      ($len < 2) ? next : $ngram{substr($word, $i, 3)}++;
+      ($len < 3) ? next : $ngram{substr($word, $i, 4)}++;
+      if ($len > 3) { $ngram{substr($word, $i, 5)}++ };
     }
   }