You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by si...@apache.org on 2004/10/17 20:23:41 UTC
svn commit: rev 54970 - in spamassassin/branches/3.0/lib/Mail/SpamAssassin: . Message
Author: sidney
Date: Sun Oct 17 11:23:40 2004
New Revision: 54970
Modified:
spamassassin/branches/3.0/lib/Mail/SpamAssassin/Message/Metadata.pm
spamassassin/branches/3.0/lib/Mail/SpamAssassin/TextCat.pm
Log:
bug 3776: restrict the size of the message body fed to TextCat to 10 kbytes, which is enough for reliable classification and prevents excessive time and memory consumption
Modified: spamassassin/branches/3.0/lib/Mail/SpamAssassin/Message/Metadata.pm
==============================================================================
--- spamassassin/branches/3.0/lib/Mail/SpamAssassin/Message/Metadata.pm (original)
+++ spamassassin/branches/3.0/lib/Mail/SpamAssassin/Message/Metadata.pm Sun Oct 17 11:23:40 2004
@@ -110,18 +110,30 @@
$body = join ("\n", @{$body});
$body =~ s/^Subject://i;
+ my $len = length($body);
+
+ # truncate after 10k; that should be plenty to classify it
+ if ($len > 10000) {
+ substr ($body, 10000) = '';
+ $len = 10000;
+ }
+
# note body text length, since the check_languages() eval rule also
# uses it
- $self->{languages_body_len} = length($body);
+ $self->{languages_body_len} = $len;
# need about 256 bytes for reasonably accurate match (experimentally derived)
- if ($self->{languages_body_len} < 256) {
+ if ($len < 256) {
dbg("Message too short for language analysis");
$self->{textcat_matches} = [];
return;
}
- my @matches = Mail::SpamAssassin::TextCat::classify($self, $body, $main->{languages_filename});
+ my @matches = Mail::SpamAssassin::TextCat::classify($self,
+ \$body, $main->{languages_filename});
+
+ undef $body; # free that memory
+
$self->{textcat_matches} = \@matches;
my $matches_str = join(' ', @matches);
Modified: spamassassin/branches/3.0/lib/Mail/SpamAssassin/TextCat.pm
==============================================================================
--- spamassassin/branches/3.0/lib/Mail/SpamAssassin/TextCat.pm (original)
+++ spamassassin/branches/3.0/lib/Mail/SpamAssassin/TextCat.pm Sun Oct 17 11:23:40 2004
@@ -24,7 +24,6 @@
);
my @nm;
-my $non_word_characters='0-9\s';
# settings
$opt_a = 10;
@@ -49,12 +48,12 @@
# values are 1.05 or 1.1.
sub classify {
- my ($self, $input, $languages_filename) = @_;
+ my ($self, $inputptr, $languages_filename) = @_;
my %results;
my $maxp = $opt_t;
# create ngrams for input
- my @unknown = create_lm($input);
+ my @unknown = create_lm($inputptr);
# load language models once
if (! @nm) {
@@ -123,20 +122,20 @@
my %ngram;
my @sorted;
- ($_) = @_;
-
- for (split("[$non_word_characters]+")) {
- $_ = "\000" . $_ . "\000";
- my $len = length($_);
+ # my $non_word_characters = qr/[0-9\s]/;
+ for my $word (split(/[0-9\s]+/, ${$_[0]}))
+ {
+ $word = "\000" . $word . "\000";
+ my $len = length($word);
my $flen = $len;
my $i;
for ($i = 0; $i < $flen; $i++) {
$len--;
- $ngram{substr($_, $i, 1)}++;
- ($len < 1) ? next : $ngram{substr($_, $i, 2)}++;
- ($len < 2) ? next : $ngram{substr($_, $i, 3)}++;
- ($len < 3) ? next : $ngram{substr($_, $i, 4)}++;
- if ($len > 3) { $ngram{substr($_, $i, 5)}++ };
+ $ngram{substr($word, $i, 1)}++;
+ ($len < 1) ? next : $ngram{substr($word, $i, 2)}++;
+ ($len < 2) ? next : $ngram{substr($word, $i, 3)}++;
+ ($len < 3) ? next : $ngram{substr($word, $i, 4)}++;
+ if ($len > 3) { $ngram{substr($word, $i, 5)}++ };
}
}