You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by pa...@apache.org on 2006/05/29 01:39:06 UTC
svn commit: r410004 - /spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm

Author: parker
Date: Sun May 28 16:39:03 2006
New Revision: 410004

URL: http://svn.apache.org/viewvc?rev=410004&view=rev
Log:
Bug 4902: Make multiple TextCat settings changeable via configuration

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm?rev=410004&r1=410003&r2=410004&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm Sun May 28 16:39:03 2006
@@ -53,28 +53,6 @@
 # language models
 my @nm;
 
-# TextCat settings
-my $opt_a = 10;
-my $opt_f = 0;
-my $opt_t = 400;
-my $opt_u = 1.05;
-
-# $opt_a  If the number of languages to be returned by &classify is larger
-#         than the value of $opt_a then an empty list is returned signifying
-#         that the language is unknown.
-#
-# $opt_f  Before sorting is performed, the ngrams which occur $opt_f times
-#         or less are removed.  This can be used to speed up the program for
-#         longer inputs.  For shorter inputs, this should be set to 0.
-#
-# $opt_t  This option indicates the maximum number of ngrams that should be
-#         compared with each of the language models (note that each of those
-#         models is used completely).
-#
-# $opt_u  &classify returns a list of the best-scoring language together with
-#         all languages which are less than $opt_u times worse.  Typical
-#         values are 1.05 or 1.1.
-
 sub new {
   my $class = shift;
   my $mailsaobject = shift;
@@ -309,6 +287,58 @@
     type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
   });
 
+=item textcat_max_languages N (default: 5)
+
+The maximum number of languages before the classification is considered unknown.
+
+=cut
+
+  push (@cmds, {
+    setting => 'textcat_max_languages',
+    default => 5,
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
+  });
+
+=item textcat_optimal_ngrams N (default: 0)
+
+Before sorting is performed, ngrams which occur this many times or fewer are
+removed.  This can be used to speed up the program for longer inputs.  For
+shorter inputs, this should be set to 0.
+
+=cut
+
+  push (@cmds, {
+    setting => 'textcat_optimal_ngrams',
+    default => 0,
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
+  });
+
+=item textcat_max_ngrams N (default: 400)
+
+The maximum number of ngrams that should be compared with each of the language
+models (note that each of those models is used completely).
+
+=cut
+
+  push (@cmds, {
+    setting => 'textcat_max_ngrams',
+    default => 400,
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
+  });
+
+=item textcat_acceptable_score N (default: 1.05)
+
+Include the best-scoring language in the returned list, together with every
+language whose score is less than C<textcat_acceptable_score> times the best
+score.  Typical values are 1.05 or 1.1.
+
+=cut
+
+  push (@cmds, {
+    setting => 'textcat_acceptable_score',
+    default => 1.05,
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
+  });
+
   $conf->{parser}->register_commands(\@cmds);
 }
 
@@ -351,13 +381,13 @@
 }
 
 sub classify {
-  my ($inputptr, %skip) = @_;
+  my ($inputptr, $conf, %skip) = @_;
   my %results;
-  my $maxp = $opt_t;
+  my $maxp = $conf->{textcat_max_ngrams};
 
   # create ngrams for input
   # limit to 10000 characters, enough for accuracy and still fast enough
-  my @unknown = create_lm($inputptr);
+  my @unknown = create_lm($inputptr, $conf);
 
   # test each language
   foreach my $ngram (@nm) {
@@ -380,10 +410,10 @@
   my $best = $results{$results[0]};
 
   my @answers = (shift(@results));
-  while (@results && $results{$results[0]} < ($opt_u * $best)) {
+  while (@results && $results{$results[0]} < ($conf->{textcat_acceptable_score} * $best)) {
     @answers = (@answers, shift(@results));
   }
-  if (@answers > $opt_a) {
+  if (@answers > $conf->{textcat_max_languages}) {
     dbg("textcat: can't determine language uniquely enough");
     return ();
   }
@@ -394,6 +424,7 @@
 }
 
 sub create_lm {
+  my ($inputptr, $conf) = @_;
   my %ngram;
   my @sorted;
 
@@ -414,17 +445,17 @@
     }
   }
 
-  if ($opt_f > 0) {
+  if ($conf->{textcat_optimal_ngrams} > 0) {
     # as suggested by Karel P. de Vos <k....@elsevier.nl> we speed
     # up sorting by removing singletons, however I have very bad
     # results for short inputs, this way
     @sorted = sort { $ngram{$b} <=> $ngram{$a} }
-		   (grep { $ngram{$_} > $opt_f } keys %ngram);
+		   (grep { $ngram{$_} > $conf->{textcat_optimal_ngrams} } keys %ngram);
   }
   else {
     @sorted = sort { $ngram{$b} <=> $ngram{$a} } keys %ngram;
   }
-  splice(@sorted, $opt_t) if (@sorted > $opt_t);
+  splice(@sorted, $conf->{textcat_max_ngrams}) if (@sorted > $conf->{textcat_max_ngrams});
 
   return @sorted;
 }
@@ -457,7 +488,7 @@
     $skip{$_} = 1 for split(' ', $opts->{conf}->{inactive_languages});
     delete $skip{$_} for split(' ', $opts->{conf}->{ok_languages});
     dbg("textcat: classifying, skipping: " . join(" ", keys %skip));
-    @matches = classify(\$body, %skip);
+    @matches = classify(\$body, $opts->{conf}, %skip);
   }
   else {
     dbg("textcat: message too short for language analysis");