You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2011/05/09 20:04:41 UTC

svn commit: r1101127 - /spamassassin/branches/3.3/lib/Mail/SpamAssassin/Plugin/TextCat.pm

Author: hege
Date: Mon May  9 18:04:40 2011
New Revision: 1101127

URL: http://svn.apache.org/viewvc?rev=1101127&view=rev
Log:
Bug 6229: TextCat is too case sensitive

Modified:
    spamassassin/branches/3.3/lib/Mail/SpamAssassin/Plugin/TextCat.pm

Modified: spamassassin/branches/3.3/lib/Mail/SpamAssassin/Plugin/TextCat.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.3/lib/Mail/SpamAssassin/Plugin/TextCat.pm?rev=1101127&r1=1101126&r2=1101127&view=diff
==============================================================================
--- spamassassin/branches/3.3/lib/Mail/SpamAssassin/Plugin/TextCat.pm (original)
+++ spamassassin/branches/3.3/lib/Mail/SpamAssassin/Plugin/TextCat.pm Mon May  9 18:04:40 2011
@@ -294,7 +294,7 @@ Rhaeto-Romance, Sanskrit, Scots, Sloveni
     type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
   });
 
-=item textcat_max_languages N (default: 5)
+=item textcat_max_languages N (default: 3)
 
 The maximum number of languages before the classification is considered unknown.
 
@@ -302,7 +302,7 @@ The maximum number of languages before t
 
   push (@cmds, {
     setting => 'textcat_max_languages',
-    default => 5,
+    default => 3,
     type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
   });
 
@@ -333,16 +333,16 @@ models (note that each of those models i
     type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
   });
 
-=item textcat_acceptable_score N (default: 1.05)
+=item textcat_acceptable_score N (default: 1.02)
 
 Include any language that scores at least C<textcat_acceptable_score> in the
-returned list of languages
+returned list of languages.
 
 =cut
 
   push (@cmds, {
     setting => 'textcat_acceptable_score',
-    default => 1.05,
+    default => 1.02,
     type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
   });
 
@@ -443,6 +443,11 @@ sub create_lm {
   # my $non_word_characters = qr/[0-9\s]/;
   for my $word (split(/[0-9\s]+/, ${$_[0]}))
   {
+    # Bug 6229: The current TextCat database only works well with
+    # lowercase input, so let's work around it until it's properly
+    # generated and/or locale issues are resolved.
+    $word =~ tr/A-Z\xc0-\xd6\xd8-\xde/a-z\xe0-\xf6\xf8-\xfe/
+        if $word =~ /[A-Z]/ && $word =~ /[a-zA-Z\xc0-\xd6\xd8-\xde\xe0-\xf6\xf8-\xfe]{4}/;
     $word = "\000" . $word . "\000";
     my $len = length($word);
     my $flen = $len;