You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by gb...@apache.org on 2020/06/17 14:38:24 UTC

svn commit: r1878925 - in /spamassassin/trunk: MANIFEST lib/Mail/SpamAssassin/Conf.pm lib/Mail/SpamAssassin/Plugin/Bayes.pm rules/60_bayes_stopwords.cf

Author: gbechis
Date: Wed Jun 17 14:38:23 2020
New Revision: 1878925

URL: http://svn.apache.org/viewvc?rev=1878925&view=rev
Log:
Make bayes stopwords configurable.
Default values unchanged, regexps generated starting
from Python nltk.corpus data.
fixes bz# 7720

Added:
    spamassassin/trunk/rules/60_bayes_stopwords.cf
Modified:
    spamassassin/trunk/MANIFEST
    spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm

Modified: spamassassin/trunk/MANIFEST
URL: http://svn.apache.org/viewvc/spamassassin/trunk/MANIFEST?rev=1878925&r1=1878924&r2=1878925&view=diff
==============================================================================
--- spamassassin/trunk/MANIFEST (original)
+++ spamassassin/trunk/MANIFEST Wed Jun 17 14:38:23 2020
@@ -149,6 +149,7 @@ rules/v341.pre
 rules/v342.pre
 rules/v343.pre
 rules/20_aux_tlds.cf
+rules/60_bayes_stopwords.cf
 rules-extras/README.txt
 rules-extras/10_uridnsbl_skip_financial.cf
 sa-awl.raw

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm?rev=1878925&r1=1878924&r2=1878925&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm Wed Jun 17 14:38:23 2020
@@ -5376,6 +5376,7 @@ sub feature_dns_block_rule { 1 } # suppo
 sub feature_compile_regexp { 1 } # Util::compile_regexp
 sub feature_meta_rules_matching { 1 } # meta rules_matching() expression
 sub feature_subjprefix { 1 } # add subject prefixes rule option
+sub feature_bayes_stopwords { 1 } # multi language stopwords in Bayes
 sub feature_get_host { 1 } # $pms->get() :host :domain :ip :revip # was implemented together with AskDNS::has_tag_header # Bug 7734
 sub has_tflags_nosubject { 1 } # tflags nosubject
 sub perl_min_version_5010000 { return $] >= 5.010000 }  # perl version check ("perl_version" not neatly backwards-compatible)

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm?rev=1878925&r1=1878924&r2=1878925&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm Wed Jun 17 14:38:23 2020
@@ -37,7 +37,32 @@ And the chi-square probability combiner
 
 The results are incorporated into SpamAssassin as the BAYES_* rules.
 
-=head1 METHODS
+=head1 ADMINISTRATOR SETTINGS
+
+=over 4
+
+=item bayes_stopword_languages lang             (default: en)
+
+Languages enabled in bayes stopwords processing, every language have a default stopwords regexp,
+tokens matching this regular expressions will not be considered in bayes processing.
+
+Custom regular expressions for additional languages can be defined in C<local.cf>.
+
+Custom regular expressions can be specified by using the C<bayes_stopword_lang> keyword like in
+the following example:
+
+ bayes_stopword_languages en
+ bayes_stopword_en (?:you|me)
+
+=back
+
+=over 4
+
+=item bayes_max_token_length (default: 15)
+
+Configure the maximum number of character a token could contain
+
+=back
 
 =cut
 
@@ -53,7 +78,7 @@ use Digest::SHA qw(sha1 sha1_hex);
 use Mail::SpamAssassin::Plugin;
 use Mail::SpamAssassin::PerMsgStatus;
 use Mail::SpamAssassin::Logger;
-use Mail::SpamAssassin::Util qw(untaint_var);
+use Mail::SpamAssassin::Util qw(compile_regexp untaint_var);
 
 # pick ONLY ONE of these combining implementations.
 use Mail::SpamAssassin::Bayes::CombineChi;
@@ -216,6 +241,8 @@ use constant REQUIRE_SIGNIFICANT_TOKENS_
 
 # How long a token should we hold onto?  (note: German speakers typically
 # will require a longer token than English ones.)
+# This is just a default value, option can be changed using
+# bayes_max_token_length option
 use constant MAX_TOKEN_LENGTH => 15;
 
 ###########################################################################
@@ -232,10 +259,68 @@ sub new {
   $self->{conf} = $main->{conf};
   $self->{use_ignores} = 1;
 
+  $self->set_config($self->{conf});
   $self->register_eval_rule("check_bayes");
   $self;
 }
 
+sub set_config {
+    my ($self, $conf) = @_;
+    my @cmds;
+    my $invalid_lang = 0;
+    my ($re, $def_lang);
+
+    push(@cmds, {
+      setting => 'bayes_max_token_length',
+      default => MAX_TOKEN_LENGTH,
+      type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
+    });
+
+    push(@cmds, {
+        setting => 'bayes_stopword_languages',
+        type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
+        is_admin => 1,
+        default => 'en',
+        code => sub {
+          my ($self, $key, $value, $line) = @_;
+          my @lng = split(/,/, $value);
+          foreach my $lang ( @lng ) {
+            dbg("bayes: stopwords for language $lang enabled");
+            if ($lang !~ /^([a-z]{2})$/) {
+              $invalid_lang = 1;
+            }
+          }
+          return $Mail::SpamAssassin::Conf::INVALID_VALUE unless $invalid_lang eq 0;
+          $self->{bayes_stopword_languages} = $value;
+        }
+    });
+    $conf->{parser}->register_commands(\@cmds);
+}
+
+sub parse_config {
+    my ($self, $opt) = @_;
+    my $languages = $self->{conf}->{bayes_stopword_languages};
+
+    if ($opt->{key} =~ /^bayes_stopword_([a-z]{2})$/i) {
+        $self->inhibit_further_callbacks();
+
+        my $lang = lc($1);
+        my @opts = split(/\s+/, $opt->{value});
+        foreach my $re (@opts)
+        {
+          my ($rec, $err) = compile_regexp($re, 0);
+          if (!$rec) {
+            warn "bayes: invalid regex for language $lang: $@\n";
+            return 0;
+          }
+          # dbg("bayes: setting regexp for language $lang");
+          $self->{conf}->{bayes_stopword}{$lang} = $rec
+        }
+        return 1;
+    }
+    return 0;
+}
+
 sub finish {
   my $self = shift;
   if ($self->{store}) {
@@ -1080,6 +1165,7 @@ sub tokenize {
     # dbg("bayes: token: %s", $token);
     $tokens{substr(sha1($token), -5)} = $token  if $token ne '';
   }
+  undef $self->{tokens};
 
   # return the keys == tokens ...
   return \%tokens;
@@ -1146,8 +1232,24 @@ sub _tokenize_line {
     # area, and it just slows us down to record them.
     # See http://wiki.apache.org/spamassassin/BayesStopList for more info.
     #
-    next if $len < 3 ||
-	($token =~ /^(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))$/i);
+    next if $len < 3;
+    foreach my $lang ( split /,/, $self->{conf}->{bayes_stopword_languages} ) {
+      if ( not defined $self->{conf}->{bayes_stopword}{$lang} ) {
+        dbg("Missing stopwords regexp for language $lang");
+        next;
+      }
+      # check regexp only once
+      next if(exists $self->{tokens}{$lang}{$token});
+      $self->{tokens}{$lang}{$token} = 1;
+      # dbg("bayes: using stopwords for language $lang");
+      if ($token =~ /^$self->{conf}->{bayes_stopword}{$lang}$/i) {
+        dbg("bayes: skipped token \"$token\" because it's in stopword list for language \"$lang\"");
+        next;
+      } else {
+        # XXX for debugging purposes
+        # dbg("bayes: using token \"$token\" not matching regexp \"$self->{conf}->{bayes_stopword}{$lang}\" for language \"$lang\"");
+      }
+    }
 
     # are we in the body?  If so, apply some body-specific breakouts
     if ($region == 1 || $region == 2) {
@@ -1166,7 +1268,7 @@ sub _tokenize_line {
     # used as part of split tokens such as "HTo:D*net" indicating that 
     # the domain ".net" appeared in the To header.
     #
-    if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) {
+    if ($len > $self->{main}->{conf}->{bayes_max_token_length} && $token !~ /\*/) {
 
       if (TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS && $token =~ /[\x80-\xBF]{2}/) {
 	# Bug 7135

Added: spamassassin/trunk/rules/60_bayes_stopwords.cf
URL: http://svn.apache.org/viewvc/spamassassin/trunk/rules/60_bayes_stopwords.cf?rev=1878925&view=auto
==============================================================================
--- spamassassin/trunk/rules/60_bayes_stopwords.cf (added)
+++ spamassassin/trunk/rules/60_bayes_stopwords.cf Wed Jun 17 14:38:23 2020
@@ -0,0 +1,31 @@
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+
+if (version >= 4.000000)
+ if can(Mail::SpamAssassin::Conf::feature_bayes_stopwords)
+  bayes_stopword_en (?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))$
+  # bayes_stopword_en (?:a(?:bo(?:ut|ve)|fter|gain(?:st)?|ren(?:\'t)?)|b(?:e(?:cause|en|fore|ing|low|tween)|oth)|couldn(?:\'t)?|d(?:idn(?:\'t)?|o(?:es(?:n(?:\'t)?)?|ing|n\'t|wn)|uring)|each|f(?:rom|urther)|h(?:a(?:dn(?:\'t)?|sn(?:\'t)?|v(?:e(?:n(?:\'t)?)?|ing))|er(?:s(?:elf)?|e)|imself)|i(?:nto|sn\'t|t(?:\'s|self))|just|m(?:ightn(?:\'t)?|o(?:re|st)|ustn(?:\'t)?|yself)|needn(?:\'t)?|o(?:n(?:ce|ly)|ther|urs(?:elves)?|ver)|s(?:ame|h(?:an(?:\'t)?|e\'s|ould(?:(?:\'ve|n(?:\'t)?))?)|ome|uch)|th(?:a(?:t(?:\'ll)?|n)|e(?:irs?|m(?:selves)?|re|se|[ny])|is|ose|rough)|un(?:der|til)|very|w(?:asn(?:\'t)?|ere(?:n(?:\'t)?)?|h(?:at|e(?:re|n)|i(?:ch|le)|om)|i(?:ll|th)|o(?:n\'t|uldn(?:\'t)?))|you(?:\'(?:ll|re|ve|d)|r(?:s(?:el(?:ves|f))?)?))
+
+  bayes_stopword_es (?:a(?:lg(?:un(?:as|os)|o)|ntes?)|c(?:o(?:mo|ntra)|ua(?:ndo|l))|d(?:esde|onde|urante)|e(?:ll(?:as?|os)|ntre|r(?:a(?:is|[ns])|es)|s(?:as|os|t(?:a(?:(?:ba(?:(?:is|[ns]))?|d(?:(?:as?|os?))?|mos|ndo|r(?:(?:emos|\xc3(?:\xa1[ns]?|\xa9(?:is)?|\xada(?:(?:is|mos|[ns]))?)))?|s))?|e(?:mos)?|o[sy]?|uv(?:i(?:e(?:r(?:a(?:(?:is|[ns]))?|on)|se(?:(?:is|[ns]))?)|mos|ste(?:is)?|\xc3\xa9(?:ramos|semos))|[eo])|\xc3(?:\xa1(?:(?:bamos|is|[ns]))?|\xa9(?:(?:is|[ns]))?))))|fu(?:e(?:r(?:a(?:(?:is|[ns]))?|on)|se(?:(?:is|[ns]))?)|i(?:mos|ste(?:is)?)|\xc3\xa9(?:ramos|semos))|h(?:a(?:b(?:i(?:d(?:as?|os?)|endo)|r(?:emos|\xc3(?:\xa1[ns]?|\xa9(?:is)?|\xada(?:(?:is|mos|[ns]))?))|\xc3(?:\xa9is|\xada(?:(?:is|mos|[ns]))?))|sta|y(?:a(?:(?:mos|[ns]))?|\xc3\xa1is))|emos|ub(?:i(?:e(?:r(?:a(?:(?:is|[ns]))?|on)|se(?:(?:is|[ns]))?)|mos|ste(?:is)?|\xc3\xa9(?:ramos|semos))|[eo]))|m(?:uchos?|\xc3\xad(?:as|os))|n(?:ada|osotr(?:as|os)|uestr(?:as?|os?))|otr(?:as?|os?)|p(?:ara|ero|o(?:co|rque))|quien(?:es)?|s(?:e(
 ?:a(?:mos|[ns])|ntid(?:(?:as?|os?))?|r(?:emos|\xc3(?:\xa1[ns]?|\xa9(?:is)?|\xada(?:(?:is|mos|[ns]))?))|\xc3\xa1is)|i(?:ente|ntiendo)|o(?:bre|is|mos)|uy(?:as?|os?))|t(?:a(?:mbi\xc3\xa9n|nto)|en(?:dr(?:emos|\xc3(?:\xa1[ns]?|\xa9(?:is)?|\xada(?:(?:is|mos|[ns]))?))|e(?:mos|d)|g(?:a(?:(?:mos|[ns]))?|\xc3\xa1is|o)|i(?:d(?:as?|os?)|endo)|\xc3(?:\xa9is|\xada(?:(?:is|mos|[ns]))?))|iene[ns]?|odos?|u(?:v(?:i(?:e(?:r(?:a(?:(?:is|[ns]))?|on)|se(?:(?:is|[ns]))?)|mos|ste(?:is)?|\xc3\xa9(?:ramos|semos))|[eo])|y(?:as?|os?)))|unos|v(?:osotr(?:as|os)|uestr(?:as?|os?))|\xc3\xa9ramos)
+
+  bayes_stopword_fr (?:a(?:ie(?:nt|s)|ur(?:a(?:(?:i(?:(?:ent|[st]))?|s))?|ez|i(?:ez|ons)|on[st])|v(?:ai(?:ent|[st])|e[cz]|i(?:ez|ons)|ons)|y(?:ant(?:(?:es?|s))?|ez|ons))|dans|e(?:lle|u(?:es|rent|ss(?:e(?:(?:nt|s))?|i(?:ez|ons)))|\xc3\xbb(?:mes|tes))|f(?:u(?:rent|ss(?:e(?:(?:nt|s))?|i(?:ez|ons)))|\xc3\xbb(?:mes|tes))|leur|m(?:ais|\xc3\xaame)|no(?:tre|us)|pour|s(?:er(?:a(?:(?:i(?:(?:ent|[st]))?|s))?|ez|i(?:ez|ons)|on[st])|o(?:i(?:ent|[st])|mmes|nt|y(?:ez|ons))|uis)|vo(?:tre|us)|\xc3(?:\xa9t(?:a(?:i(?:ent|[st])|nt(?:(?:es?|s))?)|i(?:ez|ons)|\xc3\xa9(?:es?|s))|\xaates))
+
+  bayes_stopword_de (?:a(?:ber|l(?:le[mnrs]?|so)|nder(?:(?:e[mnrs]?|[mnrs]))?|uch)|bist|d(?:a(?:mit|nn|ss(?:elbe)?|zu)|e(?:in(?:e[mnrs]?)?|mselben|n(?:selben|n)|r(?:er|selben?)|sse(?:lben|n))|i(?:ch|es(?:e(?:(?:lben?|[mnrs]))?)?)|o(?:ch|rt)|urch)|e(?:in(?:e[mnrs]?|ig(?:e[mnrs]?)?|mal)|twas|u(?:ch|er|re[mnrs]?))|ge(?:gen|wesen)|h(?:a(?:ben?|tten?)|i(?:er|nter))|i(?:h(?:nen|re[mnrs]?)|ndem)|je(?:de[mnrs]?|ne[mnrs]?|tzt)|k(?:ann|ein(?:e[mnrs]?)?|\xc3\xb6nn(?:en|te))|m(?:a(?:chen|nche[mnrs]?)|ein(?:e[mnrs]?)?|ich|uss(?:te)?)|n(?:ach|ichts?|och)|o(?:der|hne)|s(?:e(?:hr|in(?:e[mnrs]?)?|lbst)|i(?:ch|nd)|o(?:l(?:che[mnrs]?|l(?:te)?)|n(?:dern|st)))|un(?:ser(?:e[mns]?)?|ter)|viel|w(?:ar(?:en|st)|e(?:i(?:ter|l)|lche[mnrs]?|nn|rden?)|i(?:eder|ll|r(?:st|d))|oll(?:en|te)|\xc3(?:\xa4hrend|\xbcrden?))|zw(?:ar|ischen)|\xc3\xbcber)
+
+  bayes_stopword_it (?:a(?:bbia(?:(?:mo|no|te))?|gli|ll[aeo]|nche|v(?:e(?:mmo|ndo|s(?:s(?:e(?:ro)?|i(?:mo)?)|t[ei])|te|v(?:a(?:(?:mo|no|te))?|[io]))|r(?:a(?:nno|i)|e(?:bbe(?:ro)?|m(?:mo|o)|st[ei]|te|i)|\xc3[\xa0\xb2])|ut[aeio]))|co(?:me|ntro)|d(?:a(?:gli?|ll[aeo]?)|e(?:gli?|ll[aeo]?)|ove)|e(?:bb(?:e(?:ro)?|i)|ra(?:no|va(?:mo|te))|ssendo)|f(?:a(?:c(?:ci(?:a(?:(?:mo|no|te))?|o)|e(?:mmo|ndo|s(?:s(?:e(?:ro)?|i(?:mo)?)|t[ei])|v(?:a(?:(?:mo|no|te))?|[io])))|nno|r(?:a(?:nno|i)|e(?:bbe(?:ro)?|m(?:mo|o)|st[ei]|te|i)|\xc3[\xa0\xb2]))|ec(?:e(?:ro)?|i)|os(?:s(?:e(?:ro)?|i(?:mo)?)|t[ei])|u(?:mmo|rono))|hanno|loro|miei|n(?:e(?:gli?|ll[aeo]?)|ostr[aeio])|perch\xc3\xa9|qu(?:a(?:le|nt[aeio])|e(?:ll[aeio]|st[aeio]))|s(?:ar(?:a(?:nno|i)|e(?:bbe(?:ro)?|m(?:mo|o)|st[ei]|te|i)|\xc3[\xa0\xb2])|i(?:a(?:mo|no|te)|ete)|ono|t(?:a(?:n(?:do|no)|r(?:a(?:nno|i)|e(?:bbe(?:ro)?|m(?:mo|o)|st[ei]|te|i)|\xc3[\xa0\xb2])|v(?:a(?:(?:mo|no|te))?|[io])|i)|e(?:mmo|s(?:s(?:e(?:ro)?|i(?:mo)?)|t[ei])|tt(?:e(?:ro)?|i))|ia(?:(?:
 mo|no|te))?)|u(?:gli?|ll[aeo]?|oi))|tu(?:oi|tt[io])|vostr[aeio])
+ endif
+endif