You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by gb...@apache.org on 2020/06/17 14:38:24 UTC
svn commit: r1878925 - in /spamassassin/trunk: MANIFEST
lib/Mail/SpamAssassin/Conf.pm lib/Mail/SpamAssassin/Plugin/Bayes.pm
rules/60_bayes_stopwords.cf
Author: gbechis
Date: Wed Jun 17 14:38:23 2020
New Revision: 1878925
URL: http://svn.apache.org/viewvc?rev=1878925&view=rev
Log:
Make bayes stopwords configurable.
Default values unchanged, regexps generated starting
from Python nltk.corpus data.
fixes bz# 7720
Added:
spamassassin/trunk/rules/60_bayes_stopwords.cf
Modified:
spamassassin/trunk/MANIFEST
spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm
Modified: spamassassin/trunk/MANIFEST
URL: http://svn.apache.org/viewvc/spamassassin/trunk/MANIFEST?rev=1878925&r1=1878924&r2=1878925&view=diff
==============================================================================
--- spamassassin/trunk/MANIFEST (original)
+++ spamassassin/trunk/MANIFEST Wed Jun 17 14:38:23 2020
@@ -149,6 +149,7 @@ rules/v341.pre
rules/v342.pre
rules/v343.pre
rules/20_aux_tlds.cf
+rules/60_bayes_stopwords.cf
rules-extras/README.txt
rules-extras/10_uridnsbl_skip_financial.cf
sa-awl.raw
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm?rev=1878925&r1=1878924&r2=1878925&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm Wed Jun 17 14:38:23 2020
@@ -5376,6 +5376,7 @@ sub feature_dns_block_rule { 1 } # suppo
sub feature_compile_regexp { 1 } # Util::compile_regexp
sub feature_meta_rules_matching { 1 } # meta rules_matching() expression
sub feature_subjprefix { 1 } # add subject prefixes rule option
+sub feature_bayes_stopwords { 1 } # multi language stopwords in Bayes
sub feature_get_host { 1 } # $pms->get() :host :domain :ip :revip # was implemented together with AskDNS::has_tag_header # Bug 7734
sub has_tflags_nosubject { 1 } # tflags nosubject
sub perl_min_version_5010000 { return $] >= 5.010000 } # perl version check ("perl_version" not neatly backwards-compatible)
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm?rev=1878925&r1=1878924&r2=1878925&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm Wed Jun 17 14:38:23 2020
@@ -37,7 +37,32 @@ And the chi-square probability combiner
The results are incorporated into SpamAssassin as the BAYES_* rules.
-=head1 METHODS
+=head1 ADMINISTRATOR SETTINGS
+
+=over 4
+
+=item bayes_stopword_languages lang (default: en)
+
+Comma-separated list of two-letter language codes enabled in Bayes stopword processing. Every language has a
+default stopwords regexp; tokens matching these regular expressions will not be considered in Bayes processing.
+
+Custom regular expressions for additional languages can be defined in C<local.cf>.
+
+Custom regular expressions can be specified with the C<bayes_stopword_lang> keyword, where C<lang> is the
+two-letter language code, as in the following example:
+
+ bayes_stopword_languages en
+ bayes_stopword_en (?:you|me)
+
+=back
+
+=over 4
+
+=item bayes_max_token_length (default: 15)
+
+Configure the maximum number of characters a token can contain.
+
+=back
=cut
@@ -53,7 +78,7 @@ use Digest::SHA qw(sha1 sha1_hex);
use Mail::SpamAssassin::Plugin;
use Mail::SpamAssassin::PerMsgStatus;
use Mail::SpamAssassin::Logger;
-use Mail::SpamAssassin::Util qw(untaint_var);
+use Mail::SpamAssassin::Util qw(compile_regexp untaint_var);
# pick ONLY ONE of these combining implementations.
use Mail::SpamAssassin::Bayes::CombineChi;
@@ -216,6 +241,8 @@ use constant REQUIRE_SIGNIFICANT_TOKENS_
# How long a token should we hold onto? (note: German speakers typically
# will require a longer token than English ones.)
+# This is just the default value; it can be changed with the
+# bayes_max_token_length option.
use constant MAX_TOKEN_LENGTH => 15;
###########################################################################
@@ -232,10 +259,68 @@ sub new {
$self->{conf} = $main->{conf};
$self->{use_ignores} = 1;
+ $self->set_config($self->{conf});
$self->register_eval_rule("check_bayes");
$self;
}
+# Registers this plugin's configuration settings with the config parser.
+#
+# Parameters:
+#   $self - the plugin object
+#   $conf - configuration object; the command list is handed to its
+#           {parser} via register_commands()
+#
+# Settings registered:
+#   bayes_max_token_length   - numeric, defaults to MAX_TOKEN_LENGTH
+#   bayes_stopword_languages - admin-only string, defaults to 'en'; a
+#                              comma-separated list in which every entry
+#                              must be exactly two lowercase letters
+sub set_config {
+  my ($self, $conf) = @_;
+  my @cmds;
+
+  push(@cmds, {
+    setting => 'bayes_max_token_length',
+    default => MAX_TOKEN_LENGTH,
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
+  });
+
+  push(@cmds, {
+    setting => 'bayes_stopword_languages',
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
+    is_admin => 1,
+    default => 'en',
+    code => sub {
+      # NOTE(review): the first argument is the object the value is stored
+      # on (presumably the Conf object, since parse_config reads the value
+      # back from $self->{conf}) -- confirm against the parser.
+      my ($self, $key, $value, $line) = @_;
+      my @langs = split(/,/, $value);
+
+      # Validate with purely local state.  A flag shared through the
+      # closure would stay set after the first bad line and make every
+      # later, valid line be rejected as well.
+      foreach my $lang (@langs) {
+        return $Mail::SpamAssassin::Conf::INVALID_VALUE
+          if $lang !~ /^[a-z]{2}$/;
+      }
+
+      # Only announce languages once the whole list has been accepted.
+      foreach my $lang (@langs) {
+        dbg("bayes: stopwords for language $lang enabled");
+      }
+      $self->{bayes_stopword_languages} = $value;
+    }
+  });
+  $conf->{parser}->register_commands(\@cmds);
+}
+
+# Handles "bayes_stopword_<lang>" configuration lines, where <lang> is a
+# two-letter language code (matched case-insensitively, stored lowercase).
+#
+# Parameters:
+#   $self - the plugin object
+#   $opt  - hash ref for the line being parsed; {key} and {value} are read
+#
+# Returns 1 when the key is a bayes_stopword_<lang> setting and every
+# regexp in its value compiled; returns 0 when the key is not handled here
+# or a regexp failed to compile (a warning is emitted in the latter case).
+sub parse_config {
+  my ($self, $opt) = @_;
+
+  if ($opt->{key} =~ /^bayes_stopword_([a-z]{2})$/i) {
+    $self->inhibit_further_callbacks();
+
+    my $lang = lc($1);
+    foreach my $re (split(/\s+/, $opt->{value})) {
+      my ($rec, $err) = compile_regexp($re, 0);
+      if (!$rec) {
+        # Report the error captured from compile_regexp(); nothing in this
+        # sub sets $@, so interpolating it would hide the actual reason.
+        warn "bayes: invalid regex for language $lang: $err\n";
+        return 0;
+      }
+      # dbg("bayes: setting regexp for language $lang");
+      # NOTE(review): each whitespace-separated piece replaces the previous
+      # one, so only the last regexp on the line is kept -- confirm that
+      # this is intended.
+      $self->{conf}->{bayes_stopword}{$lang} = $rec;
+    }
+    return 1;
+  }
+  return 0;
+}
+
sub finish {
my $self = shift;
if ($self->{store}) {
@@ -1080,6 +1165,7 @@ sub tokenize {
# dbg("bayes: token: %s", $token);
$tokens{substr(sha1($token), -5)} = $token if $token ne '';
}
+ undef $self->{tokens};
# return the keys == tokens ...
return \%tokens;
@@ -1146,8 +1232,24 @@ sub _tokenize_line {
# area, and it just slows us down to record them.
# See http://wiki.apache.org/spamassassin/BayesStopList for more info.
#
- next if $len < 3 ||
- ($token =~ /^(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))$/i);
+ next if $len < 3;
+ foreach my $lang ( split /,/, $self->{conf}->{bayes_stopword_languages} ) {
+ if ( not defined $self->{conf}->{bayes_stopword}{$lang} ) {
+ dbg("Missing stopwords regexp for language $lang");
+ next;
+ }
+ # check regexp only once
+ next if(exists $self->{tokens}{$lang}{$token});
+ $self->{tokens}{$lang}{$token} = 1;
+ # dbg("bayes: using stopwords for language $lang");
+ if ($token =~ /^$self->{conf}->{bayes_stopword}{$lang}$/i) {
+ dbg("bayes: skipped token \"$token\" because it's in stopword list for language \"$lang\"");
+ next;
+ } else {
+ # XXX for debugging purposes
+ # dbg("bayes: using token \"$token\" not matching regexp \"$self->{conf}->{bayes_stopword}{$lang}\" for language \"$lang\"");
+ }
+ }
# are we in the body? If so, apply some body-specific breakouts
if ($region == 1 || $region == 2) {
@@ -1166,7 +1268,7 @@ sub _tokenize_line {
# used as part of split tokens such as "HTo:D*net" indicating that
# the domain ".net" appeared in the To header.
#
- if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) {
+ if ($len > $self->{main}->{conf}->{bayes_max_token_length} && $token !~ /\*/) {
if (TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS && $token =~ /[\x80-\xBF]{2}/) {
# Bug 7135
Added: spamassassin/trunk/rules/60_bayes_stopwords.cf
URL: http://svn.apache.org/viewvc/spamassassin/trunk/rules/60_bayes_stopwords.cf?rev=1878925&view=auto
==============================================================================
--- spamassassin/trunk/rules/60_bayes_stopwords.cf (added)
+++ spamassassin/trunk/rules/60_bayes_stopwords.cf Wed Jun 17 14:38:23 2020
@@ -0,0 +1,31 @@
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+
+if (version >= 4.000000)
+ if can(Mail::SpamAssassin::Conf::feature_bayes_stopwords)
+ bayes_stopword_en (?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))$
+ # bayes_stopword_en (?:a(?:bo(?:ut|ve)|fter|gain(?:st)?|ren(?:\'t)?)|b(?:e(?:cause|en|fore|ing|low|tween)|oth)|couldn(?:\'t)?|d(?:idn(?:\'t)?|o(?:es(?:n(?:\'t)?)?|ing|n\'t|wn)|uring)|each|f(?:rom|urther)|h(?:a(?:dn(?:\'t)?|sn(?:\'t)?|v(?:e(?:n(?:\'t)?)?|ing))|er(?:s(?:elf)?|e)|imself)|i(?:nto|sn\'t|t(?:\'s|self))|just|m(?:ightn(?:\'t)?|o(?:re|st)|ustn(?:\'t)?|yself)|needn(?:\'t)?|o(?:n(?:ce|ly)|ther|urs(?:elves)?|ver)|s(?:ame|h(?:an(?:\'t)?|e\'s|ould(?:(?:\'ve|n(?:\'t)?))?)|ome|uch)|th(?:a(?:t(?:\'ll)?|n)|e(?:irs?|m(?:selves)?|re|se|[ny])|is|ose|rough)|un(?:der|til)|very|w(?:asn(?:\'t)?|ere(?:n(?:\'t)?)?|h(?:at|e(?:re|n)|i(?:ch|le)|om)|i(?:ll|th)|o(?:n\'t|uldn(?:\'t)?))|you(?:\'(?:ll|re|ve|d)|r(?:s(?:el(?:ves|f))?)?))
+
+ bayes_stopword_es (?:a(?:lg(?:un(?:as|os)|o)|ntes?)|c(?:o(?:mo|ntra)|ua(?:ndo|l))|d(?:esde|onde|urante)|e(?:ll(?:as?|os)|ntre|r(?:a(?:is|[ns])|es)|s(?:as|os|t(?:a(?:(?:ba(?:(?:is|[ns]))?|d(?:(?:as?|os?))?|mos|ndo|r(?:(?:emos|\xc3(?:\xa1[ns]?|\xa9(?:is)?|\xada(?:(?:is|mos|[ns]))?)))?|s))?|e(?:mos)?|o[sy]?|uv(?:i(?:e(?:r(?:a(?:(?:is|[ns]))?|on)|se(?:(?:is|[ns]))?)|mos|ste(?:is)?|\xc3\xa9(?:ramos|semos))|[eo])|\xc3(?:\xa1(?:(?:bamos|is|[ns]))?|\xa9(?:(?:is|[ns]))?))))|fu(?:e(?:r(?:a(?:(?:is|[ns]))?|on)|se(?:(?:is|[ns]))?)|i(?:mos|ste(?:is)?)|\xc3\xa9(?:ramos|semos))|h(?:a(?:b(?:i(?:d(?:as?|os?)|endo)|r(?:emos|\xc3(?:\xa1[ns]?|\xa9(?:is)?|\xada(?:(?:is|mos|[ns]))?))|\xc3(?:\xa9is|\xada(?:(?:is|mos|[ns]))?))|sta|y(?:a(?:(?:mos|[ns]))?|\xc3\xa1is))|emos|ub(?:i(?:e(?:r(?:a(?:(?:is|[ns]))?|on)|se(?:(?:is|[ns]))?)|mos|ste(?:is)?|\xc3\xa9(?:ramos|semos))|[eo]))|m(?:uchos?|\xc3\xad(?:as|os))|n(?:ada|osotr(?:as|os)|uestr(?:as?|os?))|otr(?:as?|os?)|p(?:ara|ero|o(?:co|rque))|quien(?:es)?|s(?:e(
?:a(?:mos|[ns])|ntid(?:(?:as?|os?))?|r(?:emos|\xc3(?:\xa1[ns]?|\xa9(?:is)?|\xada(?:(?:is|mos|[ns]))?))|\xc3\xa1is)|i(?:ente|ntiendo)|o(?:bre|is|mos)|uy(?:as?|os?))|t(?:a(?:mbi\xc3\xa9n|nto)|en(?:dr(?:emos|\xc3(?:\xa1[ns]?|\xa9(?:is)?|\xada(?:(?:is|mos|[ns]))?))|e(?:mos|d)|g(?:a(?:(?:mos|[ns]))?|\xc3\xa1is|o)|i(?:d(?:as?|os?)|endo)|\xc3(?:\xa9is|\xada(?:(?:is|mos|[ns]))?))|iene[ns]?|odos?|u(?:v(?:i(?:e(?:r(?:a(?:(?:is|[ns]))?|on)|se(?:(?:is|[ns]))?)|mos|ste(?:is)?|\xc3\xa9(?:ramos|semos))|[eo])|y(?:as?|os?)))|unos|v(?:osotr(?:as|os)|uestr(?:as?|os?))|\xc3\xa9ramos)
+
+ bayes_stopword_fr (?:a(?:ie(?:nt|s)|ur(?:a(?:(?:i(?:(?:ent|[st]))?|s))?|ez|i(?:ez|ons)|on[st])|v(?:ai(?:ent|[st])|e[cz]|i(?:ez|ons)|ons)|y(?:ant(?:(?:es?|s))?|ez|ons))|dans|e(?:lle|u(?:es|rent|ss(?:e(?:(?:nt|s))?|i(?:ez|ons)))|\xc3\xbb(?:mes|tes))|f(?:u(?:rent|ss(?:e(?:(?:nt|s))?|i(?:ez|ons)))|\xc3\xbb(?:mes|tes))|leur|m(?:ais|\xc3\xaame)|no(?:tre|us)|pour|s(?:er(?:a(?:(?:i(?:(?:ent|[st]))?|s))?|ez|i(?:ez|ons)|on[st])|o(?:i(?:ent|[st])|mmes|nt|y(?:ez|ons))|uis)|vo(?:tre|us)|\xc3(?:\xa9t(?:a(?:i(?:ent|[st])|nt(?:(?:es?|s))?)|i(?:ez|ons)|\xc3\xa9(?:es?|s))|\xaates))
+
+ bayes_stopword_de (?:a(?:ber|l(?:le[mnrs]?|so)|nder(?:(?:e[mnrs]?|[mnrs]))?|uch)|bist|d(?:a(?:mit|nn|ss(?:elbe)?|zu)|e(?:in(?:e[mnrs]?)?|mselben|n(?:selben|n)|r(?:er|selben?)|sse(?:lben|n))|i(?:ch|es(?:e(?:(?:lben?|[mnrs]))?)?)|o(?:ch|rt)|urch)|e(?:in(?:e[mnrs]?|ig(?:e[mnrs]?)?|mal)|twas|u(?:ch|er|re[mnrs]?))|ge(?:gen|wesen)|h(?:a(?:ben?|tten?)|i(?:er|nter))|i(?:h(?:nen|re[mnrs]?)|ndem)|je(?:de[mnrs]?|ne[mnrs]?|tzt)|k(?:ann|ein(?:e[mnrs]?)?|\xc3\xb6nn(?:en|te))|m(?:a(?:chen|nche[mnrs]?)|ein(?:e[mnrs]?)?|ich|uss(?:te)?)|n(?:ach|ichts?|och)|o(?:der|hne)|s(?:e(?:hr|in(?:e[mnrs]?)?|lbst)|i(?:ch|nd)|o(?:l(?:che[mnrs]?|l(?:te)?)|n(?:dern|st)))|un(?:ser(?:e[mns]?)?|ter)|viel|w(?:ar(?:en|st)|e(?:i(?:ter|l)|lche[mnrs]?|nn|rden?)|i(?:eder|ll|r(?:st|d))|oll(?:en|te)|\xc3(?:\xa4hrend|\xbcrden?))|zw(?:ar|ischen)|\xc3\xbcber)
+
+ bayes_stopword_it (?:a(?:bbia(?:(?:mo|no|te))?|gli|ll[aeo]|nche|v(?:e(?:mmo|ndo|s(?:s(?:e(?:ro)?|i(?:mo)?)|t[ei])|te|v(?:a(?:(?:mo|no|te))?|[io]))|r(?:a(?:nno|i)|e(?:bbe(?:ro)?|m(?:mo|o)|st[ei]|te|i)|\xc3[\xa0\xb2])|ut[aeio]))|co(?:me|ntro)|d(?:a(?:gli?|ll[aeo]?)|e(?:gli?|ll[aeo]?)|ove)|e(?:bb(?:e(?:ro)?|i)|ra(?:no|va(?:mo|te))|ssendo)|f(?:a(?:c(?:ci(?:a(?:(?:mo|no|te))?|o)|e(?:mmo|ndo|s(?:s(?:e(?:ro)?|i(?:mo)?)|t[ei])|v(?:a(?:(?:mo|no|te))?|[io])))|nno|r(?:a(?:nno|i)|e(?:bbe(?:ro)?|m(?:mo|o)|st[ei]|te|i)|\xc3[\xa0\xb2]))|ec(?:e(?:ro)?|i)|os(?:s(?:e(?:ro)?|i(?:mo)?)|t[ei])|u(?:mmo|rono))|hanno|loro|miei|n(?:e(?:gli?|ll[aeo]?)|ostr[aeio])|perch\xc3\xa9|qu(?:a(?:le|nt[aeio])|e(?:ll[aeio]|st[aeio]))|s(?:ar(?:a(?:nno|i)|e(?:bbe(?:ro)?|m(?:mo|o)|st[ei]|te|i)|\xc3[\xa0\xb2])|i(?:a(?:mo|no|te)|ete)|ono|t(?:a(?:n(?:do|no)|r(?:a(?:nno|i)|e(?:bbe(?:ro)?|m(?:mo|o)|st[ei]|te|i)|\xc3[\xa0\xb2])|v(?:a(?:(?:mo|no|te))?|[io])|i)|e(?:mmo|s(?:s(?:e(?:ro)?|i(?:mo)?)|t[ei])|tt(?:e(?:ro)?|i))|ia(?:(?:
mo|no|te))?)|u(?:gli?|ll[aeo]?|oi))|tu(?:oi|tt[io])|vostr[aeio])
+ endif
+endif