You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2021/04/13 10:27:19 UTC
svn commit: r1888719 -
/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm
Author: hege
Date: Tue Apr 13 10:27:19 2021
New Revision: 1888719
URL: http://svn.apache.org/viewvc?rev=1888719&view=rev
Log:
Stopword fixes and cleanups
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm?rev=1888719&r1=1888718&r2=1888719&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Bayes.pm Tue Apr 13 10:27:19 2021
@@ -43,16 +43,24 @@ The results are incorporated into SpamAs
=item bayes_stopword_languages lang (default: en)
-Languages enabled in bayes stopwords processing, every language have a default stopwords regexp,
-tokens matching this regular expressions will not be considered in bayes processing.
+Languages enabled in bayes stopwords processing. Every language has a
+default stopwords regexp; tokens matching this regular expression will not
+be considered in bayes processing.
Custom regular expressions for additional languages can be defined in C<local.cf>.
-Custom regular expressions can be specified by using the C<bayes_stopword_lang> keyword like in
-the following example:
+Custom regular expressions can be specified by using the C<bayes_stopword_xx>
+keyword (C<xx> being the two-letter language code), as in the following example:
- bayes_stopword_languages en
+ bayes_stopword_languages en se
bayes_stopword_en (?:you|me)
+ bayes_stopword_se (?:du|mig)
+
+Regexps will be anchored automatically at beginning and end.
+
+Only one bayes_stopword_languages or bayes_stopword_xx configuration line
+is effective at a time. A new configuration line overrides any earlier one,
+for example those from the SpamAssassin default ruleset (60_bayes_stopwords.cf).
=back
@@ -259,66 +267,93 @@ sub new {
$self->{conf} = $main->{conf};
$self->{use_ignores} = 1;
+ # Old default stopword list; a hardcoded one is needed in case sa-update is not available
+ $self->{bayes_stopword}{en} = qr/(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))/;
+
$self->set_config($self->{conf});
$self->register_eval_rule("check_bayes", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
$self;
}
sub set_config {
- my ($self, $conf) = @_;
- my @cmds;
- my $invalid_lang = 0;
- my ($re, $def_lang);
-
- push(@cmds, {
- setting => 'bayes_max_token_length',
- default => MAX_TOKEN_LENGTH,
- type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
- });
-
- push(@cmds, {
- setting => 'bayes_stopword_languages',
- type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
- is_admin => 1,
- default => 'en',
- code => sub {
- my ($self, $key, $value, $line) = @_;
- my @lng = split(/,/, $value);
- foreach my $lang ( @lng ) {
- dbg("bayes: stopwords for language $lang enabled");
- if ($lang !~ /^([a-z]{2})$/) {
- $invalid_lang = 1;
- }
- }
- return $Mail::SpamAssassin::Conf::INVALID_VALUE unless $invalid_lang eq 0;
- $self->{bayes_stopword_languages} = $value;
+ my ($self, $conf) = @_;
+ my @cmds;
+
+ push(@cmds, {
+ setting => 'bayes_max_token_length',
+ default => MAX_TOKEN_LENGTH,
+ is_admin => 1,
+ type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
+ });
+
+ push(@cmds, {
+ setting => 'bayes_stopword_languages',
+ default => ['en'],
+ is_admin => 1,
+ type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRINGLIST,
+ code => sub {
+ my ($self, $key, $value, $line) = @_;
+ my @langs;
+ foreach my $lang (split(/(?:\s*,\s*|\s+)/, lc($value))) {
+ if ($lang !~ /^([a-z]{2})$/) {
+ return $Mail::SpamAssassin::Conf::INVALID_VALUE;
}
- });
- $conf->{parser}->register_commands(\@cmds);
+ push @langs, $lang;
+ }
+ if (!@langs) {
+ return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
+ }
+ @{$self->{bayes_stopword_languages}} = @langs;
+ }
+ });
+
+ $conf->{parser}->register_commands(\@cmds);
}
sub parse_config {
- my ($self, $opt) = @_;
- my $languages = $self->{conf}->{bayes_stopword_languages};
+ my ($self, $opts) = @_;
- if ($opt->{key} =~ /^bayes_stopword_([a-z]{2})$/i) {
- $self->inhibit_further_callbacks();
+ # Ignore users' configuration lines
+ return 0 if $opts->{user_config};
- my $lang = lc($1);
- my @opts = split(/\s+/, $opt->{value});
- foreach my $re (@opts)
- {
- my ($rec, $err) = compile_regexp($re, 0);
- if (!$rec) {
- warn "bayes: invalid regex for language $lang: $@\n";
- return 0;
- }
- # dbg("bayes: setting regexp for language $lang");
- $self->{conf}->{bayes_stopword}{$lang} = $rec
+ if ($opts->{key} =~ /^bayes_stopword_([a-z]{2})$/i) {
+ $self->inhibit_further_callbacks();
+ my $lang = lc($1);
+ foreach my $re (split(/\s+/, $opts->{value})) {
+ my ($rec, $err) = compile_regexp($re, 0);
+ if (!$rec) {
+ warn "bayes: invalid regexp for $opts->{key}: $err\n";
+ return 0;
}
- return 1;
+ $self->{bayes_stopword}{$lang} = $rec;
+ }
+ return 1;
+ }
+
+ return 0;
+}
+
+sub finish_parsing_end {
+ my ($self, $opts) = @_;
+ my $conf = $opts->{conf};
+
+ my @langs;
+ foreach my $lang (@{$conf->{bayes_stopword_languages}}) {
+ if (defined $self->{bayes_stopword}{$lang}) {
+ push @langs, $lang;
+ } else {
+ warn "bayes: missing stopwords regexp for language '$lang'\n";
}
- return 0;
+ }
+ if (@langs) {
+ dbg("bayes: stopwords for languages enabled: ".join(' ', @langs));
+ @{$conf->{bayes_stopword_languages}} = @langs;
+ } else {
+ dbg("bayes: no stopword languages enabled");
+ $conf->{bayes_stopword_languages} = [];
+ }
+
+ return 0;
}
sub finish {
@@ -753,7 +788,6 @@ sub learner_is_scan_available {
sub scan {
my ($self, $permsgstatus, $msg) = @_;
- my $score;
return unless $self->{conf}->{use_learner};
@@ -838,6 +872,7 @@ sub scan {
if (@pw_keys > N_SIGNIFICANT_TOKENS) { $#pw_keys = N_SIGNIFICANT_TOKENS - 1 }
my @sorted;
+ my $score;
foreach my $tok (@pw_keys) {
next if $tok_strength{$tok} <
$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
@@ -1091,8 +1126,10 @@ sub _get_msgdata_from_permsgstatus {
# The calling functions expect a uniq'ed array of tokens ...
sub tokenize {
my ($self, $msg, $msgdata) = @_;
+ my $conf = $self->{conf};
+ my $t_src = $conf->{bayes_token_sources};
- my $t_src = $self->{conf}->{bayes_token_sources};
+ $self->{stopword_cache} = ();
# visible tokens from the body
my @tokens_body;
@@ -1156,6 +1193,8 @@ sub tokenize {
dbg("bayes: tokenized header: %d tokens", scalar @tokens_header);
}
+ delete $self->{stopword_cache};
+
# Go ahead and uniq the array, skip null tokens (can happen sometimes)
# generate an SHA1 hash and take the lower 40 bits as our token
my %tokens;
@@ -1165,7 +1204,6 @@ sub tokenize {
# dbg("bayes: token: %s", $token);
$tokens{substr(sha1($token), -5)} = $token if $token ne '';
}
- undef $self->{tokens};
# return the keys == tokens ...
return \%tokens;
@@ -1177,6 +1215,7 @@ sub _tokenize_line {
my $region = $_[3];
local ($_) = $_[1];
+ my $conf = $self->{conf};
my @rettokens;
# include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings,
@@ -1233,22 +1272,20 @@ sub _tokenize_line {
# See http://wiki.apache.org/spamassassin/BayesStopList for more info.
#
next if $len < 3;
- foreach my $lang ( split /,/, $self->{conf}->{bayes_stopword_languages} ) {
- if ( not defined $self->{conf}->{bayes_stopword}{$lang} ) {
- dbg("Missing stopwords regexp for language $lang");
- next;
- }
- # check regexp only once
- next if(exists $self->{tokens}{$lang}{$token});
- $self->{tokens}{$lang}{$token} = 1;
- # dbg("bayes: using stopwords for language $lang");
- if ($token =~ /^$self->{conf}->{bayes_stopword}{$lang}$/i) {
- dbg("bayes: skipped token \"$token\" because it's in stopword list for language \"$lang\"");
- next;
- } else {
- # XXX for debugging purposes
- # dbg("bayes: using token \"$token\" not matching regexp \"$self->{conf}->{bayes_stopword}{$lang}\" for language \"$lang\"");
+
+ # check stopwords regexp if not cached
+ if (!exists $self->{stopword_cache}{$token}) {
+ foreach my $lang (@{$conf->{bayes_stopword_languages}}) {
+ if ($token =~ /^$self->{bayes_stopword}{$lang}$/i) {
+ dbg("bayes: skipped token '$token' because it's in stopword list for language '$lang'");
+ $self->{stopword_cache}{$token} = 1;
+ next;
+ }
}
+ $self->{stopword_cache}{$token} = 0;
+ } else {
+ # bail out if the token is cached as a known stopword
+ next if $self->{stopword_cache}{$token};
}
# are we in the body? If so, apply some body-specific breakouts
@@ -1268,7 +1305,7 @@ sub _tokenize_line {
# used as part of split tokens such as "HTo:D*net" indicating that
# the domain ".net" appeared in the To header.
#
- if ($len > $self->{main}->{conf}->{bayes_max_token_length} && $token !~ /\*/) {
+ if ($len > $conf->{bayes_max_token_length} && index($token, '*') == -1) {
if (TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS && $token =~ /[\x80-\xBF]{2}/) {
# Bug 7135