You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by gb...@apache.org on 2020/11/02 18:14:47 UTC
svn commit: r1883069 -
/spamassassin/branches/3.4/lib/Mail/SpamAssassin/Plugin/TextCat.pm
Author: gbechis
Date: Mon Nov 2 18:14:47 2020
New Revision: 1883069
URL: http://svn.apache.org/viewvc?rev=1883069&view=rev
Log:
backport TextCat improvements from trunk
fix bz #7866
Modified:
spamassassin/branches/3.4/lib/Mail/SpamAssassin/Plugin/TextCat.pm
Modified: spamassassin/branches/3.4/lib/Mail/SpamAssassin/Plugin/TextCat.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.4/lib/Mail/SpamAssassin/Plugin/TextCat.pm?rev=1883069&r1=1883068&r2=1883069&view=diff
==============================================================================
--- spamassassin/branches/3.4/lib/Mail/SpamAssassin/Plugin/TextCat.pm (original)
+++ spamassassin/branches/3.4/lib/Mail/SpamAssassin/Plugin/TextCat.pm Mon Nov 2 18:14:47 2020
@@ -80,6 +80,7 @@ sub new {
if (! @nm) {
if (!defined $mailsaobject->{languages_filename}) {
warn "textcat: languages filename not defined\n";
+ $self->{textcat_disabled} = 1;
}
else {
load_models($mailsaobject->{languages_filename});
@@ -484,9 +485,11 @@ sub create_lm {
# Note that $$inputptr may or may not be in perl characters (utf8 flag set)
my $is_unicode = utf8::is_utf8($$inputptr);
- # my $non_word_characters = qr/[0-9\s]/;
- for my $word (split(/[0-9\s]+/, $$inputptr))
+ # "Split the text into separate tokens consisting only of letters and
+ # apostrophes. Digits and punctuation are discarded."
+ while ($$inputptr =~ /([^0-9\s\-!"#\$\%\&()*+,.\/:;<=>?\@\[\\\]\^_`{|}~]+)/gs)
{
+ my $word = $1;
# Bug 6229: Current TextCat database only works well with lowercase input
if ($is_unicode) {
# Unicode rules are used for the case change
@@ -500,8 +503,7 @@ sub create_lm {
$word = "\000" . $word . "\000";
my $len = length($word);
my $flen = $len;
- my $i;
- for ($i = 0; $i < $flen; $i++) {
+ for (my $i = 0; $i < $flen; $i++) {
$len--;
$ngram{substr($word, $i, 1)}++;
($len < 1) ? next : $ngram{substr($word, $i, 2)}++;
@@ -516,10 +518,10 @@ sub create_lm {
# up sorting by removing singletons, however I have very bad
# results for short inputs, this way
@sorted = sort { $ngram{$b} <=> $ngram{$a} }
- (grep { $ngram{$_} > $conf->{textcat_optimal_ngrams} } keys %ngram);
+ (grep { $ngram{$_} > $conf->{textcat_optimal_ngrams} } sort keys %ngram);
}
else {
- @sorted = sort { $ngram{$b} <=> $ngram{$a} } keys %ngram;
+ @sorted = sort { $ngram{$b} <=> $ngram{$a} } sort keys %ngram;
}
splice(@sorted, $conf->{textcat_max_ngrams}) if (@sorted > $conf->{textcat_max_ngrams});
@@ -531,12 +533,18 @@ sub create_lm {
sub extract_metadata {
my ($self, $opts) = @_;
+ return if $self->{textcat_disabled};
+
my $msg = $opts->{msg};
my $body = $msg->get_rendered_body_text_array();
$body = join("\n", @{$body});
$body =~ s/^Subject://i;
+ # Strip anything that looks like url or email, enhances results
+ $body =~ s{https?://\S+}{ }gs;
+ $body =~ s{\S+?\@[a-zA-Z]\S+}{ }gs;
+
my $len = length($body);
# truncate after 10k; that should be plenty to classify it
if ($len > 10000) {
@@ -572,6 +580,8 @@ sub extract_metadata {
sub check_language {
my ($self, $scan) = @_;
+ return 0 if $self->{textcat_disabled};
+
my $msg = $scan->{msg};
my @languages = split(/\s+/, $scan->{conf}->{ok_languages});
@@ -612,6 +622,8 @@ sub check_language {
sub check_body_8bits {
my ($self, $scan, $body) = @_;
+ return 0 if $self->{textcat_disabled};
+
my @languages = split(/\s+/, $scan->{conf}->{ok_languages});
for (@languages) {