You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2018/11/05 15:57:29 UTC
svn commit: r1845819 -
/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm
Author: hege
Date: Mon Nov 5 15:57:29 2018
New Revision: 1845819
URL: http://svn.apache.org/viewvc?rev=1845819&view=rev
Log:
Better accuracy by stripping uris and emails from text, also ignore punctuations.
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm?rev=1845819&r1=1845818&r2=1845819&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm Mon Nov 5 15:57:29 2018
@@ -476,9 +476,11 @@ sub create_lm {
# Note that $$inputptr may or may not be in perl characters (utf8 flag set)
my $is_unicode = utf8::is_utf8($$inputptr);
- # my $non_word_characters = qr/[0-9\s]/;
- for my $word (split(/[0-9\s]+/, $$inputptr))
+ # "Split the text into separate tokens consisting only of letters and
+ # apostrophes. Digits and punctuation are discarded."
+ while ($$inputptr =~ /([^0-9\s\-!"#\$\%\&()*+,.\/:;<=>?\@\[\\\]\^_`{|}~]+)/gs)
{
+ my $word = $1;
# Bug 6229: Current TextCat database only works well with lowercase input
if ($is_unicode) {
# Unicode rules are used for the case change
@@ -492,8 +494,7 @@ sub create_lm {
$word = "\000" . $word . "\000";
my $len = length($word);
my $flen = $len;
- my $i;
- for ($i = 0; $i < $flen; $i++) {
+ for (my $i = 0; $i < $flen; $i++) {
$len--;
$ngram{substr($word, $i, 1)}++;
($len < 1) ? next : $ngram{substr($word, $i, 2)}++;
@@ -508,10 +509,10 @@ sub create_lm {
# up sorting by removing singletons, however I have very bad
# results for short inputs, this way
@sorted = sort { $ngram{$b} <=> $ngram{$a} }
- (grep { $ngram{$_} > $conf->{textcat_optimal_ngrams} } keys %ngram);
+ (grep { $ngram{$_} > $conf->{textcat_optimal_ngrams} } sort keys %ngram);
}
else {
- @sorted = sort { $ngram{$b} <=> $ngram{$a} } keys %ngram;
+ @sorted = sort { $ngram{$b} <=> $ngram{$a} } sort keys %ngram;
}
splice(@sorted, $conf->{textcat_max_ngrams}) if (@sorted > $conf->{textcat_max_ngrams});
@@ -531,6 +532,10 @@ sub extract_metadata {
$body = join("\n", @{$body});
$body =~ s/^Subject://i;
+ # Strip anything that looks like url or email, enhances results
+ $body =~ s{https?://\S+}{ }gs;
+ $body =~ s{\S+?\@[a-zA-Z]\S+}{ }gs;
+
my $len = length($body);
# truncate after 10k; that should be plenty to classify it
if ($len > 10000) {