You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by gb...@apache.org on 2020/11/02 18:14:47 UTC
svn commit: r1883069 - /spamassassin/branches/3.4/lib/Mail/SpamAssassin/Plugin/TextCat.pm

Author: gbechis
Date: Mon Nov  2 18:14:47 2020
New Revision: 1883069

URL: http://svn.apache.org/viewvc?rev=1883069&view=rev
Log:
backport TextCat improvements from trunk
fix bz #7866

Modified:
    spamassassin/branches/3.4/lib/Mail/SpamAssassin/Plugin/TextCat.pm

Modified: spamassassin/branches/3.4/lib/Mail/SpamAssassin/Plugin/TextCat.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.4/lib/Mail/SpamAssassin/Plugin/TextCat.pm?rev=1883069&r1=1883068&r2=1883069&view=diff
==============================================================================
--- spamassassin/branches/3.4/lib/Mail/SpamAssassin/Plugin/TextCat.pm (original)
+++ spamassassin/branches/3.4/lib/Mail/SpamAssassin/Plugin/TextCat.pm Mon Nov  2 18:14:47 2020
@@ -80,6 +80,7 @@ sub new {
   if (! @nm) {
     if (!defined $mailsaobject->{languages_filename}) {
       warn "textcat: languages filename not defined\n";
+      $self->{textcat_disabled} = 1;
     }
     else {
       load_models($mailsaobject->{languages_filename});
@@ -484,9 +485,11 @@ sub create_lm {
   # Note that $$inputptr may or may not be in perl characters (utf8 flag set)
   my $is_unicode = utf8::is_utf8($$inputptr);
 
-  # my $non_word_characters = qr/[0-9\s]/;
-  for my $word (split(/[0-9\s]+/, $$inputptr))
+  # "Split the text into separate tokens consisting only of letters and
+  # apostrophes. Digits and punctuation are discarded."
+  while ($$inputptr =~ /([^0-9\s\-!"#\$\%\&()*+,.\/:;<=>?\@\[\\\]\^_`{|}~]+)/gs)
   {
+    my $word = $1;
     # Bug 6229: Current TextCat database only works well with lowercase input
     if ($is_unicode) {
       # Unicode rules are used for the case change
@@ -500,8 +503,7 @@ sub create_lm {
     $word = "\000" . $word . "\000";
     my $len = length($word);
     my $flen = $len;
-    my $i;
-    for ($i = 0; $i < $flen; $i++) {
+    for (my $i = 0; $i < $flen; $i++) {
       $len--;
       $ngram{substr($word, $i, 1)}++;
       ($len < 1) ? next : $ngram{substr($word, $i, 2)}++;
@@ -516,10 +518,10 @@ sub create_lm {
     # up sorting by removing singletons, however I have very bad
     # results for short inputs, this way
     @sorted = sort { $ngram{$b} <=> $ngram{$a} }
-		   (grep { $ngram{$_} > $conf->{textcat_optimal_ngrams} } keys %ngram);
+      (grep { $ngram{$_} > $conf->{textcat_optimal_ngrams} } sort keys %ngram);
   }
   else {
-    @sorted = sort { $ngram{$b} <=> $ngram{$a} } keys %ngram;
+    @sorted = sort { $ngram{$b} <=> $ngram{$a} } sort keys %ngram;
   }
   splice(@sorted, $conf->{textcat_max_ngrams}) if (@sorted > $conf->{textcat_max_ngrams});
 
@@ -531,12 +533,18 @@ sub create_lm {
 sub extract_metadata {
   my ($self, $opts) = @_;
 
+  return if $self->{textcat_disabled};
+
   my $msg = $opts->{msg};
 
   my $body = $msg->get_rendered_body_text_array();
   $body = join("\n", @{$body});
   $body =~ s/^Subject://i;
 
+  # Strip anything that looks like url or email, enhances results
+  $body =~ s{https?://\S+}{ }gs;
+  $body =~ s{\S+?\@[a-zA-Z]\S+}{ }gs;
+
   my $len = length($body);
   # truncate after 10k; that should be plenty to classify it
   if ($len > 10000) {
@@ -572,6 +580,8 @@ sub extract_metadata {
 sub check_language {
   my ($self, $scan) = @_;
 
+  return 0 if $self->{textcat_disabled};
+
   my $msg = $scan->{msg};
 
   my @languages = split(/\s+/, $scan->{conf}->{ok_languages});
@@ -612,6 +622,8 @@ sub check_language {
 sub check_body_8bits {
   my ($self, $scan, $body) = @_;
 
+  return 0 if $self->{textcat_disabled};
+
   my @languages = split(/\s+/, $scan->{conf}->{ok_languages});
 
   for (@languages) {