You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by mm...@apache.org on 2015/10/08 20:11:19 UTC

svn commit: r1707595 - in /spamassassin/trunk/lib/Mail/SpamAssassin/Plugin: FreeMail.pm TextCat.pm VBounce.pm

Author: mmartinec
Date: Thu Oct  8 18:11:18 2015
New Revision: 1707595

URL: http://svn.apache.org/viewvc?rev=1707595&view=rev
Log:
Get plugins FreeMail, TextCat and VBounce ready to deal with Perl character strings (utf8 flag set) if they happen to reach them in a mail body

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/FreeMail.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/VBounce.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/FreeMail.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/FreeMail.pm?rev=1707595&r1=1707594&r2=1707595&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/FreeMail.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/FreeMail.pm Thu Oct  8 18:11:18 2015
@@ -348,14 +348,15 @@ sub _parse_body {
             s{<?(?<!mailto:)$self->{email_regex}(?:>|\s{1,10}(?!(?:fa(?:x|csi)|tel|phone|e?-?mail))[a-z]{2,11}:)}{ }gi;
             while (/$self->{email_regex}/g) {
                 my $email = lc($1);
-                push(@body_emails, $email) unless defined $seen{$email};
+                utf8::encode($email) if utf8::is_utf8($email); # chars to UTF-8
+                push(@body_emails, $email) unless $seen{$email};
                 $seen{$email} = 1;
                 last BODY if @body_emails >= 40; # sanity
             }
         }
         my $count_all = 0;
         my $count_fm = 0;
-        foreach my $email (@body_emails) {
+        foreach my $email (@body_emails) {  # as UTF-8 octets
             if (++$count_all == $pms->{main}->{conf}->{freemail_max_body_emails}) {
                 if ($pms->{main}->{conf}->{freemail_skip_when_over_max}) {
                     $pms->{freemail_skip_body} = 1;

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm?rev=1707595&r1=1707594&r2=1707595&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm Thu Oct  8 18:11:18 2015
@@ -473,14 +473,22 @@ sub create_lm {
   my %ngram;
   my @sorted;
 
+  # Note that $$inputptr may or may not be in perl characters (utf8 flag set)
+  my $is_unicode = utf8::is_utf8($$inputptr);
+
   # my $non_word_characters = qr/[0-9\s]/;
-  for my $word (split(/[0-9\s]+/, ${$_[0]}))
+  for my $word (split(/[0-9\s]+/, $$inputptr))
   {
-    # Bug 6229: Current TextCat database only works well with
-    # lowercase input, lets work around it until it's properly
-    # generated and/or locale issues are resolved..
-    $word =~ tr/A-Z\xc0-\xd6\xd8-\xde/a-z\xe0-\xf6\xf8-\xfe/
-        if $word =~ /[A-Z]/ && $word =~ /[a-zA-Z\xc0-\xd6\xd8-\xde\xe0-\xf6\xf8-\xfe]{4}/;
+    # Bug 6229: Current TextCat database only works well with lowercase input
+    if ($is_unicode) {
+      # Unicode rules are used for the case change
+      $word = lc $word  if $word =~ /\w{4}/;
+      utf8::encode($word);  # encode Unicode characters to UTF-8 octets
+    } elsif ($word =~ /[A-Z]/ &&
+             $word =~ /[a-zA-Z\xc0-\xd6\xd8-\xde\xe0-\xf6\xf8-\xfe]{4}/) {
+      # assume ISO 8859-1 / Windows-1252
+      $word =~ tr/A-Z\xc0-\xd6\xd8-\xde/a-z\xe0-\xf6\xf8-\xfe/;
+    }
     $word = "\000" . $word . "\000";
     my $len = length($word);
     my $flen = $len;

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/VBounce.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/VBounce.pm?rev=1707595&r1=1707594&r2=1707595&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/VBounce.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/VBounce.pm Thu Oct  8 18:11:18 2015
@@ -162,6 +162,7 @@ sub _relay_is_in_whitelist_bounce_relays
 sub _relay_is_in_list {
   my ($self, $list, $pms, $relay) = @_;
   $relay = lc $relay;
+  utf8::encode($relay) if utf8::is_utf8($relay);  # encode chars to UTF-8
 
   if (defined $list->{$relay}) { return 1; }