You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by mm...@apache.org on 2015/10/08 20:11:19 UTC
svn commit: r1707595 - in /spamassassin/trunk/lib/Mail/SpamAssassin/Plugin:
FreeMail.pm TextCat.pm VBounce.pm
Author: mmartinec
Date: Thu Oct 8 18:11:18 2015
New Revision: 1707595
URL: http://svn.apache.org/viewvc?rev=1707595&view=rev
Log:
Get the FreeMail, TextCat and VBounce plugins ready to deal with Perl characters (strings with the utf8 flag set) if they happen to reach them in a mail body.
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/FreeMail.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/VBounce.pm
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/FreeMail.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/FreeMail.pm?rev=1707595&r1=1707594&r2=1707595&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/FreeMail.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/FreeMail.pm Thu Oct 8 18:11:18 2015
@@ -348,14 +348,15 @@ sub _parse_body {
s{<?(?<!mailto:)$self->{email_regex}(?:>|\s{1,10}(?!(?:fa(?:x|csi)|tel|phone|e?-?mail))[a-z]{2,11}:)}{ }gi;
while (/$self->{email_regex}/g) {
my $email = lc($1);
- push(@body_emails, $email) unless defined $seen{$email};
+ utf8::encode($email) if utf8::is_utf8($email); # chars to UTF-8
+ push(@body_emails, $email) unless $seen{$email};
$seen{$email} = 1;
last BODY if @body_emails >= 40; # sanity
}
}
my $count_all = 0;
my $count_fm = 0;
- foreach my $email (@body_emails) {
+ foreach my $email (@body_emails) { # as UTF-8 octets
if (++$count_all == $pms->{main}->{conf}->{freemail_max_body_emails}) {
if ($pms->{main}->{conf}->{freemail_skip_when_over_max}) {
$pms->{freemail_skip_body} = 1;
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm?rev=1707595&r1=1707594&r2=1707595&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/TextCat.pm Thu Oct 8 18:11:18 2015
@@ -473,14 +473,22 @@ sub create_lm {
my %ngram;
my @sorted;
+ # Note that $$inputptr may or may not be in perl characters (utf8 flag set)
+ my $is_unicode = utf8::is_utf8($$inputptr);
+
# my $non_word_characters = qr/[0-9\s]/;
- for my $word (split(/[0-9\s]+/, ${$_[0]}))
+ for my $word (split(/[0-9\s]+/, $$inputptr))
{
- # Bug 6229: Current TextCat database only works well with
- # lowercase input, lets work around it until it's properly
- # generated and/or locale issues are resolved..
- $word =~ tr/A-Z\xc0-\xd6\xd8-\xde/a-z\xe0-\xf6\xf8-\xfe/
- if $word =~ /[A-Z]/ && $word =~ /[a-zA-Z\xc0-\xd6\xd8-\xde\xe0-\xf6\xf8-\xfe]{4}/;
+ # Bug 6229: Current TextCat database only works well with lowercase input
+ if ($is_unicode) {
+ # Unicode rules are used for the case change
+ $word = lc $word if $word =~ /\w{4}/;
+ utf8::encode($word); # encode Unicode characters to UTF-8 octets
+ } elsif ($word =~ /[A-Z]/ &&
+ $word =~ /[a-zA-Z\xc0-\xd6\xd8-\xde\xe0-\xf6\xf8-\xfe]{4}/) {
+ # assume ISO 8859-1 / Windows-1252
+ $word =~ tr/A-Z\xc0-\xd6\xd8-\xde/a-z\xe0-\xf6\xf8-\xfe/;
+ }
$word = "\000" . $word . "\000";
my $len = length($word);
my $flen = $len;
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/VBounce.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/VBounce.pm?rev=1707595&r1=1707594&r2=1707595&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/VBounce.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/VBounce.pm Thu Oct 8 18:11:18 2015
@@ -162,6 +162,7 @@ sub _relay_is_in_whitelist_bounce_relays
sub _relay_is_in_list {
my ($self, $list, $pms, $relay) = @_;
$relay = lc $relay;
+ utf8::encode($relay) if utf8::is_utf8($relay); # encode chars to UTF-8
if (defined $list->{$relay}) { return 1; }