You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2018/10/16 15:32:41 UTC
svn commit: r1844023 - in /spamassassin/trunk: MANIFEST UPGRADE
lib/Mail/SpamAssassin/Plugin/Pyzor.pm
Author: hege
Date: Tue Oct 16 15:32:41 2018
New Revision: 1844023
URL: http://svn.apache.org/viewvc?rev=1844023&view=rev
Log:
Added pyzor_fork, renamed pyzor_max to pyzor_count_min, added pyzor_whitelist_min/pyzor_whitelist_factor.
Modified:
spamassassin/trunk/MANIFEST
spamassassin/trunk/UPGRADE
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
Modified: spamassassin/trunk/MANIFEST
URL: http://svn.apache.org/viewvc/spamassassin/trunk/MANIFEST?rev=1844023&r1=1844022&r2=1844023&view=diff
==============================================================================
--- spamassassin/trunk/MANIFEST (original)
+++ spamassassin/trunk/MANIFEST Tue Oct 16 15:32:41 2018
@@ -386,6 +386,7 @@ t/data/spam/bsmtpnull
t/data/spam/dnsbl.eml
t/data/spam/gtube.eml
t/data/spam/gtubedcc.eml
+t/data/spam/pyzor
t/data/spam/razor2
t/data/spam/relayUS.eml
t/data/spam/urilocalbl_net.eml
@@ -466,6 +467,7 @@ t/plugin_file.t
t/plugin_priorities.t
t/prefs_include.t
t/priorities.t
+t/pyzor.t
t/razor2.t
t/rcvd_parser.t
t/re_base_extraction.t
Modified: spamassassin/trunk/UPGRADE
URL: http://svn.apache.org/viewvc/spamassassin/trunk/UPGRADE?rev=1844023&r1=1844022&r2=1844023&view=diff
==============================================================================
--- spamassassin/trunk/UPGRADE (original)
+++ spamassassin/trunk/UPGRADE Tue Oct 16 15:32:41 2018
@@ -37,6 +37,10 @@ Note for Users Upgrading to SpamAssassin
Commercial reputation rules can be ignored with use_dcc_rep 0, to save few
CPU cycles.
+- Pyzor pyzor_fork option added. It will fork separate Pyzor process and
+ read in the results later asynchronously, increasing throughput.
+ Renamed pyzor_max setting to pyzor_count_min. Added pyzor_whitelist_min
+ and pyzor_whitelist_factor setting. Also try to ignore "empty body" FPs.
Note for Users Upgrading to SpamAssassin 3.4.2
----------------------------------------------
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Pyzor.pm?rev=1844023&r1=1844022&r2=1844023&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Pyzor.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Pyzor.pm Tue Oct 16 15:32:41 2018
@@ -44,6 +44,9 @@ use warnings;
# use bytes;
use re 'taint';
+use Storable;
+use POSIX qw(PIPE_BUF WNOHANG _exit);
+
our @ISA = qw(Mail::SpamAssassin::Plugin);
sub new {
@@ -91,7 +94,21 @@ Whether to use Pyzor, if it is available
type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL
});
-=item pyzor_max NUMBER (default: 5)
+=item pyzor_fork (0|1) (default: 0)
+
+Instead of running Pyzor synchronously, fork separate process for it and
+read the results in later (similar to async DNS lookups). Increases
+throughput. Experimental.
+
+=cut
+
+ push(@cmds, {
+ setting => 'pyzor_fork',
+ default => 0,
+ type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
+ });
+
+=item pyzor_count_min NUMBER (default: 5)
This option sets how often a message's body checksum must have been
reported to the Pyzor server before SpamAssassin will consider the Pyzor
@@ -103,11 +120,52 @@ set this to a relatively low value, e.g.
=cut
push (@cmds, {
- setting => 'pyzor_max',
+ setting => 'pyzor_count_min',
default => 5,
type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
});
+ # Deprecated setting, the name makes no sense!
+ push (@cmds, {
+ setting => 'pyzor_max',
+ type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
+ code => sub {
+ my ($self, $key, $value, $line) = @_;
+ warn("deprecated setting used, change pyzor_max to pyzor_count_min\n");
+ if ($value !~ /^\d+$/) {
+ return $Mail::SpamAssassin::Conf::INVALID_VALUE;
+ }
+ $self->{pyzor_count_min} = $value;
+ }
+ });
+
+=item pyzor_whitelist_min NUMBER (default: 10)
+
+This option sets how often a message's body checksum must have been
+whitelisted to the Pyzor server for SpamAssassin to consider ignoring the
+result. Final decision is made by pyzor_whitelist_factor.
+
+=cut
+
+ push (@cmds, {
+ setting => 'pyzor_whitelist_min',
+ default => 10,
+ type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
+ });
+
+=item pyzor_whitelist_factor NUMBER (default: 0.2)
+
+Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min.
+For default setting this means: 50 reports requires 10 whitelistings.
+
+=cut
+
+ push (@cmds, {
+ setting => 'pyzor_whitelist_factor',
+ default => 0.2,
+ type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
+ });
+
=back
=head1 ADMINISTRATOR OPTIONS
@@ -202,164 +260,295 @@ you should use this, as the current PATH
sub is_pyzor_available {
my ($self) = @_;
- my $pyzor = $self->{main}->{conf}->{pyzor_path} || '';
- unless ($pyzor) {
- $pyzor = Mail::SpamAssassin::Util::find_executable_in_env_path('pyzor');
- }
+ my $pyzor = $self->{main}->{conf}->{pyzor_path} ||
+ Mail::SpamAssassin::Util::find_executable_in_env_path('pyzor');
+
unless ($pyzor && -x $pyzor) {
- dbg("pyzor: pyzor is not available: no pyzor executable found");
+ dbg("pyzor: no pyzor executable found");
+ $self->{pyzor_available} = 0;
return 0;
}
# remember any found pyzor
$self->{main}->{conf}->{pyzor_path} = $pyzor;
- dbg("pyzor: pyzor is available: " . $self->{main}->{conf}->{pyzor_path});
+ dbg("pyzor: pyzor is available: $pyzor");
return 1;
}
-sub get_pyzor_interface {
- my ($self) = @_;
+sub finish_parsing_start {
+ my ($self, $opts) = @_;
- if (!$self->{main}->{conf}->{use_pyzor}) {
- dbg("pyzor: use_pyzor option not enabled, disabling Pyzor");
- $self->{pyzor_interface} = "disabled";
- $self->{pyzor_available} = 0;
- }
- elsif ($self->is_pyzor_available()) {
- $self->{pyzor_interface} = "pyzor";
- $self->{pyzor_available} = 1;
- }
- else {
- dbg("pyzor: no pyzor found, disabling Pyzor");
- $self->{pyzor_available} = 0;
+ # If forking, hard adjust priority -100 to launch early
+ # Find rulenames from eval_to_rule mappings
+ if ($opts->{conf}->{pyzor_fork}) {
+ foreach (@{$opts->{conf}->{eval_to_rule}->{check_pyzor}}) {
+ if (exists $opts->{conf}->{tests}->{$_}) {
+ dbg("pyzor: adjusting rule $_ priority to -100");
+ $opts->{conf}->{priority}->{$_} = -100;
+ }
+ }
}
}
sub check_pyzor {
- my ($self, $permsgstatus, $full) = @_;
+ my ($self, $pms, $full) = @_;
- # initialize valid tags
- $permsgstatus->{tag_data}->{PYZOR} = "";
+ return 0 if !$self->{pyzor_available};
+ return 0 if !$self->{main}->{conf}->{use_pyzor};
+
+ return 0 if $pms->{pyzor_running};
+ $pms->{pyzor_running} = 1;
+
+ return 0 if !$self->is_pyzor_available();
my $timer = $self->{main}->time_method("check_pyzor");
- $self->get_pyzor_interface();
- return 0 unless $self->{pyzor_available};
+ # initialize valid tags
+ $pms->{tag_data}->{PYZOR} = '';
+
+ ## non-forking method
+
+ if (!$self->{main}->{conf}->{pyzor_fork}) {
+ my @results = $self->pyzor_lookup($pms, $full);
+ return $self->_check_result($pms, \@results);
+ }
+
+ ## forking method
- return $self->pyzor_lookup($permsgstatus, $full);
+ $pms->{pyzor_rulename} = $pms->get_current_eval_rule_name();
+
+ # create socketpair for communication
+ $pms->{pyzor_backchannel} = Mail::SpamAssassin::SubProcBackChannel->new();
+ eval {
+ $pms->{pyzor_backchannel}->setup_backchannel_parent_pre_fork();
+ } or do {
+ dbg("pyzor: backchannel pre-setup failed: $@");
+ delete $pms->{pyzor_backchannel};
+ return 0;
+ };
+
+ my $pid = fork();
+ if (!$pid) {
+ # Must be set so DESTROY blocks etc can ignore us when exiting
+ $ENV{'IS_FORKED_HELPER_PROCESS'} = 1;
+ dbg("pyzor: child process $$ forked");
+ $pms->{pyzor_backchannel}->setup_backchannel_child_post_fork();
+ my @results = $self->pyzor_lookup($pms, $full);
+ my $backmsg;
+ eval {
+ $backmsg = Storable::freeze(\@results);
+ };
+ if ($@) {
+ dbg("pyzor: child return value freeze failed: $@");
+ _exit(0); # prevent touching filehandles as per perlfork(1)
+ }
+ if (!syswrite($pms->{pyzor_backchannel}->{parent}, $backmsg)) {
+ dbg("pyzor: child backchannel write failed: $!");
+ }
+ _exit(0); # prevent touching filehandles as per perlfork(1)
+ }
+
+ eval {
+ $pms->{pyzor_backchannel}->setup_backchannel_parent_post_fork($pid);
+ } or do {
+ dbg("pyzor: backchannel post-setup failed: $@");
+ delete $pms->{pyzor_backchannel};
+ return 0;
+ };
+
+ return 0;
}
sub pyzor_lookup {
- my ($self, $permsgstatus, $fulltext) = @_;
- my @response;
- my $pyzor_count;
- my $pyzor_whitelisted;
- my $timeout = $self->{main}->{conf}->{pyzor_timeout};
+ my ($self, $pms, $fulltext) = @_;
- $pyzor_count = 0;
- $pyzor_whitelisted = 0;
- my $pid;
-
- # use a temp file here -- open2() is unreliable, buffering-wise, under spamd
- my $tmpf = $permsgstatus->create_fulltext_tmpfile($fulltext);
+ my $conf = $self->{main}->{conf};
+ my $timeout = $conf->{pyzor_timeout};
# note: not really tainted, this came from system configuration file
- my $path = untaint_file_path($self->{main}->{conf}->{pyzor_path});
- my $opts = untaint_var($self->{main}->{conf}->{pyzor_options}) || '';
+ my $path = untaint_file_path($conf->{pyzor_path});
+ my $opts = untaint_var($conf->{pyzor_options}) || '';
+
+ # use a temp file here -- open2() is unreliable, buffering-wise, under spamd
+ my $tmpf = $pms->create_fulltext_tmpfile($fulltext);
- $permsgstatus->enter_helper_run_mode();
+ $pms->enter_helper_run_mode();
+ my $pid;
+ my @resp;
my $timer = Mail::SpamAssassin::Timeout->new(
- { secs => $timeout, deadline => $permsgstatus->{master_deadline} });
+ { secs => $timeout, deadline => $pms->{master_deadline} });
my $err = $timer->run_and_catch(sub {
-
local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
-
- dbg("pyzor: opening pipe: " . join(' ', $path, $opts, "check", "< $tmpf"));
+
+ dbg("pyzor: opening pipe: " . join(' ', $path, $opts, "check", "<$tmpf"));
$pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
$tmpf, 1, $path, split(' ', $opts), "check");
$pid or die "$!\n";
# read+split avoids a Perl I/O bug (Bug 5985)
- my($inbuf,$nread,$resp); $resp = '';
- while ( $nread=read(PYZOR,$inbuf,8192) ) { $resp .= $inbuf }
+ my($inbuf, $nread);
+ my $resp = '';
+ while ($nread = read(PYZOR, $inbuf, 8192)) { $resp .= $inbuf }
defined $nread or die "error reading from pipe: $!";
- @response = split(/^/m, $resp, -1); undef $resp;
+ @resp = split(/^/m, $resp, -1);
- my $errno = 0; close PYZOR or $errno = $!;
- if (proc_status_ok($?,$errno)) {
+ my $errno = 0;
+ close PYZOR or $errno = $!;
+ if (proc_status_ok($?, $errno)) {
dbg("pyzor: [%s] finished successfully", $pid);
- } elsif (proc_status_ok($?,$errno, 0,1)) { # sometimes it exits with 1
- dbg("pyzor: [%s] finished: %s", $pid, exit_status_str($?,$errno));
+ } elsif (proc_status_ok($?, $errno, 0, 1)) { # sometimes it exits with 1
+ dbg("pyzor: [%s] finished: %s", $pid, exit_status_str($?, $errno));
} else {
- info("pyzor: [%s] error: %s", $pid, exit_status_str($?,$errno));
- }
-
- if (!@response) {
- # this exact string is needed below
- warn("no response\n"); # yes, this is possible
- }
- chomp for @response;
- dbg("pyzor: got response: " . join("\\n", @response));
-
- if ($response[0] =~ /^Traceback/) {
- warn("internal error, python traceback seen in response\n");
+ info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
}
});
if (defined(fileno(*PYZOR))) { # still open
if ($pid) {
- if (kill('TERM',$pid)) { dbg("pyzor: killed stale helper [$pid]") }
- else { dbg("pyzor: killing helper application [$pid] failed: $!") }
+ if (kill('TERM', $pid)) {
+ dbg("pyzor: killed stale helper [$pid]");
+ } else {
+ dbg("pyzor: killing helper application [$pid] failed: $!");
+ }
}
- my $errno = 0; close PYZOR or $errno = $!;
- proc_status_ok($?,$errno)
- or info("pyzor: [%s] error: %s", $pid, exit_status_str($?,$errno));
+ my $errno = 0;
+ close PYZOR or $errno = $!;
+ proc_status_ok($?, $errno)
+ or info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
}
- $permsgstatus->delete_fulltext_tmpfile();
+ $pms->delete_fulltext_tmpfile();
- $permsgstatus->leave_helper_run_mode();
+ $pms->leave_helper_run_mode();
if ($timer->timed_out()) {
dbg("pyzor: check timed out after $timeout seconds");
+ return ();
+ } elsif ($err) {
+ chomp $err;
+ info("pyzor: check failed: $err");
+ return ();
+ }
+
+ return @resp;
+}
+
+sub check_tick {
+ my ($self, $opts) = @_;
+ $self->_check_forked_result($opts->{permsgstatus}, 0);
+}
+
+sub check_cleanup {
+ my ($self, $opts) = @_;
+ $self->_check_forked_result($opts->{permsgstatus}, 1);
+}
+
+sub _check_forked_result {
+ my ($self, $pms, $finish) = @_;
+
+ return 0 if !$pms->{pyzor_backchannel};
+
+ my $timer = $self->{main}->time_method("check_pyzor");
+
+ my $kid_pid = (keys %{$pms->{pyzor_backchannel}->{kids}})[0];
+ # if $finish, force waiting for the child
+ my $pid = waitpid($kid_pid, $finish ? 0 : WNOHANG);
+ if ($pid == 0) {
+ #dbg("pyzor: child process $kid_pid not finished yet, trying later");
+ return 0;
+ } elsif ($pid == -1) {
+ # child does not exist?
+ dbg("pyzor: child process $kid_pid already handled?");
+ delete $pms->{pyzor_backchannel};
return 0;
}
- if ($err) {
- chomp $err;
- if ($err eq "__brokenpipe__ignore__") {
- dbg("pyzor: check failed: broken pipe");
- } elsif ($err eq "no response") {
- dbg("pyzor: check failed: no response");
- } else {
- warn("pyzor: check failed: $err\n");
- }
+ dbg("pyzor: child process $kid_pid finished, reading results");
+
+ my $backmsg;
+ my $ret = sysread($pms->{pyzor_backchannel}->{latest_kid_fh}, $backmsg, PIPE_BUF);
+ if (!defined $ret || $ret == 0) {
+ dbg("pyzor: could not read result from child: ".($ret == 0 ? 0 : $!));
+ delete $pms->{pyzor_backchannel};
return 0;
}
- foreach my $one_response (@response) {
+ delete $pms->{pyzor_backchannel};
+
+ my $results;
+ eval {
+ $results = Storable::thaw($backmsg);
+ };
+ if ($@) {
+ dbg("pyzor: child return value thaw failed: $@");
+ return;
+ }
+
+ $self->_check_result($pms, $results);
+}
+
+sub _check_result {
+ my ($self, $pms, $results) = @_;
+
+ if (!@$results) {
+ dbg("pyzor: no response from server");
+ return 0;
+ }
+
+ my $count = 0;
+ my $count_wl = 0;
+ foreach my $res (@$results) {
+ chomp($res);
+ dbg("pyzor: got response: $res");
+ if ($res =~ /^Traceback/) {
+ info("pyzor: internal error, python traceback seen in response");
+ return 0;
+ }
# this regexp is intended to be a little bit forgiving
- if ($one_response =~ /^\S+\t.*?\t(\d+)\t(\d+)\s*$/) {
+ if ($res =~ /^\S+\t.*?\t(\d+)\t(\d+)\s*$/) {
# until pyzor servers can sync their DBs,
# sum counts obtained from all servers
- $pyzor_whitelisted += $2+0;
- $pyzor_count += $1+0;
- }
- else {
+ $count += untaint_var($1)+0; # crazy but needs untainting
+ $count_wl += untaint_var($2)+0;
+ } else {
# warn on failures to parse
- dbg("pyzor: failure to parse response \"$one_response\"");
+ info("pyzor: failure to parse response \"$res\"");
}
}
- $permsgstatus->set_tag('PYZOR', $pyzor_whitelisted ? "Whitelisted."
- : "Reported $pyzor_count times.");
+ my $conf = $self->{main}->{conf};
+
+ my $count_min = $conf->{pyzor_count_min};
+ my $wl_min = $conf->{pyzor_whitelist_min};
+
+ my $wl_limit = $count_wl >= $wl_min ?
+ $count * $conf->{pyzor_whitelist_factor} : 0;
+
+ dbg("pyzor: result: COUNT=$count/$count_min WHITELIST=$count_wl/$wl_min/%.1f",
+ $wl_limit);
+ $pms->set_tag('PYZOR', "Reported $count times, whitelisted $count_wl times.");
- if ($pyzor_count >= $self->{main}->{conf}->{pyzor_max}) {
- dbg("pyzor: listed: COUNT=$pyzor_count/$self->{main}->{conf}->{pyzor_max} WHITELIST=$pyzor_whitelisted");
+ # Empty body etc results in same hash, we should skip very large numbers..
+ if ($count >= 1000000 || $count_wl >= 10000) {
+ dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl 1000000/10000");
+ return 0;
+ }
+
+ # Whitelisted?
+ if ($wl_limit && $count_wl >= $wl_limit) {
+ dbg("pyzor: message whitelisted");
+ return 0;
+ }
+
+ if ($count >= $count_min) {
+ if ($conf->{pyzor_fork}) {
+ # forked needs to run got_hit()
+ $pms->got_hit($pms->{pyzor_rulename}, "", ruletype => 'eval');
+ }
return 1;
}
@@ -369,23 +558,24 @@ sub pyzor_lookup {
sub plugin_report {
my ($self, $options) = @_;
- return unless $self->{pyzor_available};
- return unless $self->{main}->{conf}->{use_pyzor};
-
- if (!$options->{report}->{options}->{dont_report_to_pyzor} && $self->is_pyzor_available())
- {
- # use temporary file: open2() is unreliable due to buffering under spamd
- my $tmpf = $options->{report}->create_fulltext_tmpfile($options->{text});
- if ($self->pyzor_report($options, $tmpf)) {
- $options->{report}->{report_available} = 1;
- info("reporter: spam reported to Pyzor");
- $options->{report}->{report_return} = 1;
- }
- else {
- info("reporter: could not report spam to Pyzor");
- }
- $options->{report}->delete_fulltext_tmpfile();
+ return if !$self->{pyzor_available};
+ return if !$self->{main}->{conf}->{use_pyzor};
+ return if !$options->{report}->{options}->{dont_report_to_pyzor};
+ return if !$self->is_pyzor_available();
+
+ # use temporary file: open2() is unreliable due to buffering under spamd
+ my $tmpf = $options->{report}->create_fulltext_tmpfile($options->{text});
+ if ($self->pyzor_report($options, $tmpf)) {
+ $options->{report}->{report_available} = 1;
+ info("reporter: spam reported to Pyzor");
+ $options->{report}->{report_return} = 1;
}
+ else {
+ info("reporter: could not report spam to Pyzor");
+ }
+ $options->{report}->delete_fulltext_tmpfile();
+
+ return 1;
}
sub pyzor_report {