You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2018/10/16 15:32:41 UTC

svn commit: r1844023 - in /spamassassin/trunk: MANIFEST UPGRADE lib/Mail/SpamAssassin/Plugin/Pyzor.pm

Author: hege
Date: Tue Oct 16 15:32:41 2018
New Revision: 1844023

URL: http://svn.apache.org/viewvc?rev=1844023&view=rev
Log:
Added pyzor_fork, renamed pyzor_max to pyzor_count_min, added pyzor_whitelist_min/pyzor_whitelist_factor.

Modified:
    spamassassin/trunk/MANIFEST
    spamassassin/trunk/UPGRADE
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Pyzor.pm

Modified: spamassassin/trunk/MANIFEST
URL: http://svn.apache.org/viewvc/spamassassin/trunk/MANIFEST?rev=1844023&r1=1844022&r2=1844023&view=diff
==============================================================================
--- spamassassin/trunk/MANIFEST (original)
+++ spamassassin/trunk/MANIFEST Tue Oct 16 15:32:41 2018
@@ -386,6 +386,7 @@ t/data/spam/bsmtpnull
 t/data/spam/dnsbl.eml
 t/data/spam/gtube.eml
 t/data/spam/gtubedcc.eml
+t/data/spam/pyzor
 t/data/spam/razor2
 t/data/spam/relayUS.eml
 t/data/spam/urilocalbl_net.eml
@@ -466,6 +467,7 @@ t/plugin_file.t
 t/plugin_priorities.t
 t/prefs_include.t
 t/priorities.t
+t/pyzor.t
 t/razor2.t
 t/rcvd_parser.t
 t/re_base_extraction.t

Modified: spamassassin/trunk/UPGRADE
URL: http://svn.apache.org/viewvc/spamassassin/trunk/UPGRADE?rev=1844023&r1=1844022&r2=1844023&view=diff
==============================================================================
--- spamassassin/trunk/UPGRADE (original)
+++ spamassassin/trunk/UPGRADE Tue Oct 16 15:32:41 2018
@@ -37,6 +37,10 @@ Note for Users Upgrading to SpamAssassin
   Commercial reputation rules can be ignored with use_dcc_rep 0, to save few
   CPU cycles.
 
+- Pyzor pyzor_fork option added. It will fork separate Pyzor process and
+  read in the results later asynchronously, increasing throughput.
+  Renamed pyzor_max setting to pyzor_count_min. Added pyzor_whitelist_min
+  and pyzor_whitelist_factor setting. Also try to ignore "empty body" FPs.
 
 Note for Users Upgrading to SpamAssassin 3.4.2
 ----------------------------------------------

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Pyzor.pm?rev=1844023&r1=1844022&r2=1844023&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Pyzor.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Pyzor.pm Tue Oct 16 15:32:41 2018
@@ -44,6 +44,9 @@ use warnings;
 # use bytes;
 use re 'taint';
 
+use Storable;
+use POSIX qw(PIPE_BUF WNOHANG _exit);
+
 our @ISA = qw(Mail::SpamAssassin::Plugin);
 
 sub new {
@@ -91,7 +94,21 @@ Whether to use Pyzor, if it is available
     type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL
   });
 
-=item pyzor_max NUMBER		(default: 5)
+=item pyzor_fork (0|1)		(default: 0)
+
+Instead of running Pyzor synchronously, fork separate process for it and
+read the results in later (similar to async DNS lookups).  Increases
+throughput.  Experimental.
+
+=cut
+
+  push(@cmds, {
+    setting => 'pyzor_fork',
+    default => 0,
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
+  });
+
+=item pyzor_count_min NUMBER	(default: 5)
 
 This option sets how often a message's body checksum must have been
 reported to the Pyzor server before SpamAssassin will consider the Pyzor
@@ -103,11 +120,52 @@ set this to a relatively low value, e.g.
 =cut
 
   push (@cmds, {
-    setting => 'pyzor_max',
+    setting => 'pyzor_count_min',
     default => 5,
     type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
   });
 
+  # Deprecated setting, the name makes no sense!
+  push (@cmds, {
+    setting => 'pyzor_max',
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
+    code => sub {
+      my ($self, $key, $value, $line) = @_;
+      warn("deprecated setting used, change pyzor_max to pyzor_count_min\n");
+      if ($value !~ /^\d+$/) {
+        return $Mail::SpamAssassin::Conf::INVALID_VALUE;
+      }
+      $self->{pyzor_count_min} = $value;
+    }
+  });
+
+=item pyzor_whitelist_min NUMBER	(default: 10)
+
+This option sets how often a message's body checksum must have been
+whitelisted to the Pyzor server for SpamAssassin to consider ignoring the
+result.  Final decision is made by pyzor_whitelist_factor.
+
+=cut
+
+  push (@cmds, {
+    setting => 'pyzor_whitelist_min',
+    default => 10,
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
+  });
+
+=item pyzor_whitelist_factor NUMBER	(default: 0.2)
+
+Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min.
+For default setting this means: 50 reports requires 10 whitelistings.
+
+=cut
+
+  push (@cmds, {
+    setting => 'pyzor_whitelist_factor',
+    default => 0.2,
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
+  });
+
 =back
 
 =head1 ADMINISTRATOR OPTIONS
@@ -202,164 +260,295 @@ you should use this, as the current PATH
 sub is_pyzor_available {
   my ($self) = @_;
 
-  my $pyzor = $self->{main}->{conf}->{pyzor_path} || '';
-  unless ($pyzor) {
-    $pyzor = Mail::SpamAssassin::Util::find_executable_in_env_path('pyzor');
-  }
+  my $pyzor = $self->{main}->{conf}->{pyzor_path} ||
+    Mail::SpamAssassin::Util::find_executable_in_env_path('pyzor');
+
   unless ($pyzor && -x $pyzor) {
-    dbg("pyzor: pyzor is not available: no pyzor executable found");
+    dbg("pyzor: no pyzor executable found");
+    $self->{pyzor_available} = 0;
     return 0;
   }
 
   # remember any found pyzor
   $self->{main}->{conf}->{pyzor_path} = $pyzor;
 
-  dbg("pyzor: pyzor is available: " . $self->{main}->{conf}->{pyzor_path});
+  dbg("pyzor: pyzor is available: $pyzor");
   return 1;
 }
 
-sub get_pyzor_interface {
-  my ($self) = @_;
+sub finish_parsing_start {
+  my ($self, $opts) = @_;
 
-  if (!$self->{main}->{conf}->{use_pyzor}) {
-    dbg("pyzor: use_pyzor option not enabled, disabling Pyzor");
-    $self->{pyzor_interface} = "disabled";
-    $self->{pyzor_available} = 0;
-  }
-  elsif ($self->is_pyzor_available()) {
-    $self->{pyzor_interface} = "pyzor";
-    $self->{pyzor_available} = 1;
-  }
-  else {
-    dbg("pyzor: no pyzor found, disabling Pyzor");
-    $self->{pyzor_available} = 0;
+  # If forking, hard adjust priority -100 to launch early
+  # Find rulenames from eval_to_rule mappings
+  if ($opts->{conf}->{pyzor_fork}) {
+    foreach (@{$opts->{conf}->{eval_to_rule}->{check_pyzor}}) {
+      if (exists $opts->{conf}->{tests}->{$_}) {
+        dbg("pyzor: adjusting rule $_ priority to -100");
+        $opts->{conf}->{priority}->{$_} = -100;
+      }
+    }
   }
 }
 
 sub check_pyzor {
-  my ($self, $permsgstatus, $full) = @_;
+  my ($self, $pms, $full) = @_;
 
-  # initialize valid tags
-  $permsgstatus->{tag_data}->{PYZOR} = "";
+  return 0 if !$self->{pyzor_available};
+  return 0 if !$self->{main}->{conf}->{use_pyzor};
+
+  return 0 if $pms->{pyzor_running};
+  $pms->{pyzor_running} = 1;
+
+  return 0 if !$self->is_pyzor_available();
 
   my $timer = $self->{main}->time_method("check_pyzor");
 
-  $self->get_pyzor_interface();
-  return 0 unless $self->{pyzor_available};
+  # initialize valid tags
+  $pms->{tag_data}->{PYZOR} = '';
+
+  ## non-forking method
+
+  if (!$self->{main}->{conf}->{pyzor_fork}) {
+    my @results = $self->pyzor_lookup($pms, $full);
+    return $self->_check_result($pms, \@results);
+  }
+
+  ## forking method
 
-  return $self->pyzor_lookup($permsgstatus, $full);
+  $pms->{pyzor_rulename} = $pms->get_current_eval_rule_name();
+
+  # create socketpair for communication
+  $pms->{pyzor_backchannel} = Mail::SpamAssassin::SubProcBackChannel->new();
+  eval {
+    $pms->{pyzor_backchannel}->setup_backchannel_parent_pre_fork();
+  } or do {
+    dbg("pyzor: backchannel pre-setup failed: $@");
+    delete $pms->{pyzor_backchannel};
+    return 0;
+  };
+
+  my $pid = fork();
+  if (!$pid) {
+    # Must be set so DESTROY blocks etc can ignore us when exiting
+    $ENV{'IS_FORKED_HELPER_PROCESS'} = 1;
+    dbg("pyzor: child process $$ forked");
+    $pms->{pyzor_backchannel}->setup_backchannel_child_post_fork();
+    my @results = $self->pyzor_lookup($pms, $full);
+    my $backmsg;
+    eval {
+      $backmsg = Storable::freeze(\@results);
+    };
+    if ($@) {
+      dbg("pyzor: child return value freeze failed: $@");
+      _exit(0); # prevent touching filehandles as per perlfork(1)
+    }
+    if (!syswrite($pms->{pyzor_backchannel}->{parent}, $backmsg)) {
+      dbg("pyzor: child backchannel write failed: $!");
+    }
+    _exit(0); # prevent touching filehandles as per perlfork(1)
+  }
+
+  eval {
+    $pms->{pyzor_backchannel}->setup_backchannel_parent_post_fork($pid);
+  } or do {
+    dbg("pyzor: backchannel post-setup failed: $@");
+    delete $pms->{pyzor_backchannel};
+    return 0;
+  };
+
+  return 0;
 }
 
 sub pyzor_lookup {
-  my ($self, $permsgstatus, $fulltext) = @_;
-  my @response;
-  my $pyzor_count;
-  my $pyzor_whitelisted;
-  my $timeout = $self->{main}->{conf}->{pyzor_timeout};
+  my ($self, $pms, $fulltext) = @_;
 
-  $pyzor_count = 0;
-  $pyzor_whitelisted = 0;
-  my $pid;
-
-  # use a temp file here -- open2() is unreliable, buffering-wise, under spamd
-  my $tmpf = $permsgstatus->create_fulltext_tmpfile($fulltext);
+  my $conf = $self->{main}->{conf};
+  my $timeout = $conf->{pyzor_timeout};
 
   # note: not really tainted, this came from system configuration file
-  my $path = untaint_file_path($self->{main}->{conf}->{pyzor_path});
-  my $opts = untaint_var($self->{main}->{conf}->{pyzor_options}) || '';
+  my $path = untaint_file_path($conf->{pyzor_path});
+  my $opts = untaint_var($conf->{pyzor_options}) || '';
+
+  # use a temp file here -- open2() is unreliable, buffering-wise, under spamd
+  my $tmpf = $pms->create_fulltext_tmpfile($fulltext);
 
-  $permsgstatus->enter_helper_run_mode();
+  $pms->enter_helper_run_mode();
 
+  my $pid;
+  my @resp;
   my $timer = Mail::SpamAssassin::Timeout->new(
-           { secs => $timeout, deadline => $permsgstatus->{master_deadline} });
+           { secs => $timeout, deadline => $pms->{master_deadline} });
   my $err = $timer->run_and_catch(sub {
-
     local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
- 
-    dbg("pyzor: opening pipe: " . join(' ', $path, $opts, "check", "< $tmpf"));
+
+    dbg("pyzor: opening pipe: " . join(' ', $path, $opts, "check", "<$tmpf"));
 
     $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
 	$tmpf, 1, $path, split(' ', $opts), "check");
     $pid or die "$!\n";
 
     # read+split avoids a Perl I/O bug (Bug 5985)
-    my($inbuf,$nread,$resp); $resp = '';
-    while ( $nread=read(PYZOR,$inbuf,8192) ) { $resp .= $inbuf }
+    my($inbuf, $nread);
+    my $resp = '';
+    while ($nread = read(PYZOR, $inbuf, 8192)) { $resp .= $inbuf }
     defined $nread  or die "error reading from pipe: $!";
-    @response = split(/^/m, $resp, -1);  undef $resp;
+    @resp = split(/^/m, $resp, -1);
 
-    my $errno = 0;  close PYZOR or $errno = $!;
-    if (proc_status_ok($?,$errno)) {
+    my $errno = 0;
+    close PYZOR or $errno = $!;
+    if (proc_status_ok($?, $errno)) {
       dbg("pyzor: [%s] finished successfully", $pid);
-    } elsif (proc_status_ok($?,$errno, 0,1)) {  # sometimes it exits with 1
-      dbg("pyzor: [%s] finished: %s", $pid, exit_status_str($?,$errno));
+    } elsif (proc_status_ok($?, $errno, 0, 1)) {  # sometimes it exits with 1
+      dbg("pyzor: [%s] finished: %s", $pid, exit_status_str($?, $errno));
     } else {
-      info("pyzor: [%s] error: %s", $pid, exit_status_str($?,$errno));
-    }
-
-    if (!@response) {
-      # this exact string is needed below
-      warn("no response\n");	# yes, this is possible
-    }
-    chomp for @response;
-    dbg("pyzor: got response: " . join("\\n", @response));
-
-    if ($response[0] =~ /^Traceback/) {
-      warn("internal error, python traceback seen in response\n");
+      info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
     }
 
   });
 
   if (defined(fileno(*PYZOR))) {  # still open
     if ($pid) {
-      if (kill('TERM',$pid)) { dbg("pyzor: killed stale helper [$pid]") }
-      else { dbg("pyzor: killing helper application [$pid] failed: $!") }
+      if (kill('TERM', $pid)) {
+        dbg("pyzor: killed stale helper [$pid]");
+      } else {
+        dbg("pyzor: killing helper application [$pid] failed: $!");
+      }
     }
-    my $errno = 0;  close PYZOR or $errno = $!;
-    proc_status_ok($?,$errno)
-      or info("pyzor: [%s] error: %s", $pid, exit_status_str($?,$errno));
+    my $errno = 0;
+    close PYZOR or $errno = $!;
+    proc_status_ok($?, $errno)
+      or info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
   }
 
-  $permsgstatus->delete_fulltext_tmpfile();
+  $pms->delete_fulltext_tmpfile();
 
-  $permsgstatus->leave_helper_run_mode();
+  $pms->leave_helper_run_mode();
 
   if ($timer->timed_out()) {
     dbg("pyzor: check timed out after $timeout seconds");
+    return ();
+  } elsif ($err) {
+    chomp $err;
+    info("pyzor: check failed: $err");
+    return ();
+  }
+
+  return @resp;
+}
+
+sub check_tick {
+  my ($self, $opts) = @_;
+  $self->_check_forked_result($opts->{permsgstatus}, 0);
+}
+
+sub check_cleanup {
+  my ($self, $opts) = @_;
+  $self->_check_forked_result($opts->{permsgstatus}, 1);
+}
+
+sub _check_forked_result {
+  my ($self, $pms, $finish) = @_;
+
+  return 0 if !$pms->{pyzor_backchannel};
+
+  my $timer = $self->{main}->time_method("check_pyzor");
+
+  my $kid_pid = (keys %{$pms->{pyzor_backchannel}->{kids}})[0];
+  # if $finish, force waiting for the child
+  my $pid = waitpid($kid_pid, $finish ? 0 : WNOHANG);
+  if ($pid == 0) {
+    #dbg("pyzor: child process $kid_pid not finished yet, trying later");
+    return 0;
+  } elsif ($pid == -1) {
+    # child does not exist?
+    dbg("pyzor: child process $kid_pid already handled?");
+    delete $pms->{pyzor_backchannel};
     return 0;
   }
 
-  if ($err) {
-    chomp $err;
-    if ($err eq "__brokenpipe__ignore__") {
-      dbg("pyzor: check failed: broken pipe");
-    } elsif ($err eq "no response") {
-      dbg("pyzor: check failed: no response");
-    } else {
-      warn("pyzor: check failed: $err\n");
-    }
+  dbg("pyzor: child process $kid_pid finished, reading results");
+
+  my $backmsg;
+  my $ret = sysread($pms->{pyzor_backchannel}->{latest_kid_fh}, $backmsg, PIPE_BUF);
+  if (!defined $ret || $ret == 0) {
+    dbg("pyzor: could not read result from child: ".($ret == 0 ? 0 : $!));
+    delete $pms->{pyzor_backchannel};
     return 0;
   }
 
-  foreach my $one_response (@response) {
+  delete $pms->{pyzor_backchannel};
+
+  my $results;
+  eval {
+    $results = Storable::thaw($backmsg);
+  };
+  if ($@) {
+    dbg("pyzor: child return value thaw failed: $@");
+    return;
+  }
+
+  $self->_check_result($pms, $results);
+}
+
+sub _check_result {
+  my ($self, $pms, $results) = @_;
+
+  if (!@$results) {
+    dbg("pyzor: no response from server");
+    return 0;
+  }
+
+  my $count = 0;
+  my $count_wl = 0;
+  foreach my $res (@$results) {
+    chomp($res);
+    dbg("pyzor: got response: $res");
+    if ($res =~ /^Traceback/) {
+      info("pyzor: internal error, python traceback seen in response");
+      return 0;
+    }
     # this regexp is intended to be a little bit forgiving
-    if ($one_response =~ /^\S+\t.*?\t(\d+)\t(\d+)\s*$/) {
+    if ($res =~ /^\S+\t.*?\t(\d+)\t(\d+)\s*$/) {
       # until pyzor servers can sync their DBs,
       # sum counts obtained from all servers
-      $pyzor_whitelisted += $2+0;
-      $pyzor_count += $1+0;
-    }
-    else {
+      $count += untaint_var($1)+0; # crazy but needs untainting
+      $count_wl += untaint_var($2)+0;
+    } else {
       # warn on failures to parse
-      dbg("pyzor: failure to parse response \"$one_response\"");
+      info("pyzor: failure to parse response \"$res\"");
     }
   }
 
-  $permsgstatus->set_tag('PYZOR', $pyzor_whitelisted ? "Whitelisted."
-                                             : "Reported $pyzor_count times.");
+  my $conf = $self->{main}->{conf};
+
+  my $count_min = $conf->{pyzor_count_min};
+  my $wl_min = $conf->{pyzor_whitelist_min};
+
+  my $wl_limit = $count_wl >= $wl_min ?
+    $count * $conf->{pyzor_whitelist_factor} : 0;
+
+  dbg("pyzor: result: COUNT=$count/$count_min WHITELIST=$count_wl/$wl_min/%.1f",
+    $wl_limit);
+  $pms->set_tag('PYZOR', "Reported $count times, whitelisted $count_wl times.");
 
-  if ($pyzor_count >= $self->{main}->{conf}->{pyzor_max}) {
-    dbg("pyzor: listed: COUNT=$pyzor_count/$self->{main}->{conf}->{pyzor_max} WHITELIST=$pyzor_whitelisted");
+  # Empty body etc results in same hash, we should skip very large numbers..
+  if ($count >= 1000000 || $count_wl >= 10000) {
+    dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl 1000000/10000");
+    return 0;
+  }
+
+  # Whitelisted?
+  if ($wl_limit && $count_wl >= $wl_limit) {
+    dbg("pyzor: message whitelisted");
+    return 0;
+  }
+
+  if ($count >= $count_min) {
+    if ($conf->{pyzor_fork}) {
+      # forked needs to run got_hit()
+      $pms->got_hit($pms->{pyzor_rulename}, "", ruletype => 'eval');
+    }
     return 1;
   }
 
@@ -369,23 +558,24 @@ sub pyzor_lookup {
 sub plugin_report {
   my ($self, $options) = @_;
 
-  return unless $self->{pyzor_available};
-  return unless $self->{main}->{conf}->{use_pyzor};
-
-  if (!$options->{report}->{options}->{dont_report_to_pyzor} && $self->is_pyzor_available())
-  {
-    # use temporary file: open2() is unreliable due to buffering under spamd
-    my $tmpf = $options->{report}->create_fulltext_tmpfile($options->{text});
-    if ($self->pyzor_report($options, $tmpf)) {
-      $options->{report}->{report_available} = 1;
-      info("reporter: spam reported to Pyzor");
-      $options->{report}->{report_return} = 1;
-    }
-    else {
-      info("reporter: could not report spam to Pyzor");
-    }
-    $options->{report}->delete_fulltext_tmpfile();
+  return if !$self->{pyzor_available};
+  return if !$self->{main}->{conf}->{use_pyzor};
+  return if !$options->{report}->{options}->{dont_report_to_pyzor};
+  return if !$self->is_pyzor_available();
+
+  # use temporary file: open2() is unreliable due to buffering under spamd
+  my $tmpf = $options->{report}->create_fulltext_tmpfile($options->{text});
+  if ($self->pyzor_report($options, $tmpf)) {
+    $options->{report}->{report_available} = 1;
+    info("reporter: spam reported to Pyzor");
+    $options->{report}->{report_return} = 1;
   }
+  else {
+    info("reporter: could not report spam to Pyzor");
+  }
+  $options->{report}->delete_fulltext_tmpfile();
+
+  return 1;
 }
 
 sub pyzor_report {